<a href="https://colab.research.google.com/github/Molter73/proyecto-computacion-1/blob/mauro%2Fnotebooks/training/Evaluaci%C3%B3n_de_modelos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install a pinned gdown release; it is used just below to download from Google Drive.
!pip install gdown==v4.6.3

# Download the shared Drive folder only when /content/SemEval2024-Task8 is missing.
# Presumably that is the directory gdown creates, so re-running the cell skips the download — TODO confirm.
![ ! -d /content/SemEval2024-Task8 ] && gdown --folder https://drive.google.com/drive/folders/14DulzxuH5TDhXtviRVXsH5e2JTY2POLi

In [None]:
# NOTE(review): the version is not pinned, and no scikit-optimize import is visible
# in this part of the notebook — confirm the package is still needed.
!pip install scikit-optimize

# Data preparation and classifier evaluation

In [None]:
# import required libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# read dataset function
def read_dataset(inFile):
    """Load a JSON Lines file into a pandas DataFrame.

    Prints the path being read (preceded by a blank line) before parsing.

    Args:
        inFile: path of a file holding one JSON object per line.

    Returns:
        The DataFrame produced by ``pd.read_json(inFile, lines=True)``.
    """
    print("\nReading:", inFile)
    return pd.read_json(inFile, lines=True)

In [None]:
# data paths and config
# Training and evaluation splits of SemEval-2024 Task 8, Subtask A (monolingual).
# Presumably these files come from the Drive folder downloaded earlier — TODO confirm.
inTrain = '/content/SemEval2024-Task8/SubtaskA/subtaskA_train_monolingual.jsonl'
inTest = '/content/SemEval2024-Task8/SubtaskA/subtaskA_dev_monolingual.jsonl'
# Additional evaluation set. NOTE(review): nothing visible in this notebook creates
# /content/mount — confirm how this file is provided.
inDatasetTest = '/content/mount/dataset.jsonl'

max_instances_per_class = 20000 # rows sampled per label by balance_text_lengths
max_features = 5000 # maximum number of features extracted for our instances
random_seed = 777 # set random seed for reproducibility
id2label = {0: "human", 1: "machine"} # meaning of the numeric labels

In [None]:
# read dataset
# Load the three JSON Lines files into DataFrames; read_dataset prints each path as it is read.
train_df = read_dataset(inTrain)
test_df = read_dataset(inTest)
dataset_df = read_dataset(inDatasetTest)

In [None]:
# Count the instances of each class and of the whole training set
# (label 0 = human, label 1 = machine, as declared in id2label).
instancias_humanas = len(train_df[train_df['label'] == 0])
instancias_ia =  len(train_df[train_df['label'] == 1])
instancias_dataset = len(train_df)

In [None]:
# Sum the character lengths of the texts in each class and divide by the
# class size to obtain the mean text length per class.
suma_longitudes_humanos = train_df[train_df['label'] == 0]['text'].apply(len).sum()
longitud_media_humanos = suma_longitudes_humanos / instancias_humanas

suma_longitudes_generados = train_df[train_df['label'] == 1]['text'].apply(len).sum()
longitud_media_generado = suma_longitudes_generados / instancias_ia

In [None]:
# Print the statistics table for the training set: instance counts and
# mean text length (in characters) per class.
print('Número de instancias en el dataset:\t\t\t\t', instancias_dataset)
print('Número de instancias humanas:\t\t\t\t\t', instancias_humanas)
print('Número de instancias generadas:\t\t\t\t\t', instancias_ia)
print('Longitud media en caracteres de las instancias humanas:\t\t', longitud_media_humanos)
print('Longitud media en caracteres de las instancias generadas:\t', longitud_media_generado)

In [None]:
def get_text_lengths(df):
  """Return the mean text length, in characters, of each class in ``df``.

  Args:
    df: DataFrame with a ``text`` column (values supporting ``len``) and a
      ``label`` column in which 0 marks human rows and 1 marks machine rows.

  Returns:
    A tuple ``(mean_human_length, mean_machine_length)``.
  """
  def _mean_chars(label_value):
    # Total characters of the class divided by the number of rows in the class.
    class_rows = df[df['label'] == label_value]
    return class_rows['text'].apply(len).sum() / len(class_rows)

  return (_mean_chars(0), _mean_chars(1))

In [None]:
from itertools import count

def balance_text_lengths(df):
  """Downsample ``df`` per class, trimming long human texts until mean lengths are close.

  A sample of ``max_instances_per_class`` rows per label is drawn (seeded with
  ``random_seed``). While the mean human text length of the sample exceeds the
  mean machine text length by more than 5%, the longest human texts are removed
  from ``df`` and the sample is drawn again. The input frame is not modified.

  Trimming stops early, returning the most recent valid sample, when a pass
  removes no rows or when it would leave fewer than ``max_instances_per_class``
  human rows (sampling would then raise ``ValueError``).

  Args:
    df: DataFrame with ``text`` and ``label`` columns (0 = human, 1 = machine).

  Returns:
    A DataFrame holding ``max_instances_per_class`` rows of each label.

  Raises:
    ValueError: from pandas, if the initial frame already has fewer than
      ``max_instances_per_class`` rows for some label.
  """
  def _sample(frame):
    # The same seed is used on every draw so the result is reproducible.
    return frame.groupby("label").sample(n=max_instances_per_class, random_state=random_seed)

  # downsample training data to train faster
  balanced_df = _sample(df)
  human_length, machine_length = get_text_lengths(balanced_df)

  while human_length > machine_length * 1.05:
    is_human = df['label'] == 0
    text_lengths = df['text'].map(len)
    # Cut halfway between the longest human text and the machine mean.
    cutoff = (text_lengths[is_human].max() + machine_length) / 2
    trimmed_df = df[~(is_human & (text_lengths > cutoff))]

    # Compare against the previous pass, not the original frame size.
    if len(trimmed_df) == len(df):
      print("No more rows to remove")
      return balanced_df

    # Sampling n rows from a smaller group would raise; keep the last good sample.
    if (trimmed_df['label'] == 0).sum() < max_instances_per_class:
      print("Not enough human rows left to keep trimming")
      return balanced_df

    df = trimmed_df
    balanced_df = _sample(df)
    human_length, machine_length = get_text_lengths(balanced_df)

  return balanced_df

In [None]:
# Replace the training frame with its balanced sample. From here on train_df
# no longer holds the full training set that was read from disk.
train_df = balance_text_lengths(train_df)

In [None]:
# Count the instances of each class and of the whole training set again,
# now on the balanced train_df (label 0 = human, label 1 = machine).
instancias_humanas = len(train_df[train_df['label'] == 0])
instancias_ia =  len(train_df[train_df['label'] == 1])
instancias_dataset = len(train_df)

In [None]:
# Sum the character lengths of the texts in each class of the balanced
# train_df and divide by the class size to obtain the mean length per class.
suma_longitudes_humanos = train_df[train_df['label'] == 0]['text'].apply(len).sum()
longitud_media_humanos = suma_longitudes_humanos / instancias_humanas

suma_longitudes_generados = train_df[train_df['label'] == 1]['text'].apply(len).sum()
longitud_media_generado = suma_longitudes_generados / instancias_ia

In [None]:
# Print the statistics table for the balanced training set: instance counts
# and mean text length (in characters) per class.
print('Número de instancias en el dataset:\t\t\t\t', instancias_dataset)
print('Número de instancias humanas:\t\t\t\t\t', instancias_humanas)
print('Número de instancias generadas:\t\t\t\t\t', instancias_ia)
print('Longitud media en caracteres de las instancias humanas:\t\t', longitud_media_humanos)
print('Longitud media en caracteres de las instancias generadas:\t', longitud_media_generado)

In [None]:
# vectorize data: extract features from our data (from text to numeric vectors)
# The TF-IDF vectorizer (unigrams only, English stop words, max_features cap) is
# fitted on the training texts; the test and dataset texts are then transformed
# with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=max_features, stop_words="english", ngram_range=(1,1))
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])
X_dataset = vectorizer.transform(dataset_df["text"])

# print({k: v for k, v in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1], reverse=True)})

In [None]:
# vectorize labels : from text to numeric vectors
# The encoder is fitted on the training labels and reused for the other two sets.
# NOTE(review): labels are compared against 0/1 elsewhere in this notebook, so they
# look numeric already — confirm whether this encoding step is needed.
le = LabelEncoder()
Y_train = le.fit_transform(train_df["label"])
Y_test = le.transform(test_df["label"])
Y_dataset = le.transform(dataset_df["label"])

In [None]:
# Print the statistics table: total size of the training, test and dataset
# frames, followed by the human (label 0) and generated (label 1) counts of each.
print('Número de instancias en el training:\t\t',len(train_df))
print('Número de instancias en el test:\t\t',len(test_df))
print('Número de instancias en el dataset:\t\t',len(dataset_df))
print('Número de instancias humanas en el training:\t',len(train_df[train_df['label'] == 0]))
print('Número de instancias generadas en el training:\t',len(train_df[train_df['label'] == 1]))
print('Número de instancias humanas en el test:\t',len(test_df[test_df['label'] == 0]))
print('Número de instancias generadas en el test:\t',len(test_df[test_df['label'] == 1]))
print('Número de instancias humanas en el dataset:\t',len(dataset_df[dataset_df['label'] == 0]))
print('Número de instancias generadas en el dataset:\t',len(dataset_df[dataset_df['label'] == 1]))

In [None]:
from sklearn.utils import all_estimators
from sklearn.base import ClassifierMixin
from sklearn.metrics import f1_score

from heapq import heappush, nlargest
import multiprocessing

from joblib import parallel_backend

# Sweep every classifier returned by scikit-learn's all_estimators, built with
# its default constructor, and score it with macro F1 on both evaluation sets.

# Best model seen so far on the test split and on the extra dataset.
best_score = float('-inf')
best_model = None
best_dataset_score = float('-inf')
best_dataset_model = None

# Entries are (score, priority, fitted model). `priority` is a tie-breaker that
# is unique per attempted model, so tuple comparison never has to compare two
# estimator objects.
test_heap = []
dataset_heap = []
priority = 0

with parallel_backend('threading', n_jobs=multiprocessing.cpu_count()):
  for name, ClassifierClass in all_estimators(type_filter='classifier'):
    if not (issubclass(ClassifierClass, ClassifierMixin) and hasattr(ClassifierClass, 'fit')):
      continue

    try:
      # The variable is called `regressor` although it holds a classifier; the
      # name is kept in case other cells reference it.
      regressor = ClassifierClass()
      regressor.fit(X_train, Y_train)

      # Evaluation on the test split.
      y_pred = regressor.predict(X_test)
      score = f1_score(Y_test, y_pred, average="macro")
      if score > best_score:
        best_score = score
        best_model = regressor
      print(f"Model: {name} Macro F1: {score}")
      print(classification_report(Y_test, y_pred))
      heappush(test_heap, (score, priority, regressor))

      # Evaluation on the extra dataset.
      y_pred = regressor.predict(X_dataset)
      score = f1_score(Y_dataset, y_pred, average="macro")
      if score > best_dataset_score:
        best_dataset_score = score
        best_dataset_model = regressor
      print(f"Model: {name} Dataset Macro F1: {score}")
      print(classification_report(Y_dataset, y_pred))
      heappush(dataset_heap, (score, priority, regressor))

    except Exception as e:
      # Keep the sweep going, but report which model failed and why instead of
      # dropping it silently.
      print(f"Model: {name} skipped ({type(e).__name__}: {e})")

    finally:
      # Advance even on failure: a model may already have been pushed to
      # test_heap before failing, and its priority must not be reused.
      priority += 1

In [None]:
# List the five highest-scoring entries of test_heap; each entry is a
# (macro F1, priority, fitted model) tuple pushed by the sweep cell.
print(f"Best models for test:")
for score, _, model in nlargest(5, test_heap):
  print(f"{model.__class__.__name__}, score: {score}")

In [None]:
# List the five highest-scoring entries of dataset_heap; each entry is a
# (macro F1, priority, fitted model) tuple pushed by the sweep cell.
print(f"Best models for dataset:")
for score, _, model in nlargest(5, dataset_heap):
  print(f"{model.__class__.__name__}, score: {score}")