In [None]:
# Install a pinned gdown release (command-line downloader used below).
!pip install gdown==v4.6.3

# Run gdown on the shared Drive folder only when /content/SemEval2024-Task8 is not
# already a directory, so re-running the cell does not download again.
# NOTE(review): presumably the folder is downloaded to that path — confirm.
![ ! -d /content/SemEval2024-Task8 ] && gdown --folder https://drive.google.com/drive/folders/14DulzxuH5TDhXtviRVXsH5e2JTY2POLi

In [None]:
# NOTE(review): installed without a version pin, and no `skopt` import is visible
# in this notebook — confirm the package is still needed.
!pip install scikit-optimize

In [None]:
import multiprocessing
import os

import pandas as pd

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV

from joblib import parallel_backend

import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

In [None]:
# read dataset function
def read_dataset(inFile):
    """Load a JSON Lines file into a pandas DataFrame.

    Prints the path being read, then parses ``inFile`` as one JSON record
    per line and returns the resulting DataFrame.
    """
    print("\nReading:", inFile)
    return pd.read_json(inFile, lines=True)

In [None]:
def vectorize(vectorizer, x_train, x_test, x_dataset):
  """Fit ``vectorizer`` on the training data and transform all three splits.

  ``fit_transform`` is called on ``x_train`` first; ``x_test`` and
  ``x_dataset`` are then passed through ``transform`` in that order.

  Returns:
    A 3-tuple ``(train, test, dataset)`` with the transformed inputs.
  """
  train_matrix = vectorizer.fit_transform(x_train)
  test_matrix = vectorizer.transform(x_test)
  dataset_matrix = vectorizer.transform(x_dataset)
  return train_matrix, test_matrix, dataset_matrix

In [None]:
def evaluate(model, X, y, x_test, y_test, x_dataset, y_dataset, name=None):
  """Fit ``model`` and report its scores on the test and dataset splits.

  Args:
    model: estimator exposing ``fit`` and ``predict``.
    X, y: training features and labels.
    x_test, y_test: first evaluation split.
    x_dataset, y_dataset: second evaluation split.
    name: label used in the printed lines; defaults to the model's class name.

  Returns:
    A dict with the keys ``'test'`` and ``'dataset'`` (classification reports
    as dicts) and ``'best_params'`` (``model.best_params_`` when the model has
    that attribute, otherwise ``None``).
  """
  display_name = model.__class__.__name__ if name is None else name

  model.fit(X, y)

  # First split: macro F1 plus the text report, and the dict report for the caller.
  test_pred = model.predict(x_test)
  test_f1 = f1_score(y_test, test_pred, average="macro")
  print(f"Macro {display_name} F1: {test_f1}")
  print(classification_report(y_test, test_pred))
  test_report = classification_report(y_test, test_pred, output_dict=True)

  # Second split: same printed summary.
  dataset_pred = model.predict(x_dataset)
  dataset_f1 = f1_score(y_dataset, dataset_pred, average="macro")
  print(f"Macro {display_name} dataset F1: {dataset_f1}")
  print(classification_report(y_dataset, dataset_pred))

  # Only search wrappers such as GridSearchCV carry best_params_.
  has_best_params = hasattr(model, 'best_params_')
  if has_best_params:
    print(f'Mejores parámetros: {model.best_params_}')

  dataset_report = classification_report(y_dataset, dataset_pred, output_dict=True)
  best_params = model.best_params_ if has_best_params else None
  return {
      'test': test_report,
      'dataset': dataset_report,
      'best_params': best_params,
  }

In [None]:
def dataset_stats(df, id2label):
  """Print instance counts and mean text length per class.

  Args:
    df: DataFrame with a ``'model'`` column (class name) and a ``'text'``
      column (strings). No other column is read.
    id2label: sequence of class names to report, in output order.

  Prints the total number of rows, then one count line per class, then one
  mean-length-in-characters line per class. A class with no rows reports a
  mean length of 0. Returns ``None``.
  """
  total_instances = len(df)

  # Filter once per class and keep both the row count and the mean length.
  counts = []
  mean_lengths = []
  for label in id2label:
    subset = df[df['model'] == label]
    n_rows = len(subset)
    counts.append(n_rows)
    if n_rows != 0:
      mean_lengths.append(subset['text'].apply(len).sum() / n_rows)
    else:
      mean_lengths.append(0)

  # Print the statistics table.
  print('Número de instancias en el dataset:\t\t\t\t', total_instances)

  for label, n_rows in zip(id2label, counts):
    print(f'Número de instancias {label}:\t\t\t\t\t', n_rows)

  for label, mean_length in zip(id2label, mean_lengths):
    print(f'Longitud media en caracteres de las instancias {label}:\t\t', mean_length)

In [None]:
# Input files
inTrain = '/content/SemEval2024-Task8/SubtaskB/subtaskB_train.jsonl'
inTest = '/content/SemEval2024-Task8/SubtaskB/subtaskB_dev.jsonl'
inDatasetTest = '/content/mount/dataset.jsonl'

# Experiment settings
max_instances_per_class = 2_000  # rows sampled per class for the first experiments
max_features = 20_000  # upper bound on the number of TF-IDF features
random_seed = 777  # seed passed to the sampling steps for reproducibility

# Filled in by the experiment cells: model name -> experiment name -> reports.
results = {}

In [None]:
# Models to evaluate. The randomized estimators are seeded with `random_seed`
# (the same seed used for sampling) so that repeated runs give the same scores.
models = [
  ExtraTreesClassifier(random_state=random_seed),
  GradientBoostingClassifier(random_state=random_seed),
  RandomForestClassifier(random_state=random_seed),
  AdaBoostClassifier(random_state=random_seed),
  BernoulliNB(),  # takes no random_state
]

In [None]:
# Load the three JSON Lines files configured above into DataFrames.
train_df = read_dataset(inTrain)
test_df = read_dataset(inTest)
dataset_df = read_dataset(inDatasetTest)

In [None]:
# Encode labels: map the "model" column of the full training set to integers.
# The encoder's classes_ are used by the statistics cell that follows.
# NOTE(review): Y_train is reassigned after the training set is subsampled.
le = LabelEncoder()
Y_train = le.fit_transform(train_df["model"])

In [None]:
# Per-class counts and mean text length of the training set before subsampling.
dataset_stats(train_df, le.classes_)

In [None]:
# Keep max_instances_per_class randomly sampled rows for each "model" value (seeded).
# This overwrites train_df; the full set is read again before the final run.
train_df = train_df.groupby("model").sample(n=max_instances_per_class, random_state=random_seed)

In [None]:
# Encode labels: fit the encoder on the subsampled training "model" column,
# then apply the same mapping to the test and dataset splits.
le = LabelEncoder()
Y_train = le.fit_transform(train_df["model"])
Y_test = le.transform(test_df["model"])
Y_dataset = le.transform(dataset_df["model"])

In [None]:
# Statistics of the subsampled training set.
dataset_stats(train_df, le.classes_)

In [None]:
# Statistics of the test split (read from inTest).
dataset_stats(test_df, le.classes_)

In [None]:
# Statistics of the additional dataset (read from inDatasetTest).
dataset_stats(dataset_df, le.classes_)

In [None]:
# Baseline features: TF-IDF limited to max_features terms, fitted on the
# training texts and applied to the test and dataset texts.
vectorizer = TfidfVectorizer(max_features=max_features)
X_train, X_test, X_dataset = vectorize(vectorizer, train_df["text"], test_df["text"], dataset_df["text"])

In [None]:
# Baseline experiment: fit and evaluate every model on the plain TF-IDF features.
with parallel_backend('threading', n_jobs=multiprocessing.cpu_count()):
  for model in models:
    # setdefault instead of assigning a fresh dict: re-running this cell must not
    # discard the results already stored for the other experiments.
    results.setdefault(model.__class__.__name__, {})['baseline'] = evaluate(model, X_train, Y_train, X_test, Y_test, X_dataset, Y_dataset)

In [None]:
# Stop-word experiment features: same TF-IDF setup with stop_words='english'.
# Note that `vectorizer` is rebound here to the newly fitted instance.
vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
X_train_stopwords, X_test_stopwords, X_dataset_stopwords = vectorize(vectorizer, train_df["text"], test_df["text"], dataset_df["text"])

In [None]:
# Stop-word experiment: same models, features built with stop_words='english'.
with parallel_backend('threading', n_jobs=multiprocessing.cpu_count()):
  for model in models:
    results[model.__class__.__name__]['stopwords'] = evaluate(
        model,
        X_train_stopwords, Y_train,
        X_test_stopwords, Y_test,
        X_dataset_stopwords, Y_dataset,
    )

In [None]:
# N-gram experiment features: TF-IDF with ngram_range=(1,3), still capped at max_features.
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1,3))
X_train_ngrams, X_test_ngrams, X_dataset_ngrams = vectorize(vectorizer, train_df["text"], test_df["text"], dataset_df["text"])

In [None]:
# N-gram experiment: same models on the ngram_range=(1,3) features.
with parallel_backend('threading', n_jobs=multiprocessing.cpu_count()):
  for model in models:
    results[model.__class__.__name__]['ngrams'] = evaluate(
        model,
        X_train_ngrams, Y_train,
        X_test_ngrams, Y_test,
        X_dataset_ngrams, Y_dataset,
    )

In [None]:
# Hyper-parameter grids for GridSearchCV, keyed by estimator class name
# (looked up with model.__class__.__name__ in the grid-search cell).
params = {
  'ExtraTreesClassifier': {
    'n_estimators': [100, 150, 200],
    'criterion': ['gini', 'entropy', 'log_loss'],
  },
  'GradientBoostingClassifier': {
      # NOTE(review): scikit-learn documents 'exponential' loss as binary-only; if
      # the "model" column has more than two classes these candidates will fail
      # to fit — confirm and drop if so.
      'loss': ['log_loss', 'exponential'],
      'criterion': ['friedman_mse', 'squared_error'],
  },
  'RandomForestClassifier': {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [2, 5, 10],
  },
  'AdaBoostClassifier': {
      # None selects AdaBoost's default base estimator; the others are extra-trees ensembles.
      'estimator': [None, ExtraTreesClassifier(), ExtraTreesClassifier(criterion='log_loss', n_estimators=150)],
  },
  'BernoulliNB': {
      'alpha': [1.0, 0.01, 10.0],
  }
}

In [None]:
# Grid-search features: TF-IDF with ngram_range=(1,3).
# NOTE(review): same vectorizer settings and inputs as the n-gram experiment, so
# these matrices look like duplicates of X_*_ngrams — confirm whether refitting is needed.
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1,3))
X_train_cv, X_test_cv, X_dataset_cv = vectorize(vectorizer, train_df["text"], test_df["text"], dataset_df["text"])

In [None]:
# Grid-search experiment: wrap each model in GridSearchCV (scored by macro F1)
# using its grid from `params`, and store the reports under 'cv'.
with parallel_backend('threading', n_jobs=multiprocessing.cpu_count()):
  for model in models:
    name = model.__class__.__name__
    results[name]['cv'] = evaluate(
        GridSearchCV(model, params[name], scoring='f1_macro'),
        X_train_cv, Y_train,
        X_test_cv, Y_test,
        X_dataset_cv, Y_dataset,
        name=name,
    )

In [None]:
# Rebuild the models with the best hyper-parameters found by the grid search,
# for the final training run. The randomized estimators are seeded with
# `random_seed` (the grids above never set random_state) so the final scores
# are repeatable.
models = [
  ExtraTreesClassifier(random_state=random_seed, **results['ExtraTreesClassifier']['cv']['best_params']),
  GradientBoostingClassifier(random_state=random_seed, **results['GradientBoostingClassifier']['cv']['best_params']),
  RandomForestClassifier(random_state=random_seed, **results['RandomForestClassifier']['cv']['best_params']),
  AdaBoostClassifier(random_state=random_seed, **results['AdaBoostClassifier']['cv']['best_params']),
  BernoulliNB(**results['BernoulliNB']['cv']['best_params']),  # takes no random_state
]

In [None]:
# Read the files again: train_df was replaced by its subsample earlier, and the
# final run samples from the full training set.
train_df = read_dataset(inTrain)
test_df = read_dataset(inTest)
dataset_df = read_dataset(inDatasetTest)

In [None]:
# Raise the sample size to 5000 rows per "model" value for the final run.
# NOTE(review): the previous comment said "9k", which does not match n=5000 — confirm the intended size.
train_df = train_df.groupby("model").sample(n=5000, random_state=random_seed)

# Encode labels: fit on the new training sample, reuse the mapping for the other splits.
le = LabelEncoder()
Y_train = le.fit_transform(train_df["model"])
Y_test = le.transform(test_df["model"])
Y_dataset = le.transform(dataset_df["model"])

# Features for the final run: TF-IDF with ngram_range=(1,3).
# `vectorizer` and `le` as bound here are the objects pickled by the export cell.
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1,3))
X_train_full, X_test_full, X_dataset_full = vectorize(vectorizer, train_df["text"], test_df["text"], dataset_df["text"])

In [None]:
# Statistics of the training sample used for the final run.
dataset_stats(train_df, le.classes_)

In [None]:
# Final run: the rebuilt models on the larger training sample.
with parallel_backend('threading', n_jobs=multiprocessing.cpu_count()):
  for model in models:
    results[model.__class__.__name__]['full-datasets'] = evaluate(
        model,
        X_train_full, Y_train,
        X_test_full, Y_test,
        X_dataset_full, Y_dataset,
    )

In [None]:
def plot_results(model_names, scores, title):
  """Draw a grouped bar chart of one metric, save it as a PNG and show it.

  Args:
    model_names: x-axis labels, one per group of bars.
    scores: mapping of experiment name -> list of values, one per entry of
      ``model_names`` and in the same order. Each experiment is one bar series.
    title: figure title; also used as the PNG file name.

  The image is written to ``/content/mount/subtaskB/<title>.png``. The
  directory is created when missing, so saving does not fail on a fresh mount.
  """
  x = np.arange(len(model_names))  # the label locations
  width = 0.15  # the width of the bars

  fig, ax = plt.subplots(layout='constrained')

  # One bar series per experiment, each shifted right by one bar width.
  for position, (test, values) in enumerate(scores.items()):
    rects = ax.bar(x + width * position, values, width, label=test)
    ax.bar_label(rects, padding=3, labels=[f'{v:.2f}' for v in values], rotation='vertical')

  ax.set_title(title)
  # Centre each tick under its whole group of bars, whatever the number of series.
  group_centre = x + width * max(len(scores) - 1, 0) / 2
  ax.set_xticks(group_centre, model_names, rotation=30)
  ax.legend(loc='upper left', ncols=3)
  ax.set_ylim(0, 1.2)  # presumably headroom above 1.0 for the bar labels and legend

  output_dir = os.path.join('/content/mount', 'subtaskB')
  os.makedirs(output_dir, exist_ok=True)
  plt.savefig(os.path.join(output_dir, f'{title}.png'))
  plt.show()

In [None]:
# Empty score table: metric -> experiment -> list with one value per model.
scores = {
  metric: {
    experiment: []
    for experiment in ('baseline', 'stopwords', 'ngrams', 'cv', 'full-datasets')
  }
  for metric in ('f1', 'accuracy', 'precision', 'recall')
}
model_names = []

# Fill the table from the reports computed on the test split.
for model, tests in results.items():
  model_names.append(model)
  for test, result in tests.items():
    scores['f1'][test].append(result['test']['macro avg']['f1-score'])
    scores['accuracy'][test].append(result['test']['accuracy'])
    scores['precision'][test].append(result['test']['macro avg']['precision'])
    scores['recall'][test].append(result['test']['macro avg']['recall'])

In [None]:
# Macro F1 per model and experiment, from the scores collected in the preceding cell.
plot_results(model_names, scores['f1'], 'F1 macro scores')

In [None]:
# Accuracy per model and experiment, from the scores collected in the preceding cells.
plot_results(model_names, scores['accuracy'], 'Accuracy scores')

In [None]:
# Macro precision per model and experiment, from the scores collected in the preceding cells.
plot_results(model_names, scores['precision'], 'Precision scores')

In [None]:
# Macro recall per model and experiment, from the scores collected in the preceding cells.
plot_results(model_names, scores['recall'], 'Recall scores')

In [None]:
# One row per model with its final-run ('full-datasets') metrics, written as CSV.
text_export = [
  {
      'Modelo': model_name,
      'Accuracy': scores['accuracy']['full-datasets'][i],
      'Precision': scores['precision']['full-datasets'][i],
      'Recall': scores['recall']['full-datasets'][i],
      'F1-score': scores['f1']['full-datasets'][i],
  }
  for i, model_name in enumerate(model_names)
]
pd.DataFrame(text_export).to_csv('/content/mount/TaskB_SemEval.csv', index=False)

In [None]:
# Empty score table: metric -> experiment -> list with one value per model.
scores = {
  metric: {
    experiment: []
    for experiment in ('baseline', 'stopwords', 'ngrams', 'cv', 'full-datasets')
  }
  for metric in ('f1', 'accuracy', 'precision', 'recall')
}
model_names = []

# Fill the table from the reports computed on the additional dataset.
for model, tests in results.items():
  model_names.append(model)
  for test, result in tests.items():
    scores['f1'][test].append(result['dataset']['macro avg']['f1-score'])
    scores['accuracy'][test].append(result['dataset']['accuracy'])
    scores['precision'][test].append(result['dataset']['macro avg']['precision'])
    scores['recall'][test].append(result['dataset']['macro avg']['recall'])

In [None]:
# Dataset-split plot. The title doubles as the PNG file name, so it must differ
# from the test-split plot's title or that saved figure would be overwritten.
plot_results(model_names, scores['f1'], 'F1 macro scores (dataset)')

In [None]:
# Dataset-split plot. The title doubles as the PNG file name, so it must differ
# from the test-split plot's title or that saved figure would be overwritten.
plot_results(model_names, scores['accuracy'], 'Accuracy scores (dataset)')

In [None]:
# Dataset-split plot. The title doubles as the PNG file name, so it must differ
# from the test-split plot's title or that saved figure would be overwritten.
plot_results(model_names, scores['precision'], 'Precision scores (dataset)')

In [None]:
# Dataset-split plot. The title doubles as the PNG file name, so it must differ
# from the test-split plot's title or that saved figure would be overwritten.
plot_results(model_names, scores['recall'], 'Recall scores (dataset)')

In [None]:
# One row per model with its final-run ('full-datasets') metrics, written as CSV.
text_export = [
  {
      'Modelo': model_name,
      'Accuracy': scores['accuracy']['full-datasets'][i],
      'Precision': scores['precision']['full-datasets'][i],
      'Recall': scores['recall']['full-datasets'][i],
      'F1-score': scores['f1']['full-datasets'][i],
  }
  for i, model_name in enumerate(model_names)
]
pd.DataFrame(text_export).to_csv('/content/mount/TaskB_Dataset.csv', index=False)

In [None]:
# Persist the models, the TF-IDF vectorizer and the label names with pickle.
# NOTE(review): this saves whatever `models`, `vectorizer` and `le` are bound to
# when the cell runs — with in-order execution, the objects from the final run.
# NOTE(review): `os` is already imported in the notebook's import cell.
import pickle
import os

# Suffix used in every output path below.
subtask="B"

os.makedirs(f'/content/mount/models{subtask}', exist_ok=True)

# One pickle per model, named after its class.
for model in models:
  name = model.__class__.__name__
  with open(f'/content/mount/models{subtask}/{name}.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'/content/mount/vectorizer{subtask}.pkl', 'wb') as f:
  pickle.dump(vectorizer, f)

# Only the class-name array is stored, not the LabelEncoder itself.
with open(f'/content/mount/labels{subtask}.pkl', 'wb') as f:
  pickle.dump(le.classes_, f)