<a href="https://colab.research.google.com/github/Molter73/proyecto-computacion-1/blob/mauro%2Fcontainers/training/Entrenamiento_y_exportaci%C3%B3n_de_modelos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gdown==v4.6.3

![ ! -d /content/SemEval2024-Task8 ] && gdown --folder https://drive.google.com/drive/folders/14DulzxuH5TDhXtviRVXsH5e2JTY2POLi

In [None]:
pip install scikit-optimize

In [None]:
import logging
import os
import pickle

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline

In [None]:
# Dataset loading helper
def read_dataset(inFile):
    """Load a JSON Lines file into a pandas DataFrame.

    Prints the path being read (preceded by a blank line) before loading.

    Parameters
    ----------
    inFile : path or file-like accepted by ``pandas.read_json``
        Source with one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per JSON line.
    """
    print("\nReading:", inFile)
    return pd.read_json(inFile, lines=True)

In [None]:
subtask = "B"

# Settings per subtask, in the order:
# (training file, dev file, rows kept per class when downsampling, target column)
_subtask_settings = {
  "A": (
    '/content/SemEval2024-Task8/SubtaskA/subtaskA_train_monolingual.jsonl',
    '/content/SemEval2024-Task8/SubtaskA/subtaskA_dev_monolingual.jsonl',
    20000,
    "label",
  ),
  "B": (
    '/content/SemEval2024-Task8/SubtaskB/subtaskB_train.jsonl',
    '/content/SemEval2024-Task8/SubtaskB/subtaskB_dev.jsonl',
    9000,
    "model",
  ),
}

# Reject anything other than the two known subtasks: log, then abort the cell.
if subtask not in _subtask_settings:
  logging.error("Wrong subtask: {}. It should be A or B".format(subtask))
  raise ValueError("Wrong subtask: {}. It should be A or B".format(subtask))

inTrain, inTest, max_instances_per_class, target = _subtask_settings[subtask]

# The extra evaluation dataset lives at the same path for both subtasks.
inDatasetTest = '/content/mount/dataset.jsonl'

In [None]:
# Settings shared by both subtasks

max_features = 20_000  # cap on the number of features each TfidfVectorizer extracts
random_seed = 777  # fixed seed so the training-set downsampling is repeatable

In [None]:
# Load the training split, the dev split and the extra dataset, in that order.
train_df, test_df, dataset_df = (
    read_dataset(path) for path in (inTrain, inTest, inDatasetTest)
)

In [None]:
# Keep max_instances_per_class randomly chosen rows of every class so that
# training runs faster; the seed makes the selection repeatable.
# NOTE(review): sampling without replacement is expected to fail if a class
# has fewer rows than max_instances_per_class — confirm against the data.
train_df = (
    train_df
    .groupby(by=target)
    .sample(n=max_instances_per_class, random_state=random_seed)
)

In [None]:
# Model inputs: the raw "text" column of each frame.
X_train, X_test, X_dataset = (
    frame["text"] for frame in (train_df, test_df, dataset_df)
)

In [None]:
# Encode the target column as integer class ids. The encoder is fitted on the
# training labels only and then reused for the other two frames.
le = LabelEncoder()
Y_train = le.fit_transform(train_df[target])
Y_test, Y_dataset = (
    le.transform(frame[target]) for frame in (test_df, dataset_df)
)

In [None]:
# Display names for the classes, used in the reports and exported with the models.
# Subtask A uses fixed names (presumably 0 = human, 1 = machine — confirm against
# the data); any other subtask uses the classes learned by the label encoder.
id2label = ['human', 'machine'] if subtask == "A" else le.classes_

In [None]:
def _tfidf_pipeline(clf, ngram_range=(1, 2), use_idf=False):
  """Build a two-step pipeline: TF-IDF vectorizer ('tfidf') then classifier ('clf').

  Every pipeline shares the same vectorizer settings (``max_features`` cap and
  English stop words); only the n-gram range and IDF weighting vary per model.

  Parameters
  ----------
  clf : estimator
      Unfitted classifier placed in the 'clf' step.
  ngram_range : tuple of (int, int), default (1, 2)
      Passed to ``TfidfVectorizer``.
  use_idf : bool, default False
      Passed to ``TfidfVectorizer``.

  Returns
  -------
  sklearn.pipeline.Pipeline
      Unfitted pipeline with steps 'tfidf' and 'clf'.
  """
  return Pipeline([
    ('tfidf', TfidfVectorizer(max_features=max_features, stop_words="english", ngram_range=ngram_range, use_idf=use_idf)),
    ('clf', clf),
  ])


# Registry of candidate models: name -> {'pipeline': unfitted Pipeline}, plus
# the class names under "labels".
# Estimators that accept a `random_state` are seeded with `random_seed` so that
# repeated runs of the notebook produce the same fitted models and scores.
models = {
  'models': {
    'BernoulliNB': {
        'pipeline': _tfidf_pipeline(
          BernoulliNB(alpha=0.01, fit_prior=True),
          use_idf=True,
        ),
    },
    'MultinomialNB': {
        'pipeline': _tfidf_pipeline(
          MultinomialNB(alpha=1.0, fit_prior=True),
          ngram_range=(1, 1),
        ),
    },
    'ExtraTreesClassifier': {
        'pipeline': _tfidf_pipeline(
          ExtraTreesClassifier(criterion='entropy', n_estimators=100, random_state=random_seed),
          use_idf=True,
        ),
    },
    'PassiveAggressiveClassifier': {
        'pipeline': _tfidf_pipeline(
          PassiveAggressiveClassifier(C=0.5, max_iter=2000, random_state=random_seed),
        ),
    },
    'LinearSVC': {
        'pipeline': _tfidf_pipeline(
          LinearSVC(max_iter=1000, penalty='l2', random_state=random_seed),
        ),
    },
    'NearestCentroid': {
        'pipeline': _tfidf_pipeline(NearestCentroid()),
    },
    'Perceptron': {
        'pipeline': _tfidf_pipeline(
          Perceptron(alpha=0.0001, max_iter=1000, penalty='l1', random_state=random_seed),
        ),
    },
    'ComplementNB': {
        'pipeline': _tfidf_pipeline(ComplementNB(alpha=0.01, norm=True)),
    },
    'KNeighborsClassifier': {
        'pipeline': _tfidf_pipeline(KNeighborsClassifier(metric='euclidean')),
    },
  },
  "labels": id2label,
}

In [None]:
# Fit each registered pipeline on the training data, then print the macro F1
# and the per-class report, first for the dev split and then for the extra dataset.
for name, entry in models['models'].items():
  # Pipeline.fit returns the pipeline itself, so fitting and binding can be chained.
  pipeline = entry['pipeline'].fit(X_train, Y_train)

  # Dev split
  y_pred = pipeline.predict(X_test)
  score = f1_score(Y_test, y_pred, average="macro")
  print(
    f"Model: {name} Macro F1: {score}",
    classification_report(Y_test, y_pred, target_names=id2label),
    sep="\n",
  )

  # Extra dataset
  y_pred = pipeline.predict(X_dataset)
  score = f1_score(Y_dataset, y_pred, average="macro")
  print(
    f"Model: {name} Dataset Macro F1: {score}",
    classification_report(Y_dataset, y_pred, target_names=id2label),
    sep="\n",
  )

In [None]:
# Export every pipeline in `models` plus the label names under /content/mount,
# in locations suffixed with the subtask letter:
#   /content/mount/models<subtask>/<name>.pkl  - one file per pipeline
#   /content/mount/labels<subtask>.pkl         - the class names
# `os` and `pickle` are imported in the notebook's import cell.
# NOTE: unpickling can execute arbitrary code; only load these files from a
# trusted location.
models_dir = f'/content/mount/models{subtask}'
os.makedirs(models_dir, exist_ok=True)

for name, values in models['models'].items():
  with open(f'{models_dir}/{name}.pkl', 'wb') as f:
    pickle.dump(values['pipeline'], f)

with open(f'/content/mount/labels{subtask}.pkl', 'wb') as f:
  pickle.dump(models['labels'], f)