<a href="https://colab.research.google.com/github/Molter73/proyecto-computacion-1/blob/mauro%2Fnotebooks/Entrenamiento_validaci%C3%B3n_cruzada.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gdown==v4.6.3

![ ! -d /content/SemEval2024-Task8 ] && gdown --folder https://drive.google.com/drive/folders/14DulzxuH5TDhXtviRVXsH5e2JTY2POLi

In [None]:
pip install scikit-optimize

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB
from sklearn.ensemble import ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
import logging

In [None]:
def read_dataset(inFile):
    """Load a JSON Lines file into a pandas DataFrame.

    Prints the path being read, then parses the file with one JSON
    document per line.

    Args:
        inFile: path of the .jsonl file to load.

    Returns:
        The DataFrame produced by pandas.read_json(..., lines=True).
    """
    print("\nReading:", inFile)
    return pd.read_json(inFile, lines=True)

In [None]:
subtask = "A"

# Per-subtask settings: (training file, dev file, name of the label column).
_SUBTASK_CONFIG = {
  "A": (
    '/content/SemEval2024-Task8/SubtaskA/subtaskA_train_monolingual.jsonl',
    '/content/SemEval2024-Task8/SubtaskA/subtaskA_dev_monolingual.jsonl',
    "label",
  ),
  "B": (
    '/content/SemEval2024-Task8/SubtaskB/subtaskB_train.jsonl',
    '/content/SemEval2024-Task8/SubtaskB/subtaskB_dev.jsonl',
    "model",
  ),
}

# Reject anything other than the two known subtasks: log, then raise.
if subtask not in _SUBTASK_CONFIG:
  logging.error("Wrong subtask: {}. It should be A or B".format(subtask))
  raise ValueError("Wrong subtask: {}. It should be A or B".format(subtask))

inTrain, inTest, target = _SUBTASK_CONFIG[subtask]
# The extra evaluation file is the same path for both subtasks.
inDatasetTest = '/content/mount/dataset.jsonl'

In [None]:
# experiment configuration (sampling size, vocabulary size, seed)

max_instances_per_class = 2000 # number of training rows sampled per class when downsampling
max_features = 2000 # maximum number of features extracted for our instances
random_seed = 777 # set random seed for reproducibility

In [None]:
# Load the training split, the dev split and the additional dataset file,
# in that order.
train_df, test_df, dataset_df = (
  read_dataset(path) for path in (inTrain, inTest, inDatasetTest)
)

In [None]:
# Downsample the training data to train faster: draw max_instances_per_class
# rows from every class of `target`, seeded with random_seed.
# train_df is rebound to the sampled frame, so the full training set is no
# longer reachable under this name after the cell runs.
# NOTE(review): sampling is without replacement by default, so a class with
# fewer than max_instances_per_class rows would make pandas raise - confirm
# every class is large enough for both subtasks.
train_df = train_df.groupby(target).sample(n=max_instances_per_class, random_state=random_seed)

In [None]:
# The raw "text" column of each frame is the model input.
X_train, X_test, X_dataset = (
  frame["text"] for frame in (train_df, test_df, dataset_df)
)

In [None]:
# Encode the target column as integer class ids.
# The encoder is fitted on the (downsampled) training labels only; the dev and
# additional-dataset labels are then mapped with that same fitted encoder so
# all three arrays share one label -> id mapping.
# NOTE(review): presumably every label in test_df / dataset_df also occurs in
# the training sample - LabelEncoder.transform rejects unseen labels; confirm.
le = LabelEncoder()
Y_train = le.fit_transform(train_df[target])
Y_test = le.transform(test_df[target])
Y_dataset = le.transform(dataset_df[target])

In [None]:
# Vectorizer hyper-parameters searched for every model; fine_tune combines
# them with each model's own 'args' grid.
tfidf_args = {
  # Take the vocabulary cap from the config cell instead of repeating the literal.
  'tfidf__max_features': [max_features],
  'tfidf__stop_words': ["english"],
  'tfidf__ngram_range': [(1,1),(1,2)],
  'tfidf__use_idf': [True, False],
}


def _tfidf_pipeline(clf):
  """Return a new Pipeline with a fresh TfidfVectorizer ('tfidf') feeding `clf` ('clf')."""
  return Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', clf),
  ])


# Candidate models. Each entry holds:
#   'pipeline': the TF-IDF + classifier pipeline to tune
#   'args':     the classifier-specific parameter grid (keys prefixed 'clf__')
# Estimators that draw random numbers are seeded with random_seed so repeated
# runs of the search give the same results.
models = {
  'BernoulliNB': {
    'pipeline': _tfidf_pipeline(BernoulliNB()),
    'args': {
      'clf__alpha': [1.0, 0.01, 10.0],
      'clf__fit_prior': [True, False],
    },
  },
  'MultinomialNB': {
    'pipeline': _tfidf_pipeline(MultinomialNB()),
    'args': {
      'clf__alpha': [1.0, 0.01, 10.0],
      'clf__fit_prior': [True, False],
    },
  },
  'ExtraTreesClassifier': {
    'pipeline': _tfidf_pipeline(ExtraTreesClassifier(random_state=random_seed)),
    'args': {
      'clf__n_estimators': [100],
      'clf__criterion': ['gini', 'entropy', 'log_loss'],
    },
  },
  'PassiveAggressiveClassifier': {
    'pipeline': _tfidf_pipeline(PassiveAggressiveClassifier(random_state=random_seed)),
    'args': {
      'clf__C': [1.0, 0.5, 10.0],
      'clf__max_iter': [1000, 500, 2000],
    },
  },
  'LinearSVC': {
    # penalty='l1' is only supported by the primal solver (dual=False) with
    # the default squared-hinge loss; on the dual solver every 'l1' candidate
    # fails to fit and is scored as NaN by the grid search.
    'pipeline': _tfidf_pipeline(LinearSVC(dual=False)),
    'args': {
      'clf__penalty': ['l1', 'l2'],
      'clf__max_iter': [1000, 500, 2000],
    },
  },
  'NearestCentroid': {
    'pipeline': _tfidf_pipeline(NearestCentroid()),
    'args': {
      'clf__metric': ['euclidean', 'manhattan'],
    },
  },
  'Perceptron': {
    'pipeline': _tfidf_pipeline(Perceptron(random_state=random_seed)),
    'args': {
      'clf__penalty': ['l1', 'l2', 'elasticnet'],
      'clf__alpha': [0.0001, 0.1, 1.0],
      'clf__max_iter': [1000, 500, 2000],
    },
  },
  'ComplementNB': {
    'pipeline': _tfidf_pipeline(ComplementNB()),
    'args': {
      'clf__alpha': [1.0, 0.01, 10.0],
      'clf__norm': [True, False],
    },
  },
  'KNeighborsClassifier': {
    'pipeline': _tfidf_pipeline(KNeighborsClassifier()),
    'args': {
      'clf__n_neighbors': [5, 10, 2],
      'clf__weights': ['uniform', 'distance'],
    },
  },
}

In [None]:
def fine_tune(name, values):
  """Grid-search one model's pipeline and report its score on the dev split.

  Fits a GridSearchCV (macro-F1 scoring, all cores) on the module-level
  X_train / Y_train, then predicts X_test with the refitted best estimator
  and prints the best parameters, the macro F1 and the classification report
  against Y_test.

  Args:
    name: label printed for the model being tuned.
    values: dict with 'pipeline' (the estimator to tune) and 'args' (the
      model-specific parameter grid). It is not modified.

  Returns:
    None; all results are printed.
  """
  # Merge the shared vectorizer grid into a NEW dict. dict.update() returns
  # None and would also rewrite the caller's 'args' entry in place; on key
  # clashes tfidf_args still wins, as it did with update().
  param_grid = {**values['args'], **tfidf_args}
  print(f"Optimizando {name}")
  grid_search = GridSearchCV(values['pipeline'], param_grid, scoring='f1_macro', n_jobs=-1)
  grid_search.fit(X_train, Y_train)
  print(f'parameters: {grid_search.best_params_}')

  # Evaluate the best configuration on the held-out dev data.
  y_pred = grid_search.predict(X_test)
  score = f1_score(Y_test, y_pred, average="macro")
  print(f"Macro F1: {score}")
  print(classification_report(Y_test, y_pred))

In [None]:
# Tune every configured model in turn, in the order the dict declares them.
for name in models:
  values = models[name]
  fine_tune(name, values)

In [None]:
# Run the search for the NearestCentroid entry on its own.
# NOTE(review): the loop over `models` already tunes this entry - presumably
# this cell is kept for ad-hoc re-runs of a single model; confirm it is wanted.
values = models['NearestCentroid']
fine_tune('NearestCentroid', values)