# Load Pre-processed Data

In [1]:
import pandas as pd



In [2]:
from transformers import DistilBertTokenizerFast

In [3]:
from sklearn.model_selection import train_test_split



In [4]:
import torch

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier

In [6]:
import re
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

In [7]:
# Fetch the NLTK resources needed by the text-cleaning step; the tuple of
# per-resource success flags is shown as the cell output.
tuple(download(resource) for resource in ("stopwords", "punkt", "wordnet"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(True, True, True)

In [8]:
# Show whether PyTorch can see a CUDA device in this session.
torch.cuda.is_available()

False

In [9]:
# Location of the pre-processed inputs (relative to the notebook).
data_pre_processing_folder = "../data/pre-processing/"
data_pre_processing_name = "data.csv"
label_pre_processing_name = "label.csv"

# Directories the fine-tuned model and its tokenizer are written to.
model_path = "../models/classifier/"
tokenizer_path = "../models/tokenizer/"

In [11]:
# Read the "text" column of the data file and the "label" column of the
# label file as two separate Series.
data = pd.read_csv(f"{data_pre_processing_folder}{data_pre_processing_name}")["text"]
label = pd.read_csv(f"{data_pre_processing_folder}{label_pre_processing_name}")["label"]

In [12]:
epochs = 3  # number of full passes over the training loader during fine-tuning
num_labels = label.nunique()  # count of distinct label values; sizes the classification head
lr = 5e-5  # learning rate passed to the optimizer

In [13]:
# Hold out 20% of the data as the test set, stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    stratify=label,
    random_state=1,
)
# Take 25% of what is left (0.25 * 0.8 = 20% of the total) as validation,
# which yields a 60/20/20 train/validation/test split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=1,
)

# Naive Model

In [14]:
# The stop-word set and the lemmatizer are created once, on first use, and
# shared by every clean_text call instead of being rebuilt per document.
_CLEAN_TEXT_RESOURCES = {}


def clean_text(text):
    """Normalise a raw document for the TF-IDF baseline.

    The text is lower-cased, every non-word character is replaced by a space,
    whitespace runs are collapsed, English stop words are dropped and the
    remaining tokens are lemmatized with WordNet.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    stop_words = _CLEAN_TEXT_RESOURCES["stop_words"]
    lemmatize = _CLEAN_TEXT_RESOURCES["lemmatizer"].lemmatize

    # Lower-case everything.
    text = text.lower()

    # Replace unwanted symbols with spaces, then collapse whitespace runs.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # Drop stop words and lemmatize what remains.
    return ' '.join(
        lemmatize(word) for word in word_tokenize(text) if word not in stop_words
    )


In [15]:
# TF-IDF features for the baseline: the vocabulary and IDF weights are fitted
# on the cleaned training texts only and then reused for the other splits.
vectorizer = TfidfVectorizer()
X_train_v = vectorizer.fit_transform(X_train.apply(clean_text))

# Validation and test texts are cleaned the same way and only transformed.
X_val_v = vectorizer.transform(X_val.apply(clean_text))
X_test_v = vectorizer.transform(X_test.apply(clean_text))

In [16]:
# Baseline classifier. The seed is fixed (same value as the data splits) so
# that re-running the notebook fits the same ensemble.
classifier = AdaBoostClassifier(random_state=1)
# Train the classifier on the TF-IDF features of the training split.
classifier.fit(X_train_v, y_train)

In [17]:
# Report the baseline's score on the validation and test splits
# (for a scikit-learn classifier, `score` is the mean accuracy).
print("Naif Model - Validation metrics precision:", classifier.score(X_val_v, y_val))
print("Naif Model -  Test metrics precision:", classifier.score(X_test_v, y_test))

Naif Model - Validation metrics precision: 0.5369348436441088
Naif Model -  Test metrics precision: 0.5375605033881897


# Large Language Model - Fine tuning

In [None]:
# Hugging Face Hub id of the checkpoint used for the tokenizer and as the
# starting point for fine-tuning.
# NOTE(review): the load warning recorded further down mentions
# `distilbert-base-uncased`, so the saved outputs appear to come from a run
# with a different checkpoint — confirm which one is intended.
model_base_name = "distilbert-base-uncased-finetuned-sst-2-english"

In [None]:
# Load the fast tokenizer that belongs to the base checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_base_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Tokenize each split, truncating over-long texts and padding to a common length.
# NOTE(review): this rebinds X_train / X_val / X_test from text Series to
# tokenizer encodings, so re-running this cell (or the TF-IDF cell) afterwards
# would fail unless the split cell is run again first — consider separate
# names for the encodings.
X_train = tokenizer(X_train.to_list(), truncation=True, padding=True)
X_val = tokenizer(X_val.to_list(), truncation=True, padding=True)
X_test = tokenizer(X_test.to_list(), truncation=True, padding=True)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-example sequence and
    `labels` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor, then
        # attach the example's label under the 'labels' key.
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Wrap each split's encodings and labels. Labels are converted to plain lists
# so the dataset indexes them by position rather than by the pandas index.
train_dataset = Dataset(X_train, y_train.to_list())
val_dataset = Dataset(X_val, y_val.to_list())
test_dataset = Dataset(X_test, y_test.to_list())

In [None]:
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

In [None]:
# Prefer the GPU when one is available; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Start from the pre-trained checkpoint with a classification head sized for
# this dataset. The checkpoint name indicates it already carries a fine-tuned
# head, so when `num_labels` differs from that head's size the mismatching
# head weights are re-initialised instead of aborting the load with a
# size-mismatch error. When the sizes agree this flag changes nothing.
model = DistilBertForSequenceClassification.from_pretrained(
    model_base_name,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)
model.to(device)
# Switch to training mode; as the last expression this also displays the model.
model.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Mini-batches of 16 training examples, with shuffling enabled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [None]:
# Optimise every model parameter with AdamW at the configured learning rate.
# NOTE(review): transformers' own AdamW is deprecated in favour of
# torch.optim.AdamW — consider migrating.
optim = AdamW(model.parameters(), lr=lr)

# Put the model in training mode here as well: this cell ends in
# model.eval(), so re-running it would otherwise fine-tune with dropout off.
model.train()

for epoch in range(epochs):
    for batch in tqdm(train_loader):
        # Clear gradients left over from the previous step.
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing the labels makes the model compute the classification loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

# Switch to evaluation mode once training is done; as the last expression
# this also displays the model.
model.eval()



  0%|          | 0/1937 [00:00<?, ?it/s]

  0%|          | 0/1937 [00:00<?, ?it/s]

  0%|          | 0/1937 [00:00<?, ?it/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:

# Persist the fine-tuned weights/config and the tokenizer files.
# (`from_pt` is a loading option and has no effect when saving, so it is
# not passed here.)
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

In [None]:
# Reload the fine-tuned model from disk and place it on the selected device.
model = DistilBertForSequenceClassification.from_pretrained(model_path).to(device)

In [None]:
# Reload the tokenizer from the directory it was saved to.
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_path)

In [None]:
# Validation batches of 16; no shuffling is requested, so predictions come
# back in dataset order.
val_loader = DataLoader(val_dataset, batch_size=16)

In [None]:
def accuracy(outputs, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        outputs: 2-D tensor of per-class scores, one row per example.
        labels: 1-D tensor of true class indices.

    Returns:
        A 0-dim tensor holding correct / number of examples.
    """
    predicted = outputs.argmax(dim=1)
    n_correct = (predicted == labels).sum().item()
    return torch.tensor(n_correct / len(predicted))

In [None]:
import torch.nn.functional as F
from sklearn.metrics import f1_score

In [None]:
def validation(validation_dataloader):
    """Run the global `model` over a dataloader and collect its predictions.

    Args:
        validation_dataloader: Iterable of batches, each a mapping with
            'input_ids', 'attention_mask' and 'labels' tensors.

    Returns:
        A `(mean_accuracy, preds_list)` pair. `mean_accuracy` is a 0-dim
        tensor with the fraction of correctly classified examples (NaN when
        the loader yields no examples). `preds_list` holds one tensor of
        predicted class indices per batch, in iteration order.
    """
    preds_list = []
    correct = 0
    total = 0

    # Evaluate with dropout disabled, then put the model back in whatever
    # mode the caller had it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in tqdm(validation_dataloader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                logits = model(input_ids, attention_mask=attention_mask).logits
                # Softmax is monotonic, so the arg-max of the raw logits is
                # already the predicted class.
                preds = logits.argmax(dim=1)

                correct += (preds == labels).sum().item()
                total += labels.size(0)
                preds_list.append(preds)
    finally:
        if was_training:
            model.train()

    # Count examples instead of averaging per-batch accuracies, so a smaller
    # final batch is not over-weighted.
    mean_accuracy = torch.tensor(correct / total if total else float('nan'))
    return mean_accuracy, preds_list

In [None]:
# Score the model on the validation loader and keep its per-batch predictions.
mean_accuracy, preds_list = validation(val_loader)

In [None]:
import numpy as np

In [None]:
# Flatten the per-batch prediction tensors into a single NumPy array.
y_pred = np.concatenate([batch_preds.cpu().numpy() for batch_preds in preds_list], axis=0)

In [None]:
# Validation F1 under weighted, micro and macro averaging (in that order).
tuple(f1_score(y_val, y_pred, average=mode) for mode in ('weighted', 'micro', 'macro'))

In [None]:
# Test batches of 16; no shuffling is requested, so predictions come back in
# dataset order.
test_loader = DataLoader(test_dataset, batch_size=16)

In [None]:
# Score the model on the test loader; this overwrites the validation results
# held in `mean_accuracy` and `preds_list`.
mean_accuracy, preds_list = validation(test_loader)

In [None]:
# Flatten the per-batch test predictions into a single NumPy array.
y_pred = np.concatenate([batch_preds.cpu().numpy() for batch_preds in preds_list], axis=0)

In [None]:
# Test F1 under weighted, micro and macro averaging (in that order).
tuple(f1_score(y_test, y_pred, average=mode) for mode in ('weighted', 'micro', 'macro'))