# En este colab se finetunnea el modelo entrenado para entailment, sobre un dataset propio de saludos y ayuda, utilizando kfold.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/MAF-ProyectoNLP/Entrenamiento-Modelos/blob/main/Finetuning%20kFold%20DataSetPropio.ipynb)


*0*. Se instalan las bibliotecas necesarias

In [40]:
!pip install datasets
!pip install transformers



*1*. Se importan las dependencias

In [41]:
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import datasets
import numpy as np
import random
import pandas as pd
import gc

from transformers import  RobertaForSequenceClassification, RobertaConfig, RobertaTokenizer

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score

from transformers import AdamW

from sklearn.model_selection import KFold

from pydrive.auth import GoogleAuth
from google.colab import drive
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


In [42]:
torch.cuda.empty_cache()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

*2*. Se descarga el modelo entrenado, y se define la carpeta donde almacenar el modelo final

In [43]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

!mkdir "train"
file_id = '1jQq-kBYyn20G5JYmAqSF9WIwzBO8Sq5C'
download = drive.CreateFile({'id': file_id})
download.GetContentFile('train/config.json')
file_id ='19b0W3oVzq0UQvkZySJqjw-LwsVgqijqr'
download = drive.CreateFile({'id': file_id})
download.GetContentFile('train/pytorch_model.bin')

mkdir: cannot create directory ‘train’: File exists


In [44]:
carpeta='/content/drive/My Drive/Roberta-Large-BNE-TE-Kfold/kfold/'

In [45]:
MODEL_TYPE = "PlanTL-GOB-ES/roberta-large-bne-te"

In [46]:
MAX_LEN=256
BATCH_SIZE = 8
NUM_CORES=1
L_RATE = 1e-5

NUM_EPOCHS = 5
torch.manual_seed(555)

<torch._C.Generator at 0x7f7b3051e570>

In [47]:
dataset_conversaciones='/content/Entrenamiento-y-Validacion/dataset_propio_validation_set.csv'

Se define el kfold, dividiendo el dataset en 5

In [48]:
kfold=KFold(n_splits=5,shuffle=True)

*3*. TestDataset and CompDataset son las clases usadas para la tokenización de la frases, CompDataset se utiliza para entrenar (devuelve también el label), y TestDataset es utilizada para tokenizar las frases para validación

In [49]:
class TestDataset(Dataset):

    def __init__(self, df):
        self.df_data = df

    def __getitem__(self, index):
        sentence1 = self.df_data.loc[index, 'premise']
        sentence2 = self.df_data.loc[index, 'hypothesis']

        encoded_dict = tokenizer.encode_plus(
                    sentence1, sentence2,
                    add_special_tokens = True,
                    max_length = MAX_LEN,
                    pad_to_max_length = True,
                    return_attention_mask = True,
                    return_tensors = 'pt',
                    padding="max_length",
                    truncation=True
               )

        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        sample = (padded_token_list, att_mask)
        return sample

    def __len__(self):
        return len(self.df_data)

In [50]:
class CompDataset(Dataset):

    def __init__(self, df):
        self.df_data = df

    def __getitem__(self, index):
        sentence1 = self.df_data.loc[index, 'premise']
        sentence2 = self.df_data.loc[index, 'hypothesis']

        encoded_dict = tokenizer.encode_plus(
                    sentence1, sentence2,
                    add_special_tokens = True,
                    max_length = MAX_LEN,
                    pad_to_max_length = True,
                    return_attention_mask = True,
                    return_tensors = 'pt',
                    padding="max_length",
                    truncation=True
               )

        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        target = torch.tensor(self.df_data.loc[index, 'label'])

        sample = (padded_token_list, att_mask, target)
        return sample

    def __len__(self):
        return len(self.df_data)

*4*. Se carga el modelo

In [51]:
tokenizer = RobertaTokenizer.from_pretrained(MODEL_TYPE)

In [52]:
model_parameters =  RobertaForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels=3)
config = RobertaConfig(
                vocab_size= model_parameters.config.vocab_size,
                hidden_size= model_parameters.config.hidden_size,
                num_hidden_layers= model_parameters.config.num_hidden_layers,
                num_attention_heads=model_parameters.config.num_attention_heads,
                intermediate_size=model_parameters.config.intermediate_size,
                hidden_act=model_parameters.config.hidden_act,
                hidden_dropout_prob=model_parameters.config.hidden_dropout_prob,
                attention_probs_dropout_prob=model_parameters.config.attention_probs_dropout_prob,
                max_position_embeddings=model_parameters.config.max_position_embeddings,
                type_vocab_size=model_parameters.config.type_vocab_size,
                initializer_range=model_parameters.config.initializer_range,
            )

config.num_labels = 3

print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50262
}



In [53]:
model = RobertaForSequenceClassification(config)
model = RobertaForSequenceClassification.from_pretrained('/content/train/')

In [54]:
print(model.roberta.embeddings.word_embeddings.weight.shape)

torch.Size([50262, 1024])


In [55]:
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50262, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

*5*. Se define un conjunto de frases para validar el algoritmo sobre conversaciones de prueba

In [56]:
!git clone "https://github.com/MAF-ProyectoNLP/Entrenamiento-y-Validacion"

fatal: destination path 'Entrenamiento-y-Validacion' already exists and is not an empty directory.


*6*. Entrenamiento

In [57]:
optimizer = AdamW(model.parameters(),
              lr = L_RATE,
              eps = 1e-8
            )



In [58]:
df = pd.read_csv(dataset_conversaciones, encoding='Windows-1252', delimiter=",")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  148 non-null    int64 
 1   premise     148 non-null    object
 2   hypothesis  148 non-null    object
 3   label       148 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 4.8+ KB


In [59]:
df['label'].value_counts()

1    87
0    61
Name: label, dtype: int64

In [60]:
# Set the seed.
seed_val = 101

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

dataset = CompDataset(df)

for fold,(train_idx,test_idx) in enumerate(kfold.split(dataset)):
  print('\n\n========Fold no {}========'.format(fold))
  train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
  test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)

  train_dataloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=BATCH_SIZE, sampler=train_subsampler, num_workers=NUM_CORES)
  val_dataloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=BATCH_SIZE, sampler=test_subsampler,num_workers=NUM_CORES)

  print("Train ",len(train_dataloader)," Val ",len(val_dataloader))

  # For each epoch...
  for epoch in range(0, NUM_EPOCHS):

      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))


      stacked_val_labels = []
      targets_list = []

      # ========================================
      #               Training
      # ========================================

      print('Training...')

      # put the model into train mode
      model.train()

      # This turns gradient calculations on and off.
      torch.set_grad_enabled(True)


      # Reset the total loss for this epoch.
      total_train_loss = 0

      for i, batch in enumerate(train_dataloader):

          train_status = 'Batch ' + str(i) + ' of ' + str(len(train_dataloader))

          print(train_status, end='\r')


          b_input_ids = batch[0].to(device)
          b_input_mask = batch[1].to(device)
          b_labels = batch[2].to(device)

          model.zero_grad()


          outputs = model(b_input_ids,
                      attention_mask=b_input_mask,
                      labels=b_labels)

          # Get the loss from the outputs tuple: (loss, logits)
          loss = outputs[0]

          # Convert the loss from a torch tensor to a number.
          # Calculate the total loss.
          total_train_loss = total_train_loss + loss.item()

          # Zero the gradients
          optimizer.zero_grad()

          # Perform a backward pass to calculate the gradients.
          loss.backward()


          # Clip the norm of the gradients to 1.0.
          # This is to help prevent the "exploding gradients" problem.
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)



          # Use the optimizer to update the weights.

          # Optimizer for GPU
          optimizer.step()


      print('Train loss:' ,total_train_loss)

      # ========================================
      #               Validation
      # ========================================

      print('\nValidation...')

      # Put the model in evaluation mode.
      model.eval()

      # Turn off the gradient calculations.
      # This tells the model not to compute or store gradients.
      # This step saves memory and speeds up validation.
      torch.set_grad_enabled(False)

      # Reset the total loss for this epoch.
      total_val_loss = 0


      for j, batch in enumerate(val_dataloader):

          val_status = 'Batch ' + str(j) + ' of ' + str(len(val_dataloader))

          print(val_status, end='\r')

          b_input_ids = batch[0].to(device)
          b_input_mask = batch[1].to(device)
          b_labels = batch[2].to(device)


          outputs = model(b_input_ids,
                  attention_mask=b_input_mask,
                  labels=b_labels)

          # Get the loss from the outputs tuple: (loss, logits)
          loss = outputs[0]

          # Convert the loss from a torch tensor to a number.
          # Calculate the total loss.
          total_val_loss = total_val_loss + loss.item()

          # Get the preds
          preds = outputs[1]

          # Move preds to the CPU
          val_preds = preds.detach().cpu().numpy()

          # Move the labels to the cpu
          targets_np = b_labels.to('cpu').numpy()

          # Append the labels to a numpy list
          targets_list.extend(targets_np)

          if j == 0:  # first batch
              stacked_val_preds = val_preds

          else:
              stacked_val_preds = np.vstack((stacked_val_preds, val_preds))


      # Calculate the validation accuracy
      y_true = targets_list
      y_pred = np.argmax(stacked_val_preds, axis=1)


      y_true = np.array(y_true)

      y_pred[y_pred==2] = 1

      val_acc = accuracy_score(y_true, y_pred)

      prec_sc = precision_score(y_true, y_pred)
      rec_sc = recall_score(y_true, y_pred)

      f1_sc = f1_score(y_true, y_pred)


      print('Val loss:' ,total_val_loss)
      print('Val acc: ', val_acc)
      print('\n\nPrecision: ', prec_sc)
      print('Recall: ', rec_sc)
      print('F1: ', f1_sc)


      if fold ==4 and epoch ==4:
        # Save the Model
        model.save_pretrained(carpeta)

      # Use the garbage collector to save memory.
      gc.collect()



Train  15  Val  4

Training...
Train loss: 6.924230232834816

Validation...
Val loss: 3.1304752826690674
Val acc:  0.7666666666666667


Precision:  0.7777777777777778
Recall:  0.8235294117647058
F1:  0.7999999999999999

Training...
Train loss: 2.988842346239835

Validation...
Val loss: 3.9976515262387693
Val acc:  0.8


Precision:  0.7894736842105263
Recall:  0.8823529411764706
F1:  0.8333333333333333

Training...
Train loss: 1.2445435036206618

Validation...
Val loss: 2.8522619060240686
Val acc:  0.8333333333333334


Precision:  0.8
Recall:  0.9411764705882353
F1:  0.8648648648648648

Training...
Train loss: 1.0415815755259246

Validation...
Val loss: 3.059456020593643
Val acc:  0.8


Precision:  0.7894736842105263
Recall:  0.8823529411764706
F1:  0.8333333333333333

Training...
Train loss: 0.017492094659246504

Validation...
Val loss: 3.1991402862477116
Val acc:  0.8


Precision:  0.7894736842105263
Recall:  0.8823529411764706
F1:  0.8333333333333333


Train  15  Val  4

Training..