In [None]:
!pip install transformers
!pip install git+https://github.com/huggingface/peft.git
!pip install bitsandbytes
!pip install evaluate

## Imports  

In [2]:
#General imports
import numpy as np 
import pandas as pd 
import gdown 
import os
import requests
import string
import re 
import shutil
from utils import *


from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm



#Related to transformers and models
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    TextClassificationPipeline,
)

from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch 
import torch.nn as nn
import random


#Result imports
import matplotlib.pyplot as plt


# Global constants
SEED = 42


def set_seed(seed_value=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    then switches cuDNN to deterministic mode with benchmarking disabled.

    Args:
        seed_value: Integer seed handed to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(SEED)




Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /usr/local/lib/python3.8/dist-packages/bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


## Descarga del modelo a fine tunear 

In [3]:
# Hugging Face Hub identifier of the checkpoint to fine-tune
model_name = "bhadresh-savani/bert-base-go-emotion"

# Load tokenizer and classifier from the same identifier so the two can never
# point at different checkpoints.
tokenizer = AutoTokenizer.from_pretrained(model_name)

modelo_base = AutoModelForSequenceClassification.from_pretrained(model_name)

## Descarga y tratamiento de los datos 

In [4]:
# The dataset file is hosted at an external link
url = "https://raw.githubusercontent.com/PoorvaRane/Emotion-Detector/master/ISEAR.csv"
output_file = "ISEAR.csv"

destination_folder = "data"
# Create the target folder when it is missing (no error if it already exists)
os.makedirs(destination_folder, exist_ok=True)
csv_path = os.path.join(destination_folder, output_file)

# Fail loudly on an HTTP error instead of silently saving an error page as the CSV;
# the timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Write straight to the final location so re-running the cell simply overwrites it
with open(csv_path, 'wb') as f:
    f.write(response.content)

# Load and preprocess the dataset
df = load_and_preprocess_data(csv_path)
# Normalize the misspelled 'guit' label to 'guilt'
df['Emotion'] = df['Emotion'].replace('guit', 'guilt')
df

Unnamed: 0,Emotion,Text,Text_processed
0,joy,On days when I feel close to my partner and ot...,on days when i feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...,every time i imagine that someone i love or i ...
2,anger,When I had been obviously unjustly treated and...,when i had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...,when i think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...,at a gathering i found myself involuntarily si...
...,...,...,...
7511,shame,Two years back someone invited me to be the tu...,two years back someone invited me to be the tu...
7512,shame,I had taken the responsibility to do something...,i had taken the responsibility to do something...
7513,fear,I was at home and I heard a loud sound of spit...,i was at home and i heard a loud sound of spit...
7514,guilt,I did not do the homework that the teacher had...,i did not do the homework that the teacher had...


## Probamos el modelo para ver de cuántas clases dispone

In [5]:
# Build a pipeline that reports the score of every label, not only the best one
pipe = TextClassificationPipeline(model=modelo_base, tokenizer=tokenizer, return_all_scores=True)
resultados = pipe("fuck you leave me alone")
print(resultados)

# Total number of scored entries across all returned results
accum = sum(len(elemento) for elemento in resultados)
print(accum)

[[{'label': 'admiration', 'score': 0.0022283862344920635}, {'label': 'amusement', 'score': 0.00207656342536211}, {'label': 'anger', 'score': 0.6765081286430359}, {'label': 'annoyance', 'score': 0.11713651567697525}, {'label': 'approval', 'score': 0.006938391365110874}, {'label': 'caring', 'score': 0.008048061281442642}, {'label': 'confusion', 'score': 0.0015102234901860356}, {'label': 'curiosity', 'score': 0.0022445875220000744}, {'label': 'desire', 'score': 0.0015093624824658036}, {'label': 'disappointment', 'score': 0.018241364508867264}, {'label': 'disapproval', 'score': 0.016932494938373566}, {'label': 'disgust', 'score': 0.04833042994141579}, {'label': 'embarrassment', 'score': 0.0041035753674805164}, {'label': 'excitement', 'score': 0.002686750376597047}, {'label': 'fear', 'score': 0.0032824971713125706}, {'label': 'gratitude', 'score': 0.0020939591340720654}, {'label': 'grief', 'score': 0.002857409417629242}, {'label': 'joy', 'score': 0.0016693678917363286}, {'label': 'love', 's



Como observamos, este modelo tiene 28 clases (las habituales del dataset GoEmotions). Ahora tenemos varias opciones: podemos intentar modificar esta última capa para trabajar directamente con los logits, lo cual es numéricamente más estable, o añadir una capa final de clasificación después de esta para clasificar entre las emociones de Ekman que estamos utilizando en nuestro estudio.

No sabemos el método que han usado para clasificar las emociones. Por tanto lo primero que vamos a hacer va a ser realizar un entrenamiento básico añadiendo una capa final para clasificar lo que nos interesa

## Exploración del modelo 

In [6]:
# Show the pretrained model's configuration followed by its number of labels
print(modelo_base.config, modelo_base.num_labels, sep="\n")

BertConfig {
  "_name_or_path": "bhadresh-savani/bert-base-go-emotion",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMultilabelSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dim": 768,
  "dropout": 0.1,
  "hidden_act": "gelu",
  "hidden_dim": 3072,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "admiration",
    "1": "amusement",
    "2": "anger",
    "3": "annoyance",
    "4": "approval",
    "5": "caring",
    "6": "confusion",
    "7": "curiosity",
    "8": "desire",
    "9": "disappointment",
    "10": "disapproval",
    "11": "disgust",
    "12": "embarrassment",
    "13": "excitement",
    "14": "fear",
    "15": "gratitude",
    "16": "grief",
    "17": "joy",
    "18": "love",
    "19": "nervousness",
    "20": "optimism",
    "21": "pride",
    "22": "realization",
    "23": "relief",
    "24": "remorse",
    "25": "sadness",
    "26": "surprise",
 

El objeto `BertConfig` contiene los parámetros y configuraciones necesarios para el modelo BERT utilizado en la clasificación de secuencias con múltiples etiquetas. A continuación se presenta una explicación de los campos clave en la configuración:

- `hidden_size` y `num_attention_heads`: Estos parámetros determinan la dimensionalidad de los vectores ocultos y el número de cabezas de atención en el modelo BERT, respectivamente.

- `num_hidden_layers`: Indica el número de capas ocultas en el modelo. Aumentar este valor puede aumentar la capacidad del modelo para aprender representaciones más complejas.

- `dropout`: Controla la tasa de abandono (dropout) aplicada a las salidas de las capas ocultas para evitar el sobreajuste y mejorar la generalización.

- `initializer_range`: Especifica el rango de inicialización para los pesos del modelo.

- `max_position_embeddings`: Define la longitud máxima permitida de las secuencias de entrada.

- `vocab_size`: Indica el tamaño del vocabulario utilizado por el modelo.

Estos campos son cruciales para configurar correctamente el modelo BERT en la clasificación de secuencias con múltiples etiquetas. Sin embargo, también debemos considerar otros aspectos como el modelo preentrenado, el conjunto de datos y los hiperparámetros específicos del entrenamiento.


In [7]:
texto = "I love this movie!"
inputs = tokenizer(texto, return_tensors="pt", truncation=True, padding=True)

# Pure inference: disable gradient tracking so no autograd graph is built
with torch.no_grad():
    outputs = modelo_base(**inputs)

# Get the logits
logits = outputs.logits

print(logits)

tensor([[-2.2273, -4.7028, -5.9401, -5.2675, -3.2153, -4.5923, -5.4052, -4.9632,
         -4.6493, -5.4046, -5.5101, -6.3988, -6.0861, -3.6266, -6.5459, -4.0359,
         -7.2671, -2.7496,  1.5563, -7.0032, -4.6198, -6.0098, -4.3959, -6.7773,
         -6.5881, -5.3570, -5.3487, -3.5594]], grad_fn=<AddmmBackward0>)


Ahora queremos obtener cuál es la clase más probable según el modelo.

In [8]:
# Normalize the logits along the label dimension
probabilidades = torch.softmax(logits, dim=1)

# Position of the highest probability and the label name stored for it in the config
clase_predicha_idx = probabilidades.detach().numpy().argmax()
clase_predicha = modelo_base.config.id2label[clase_predicha_idx]

print("Clase predicha (índice):", clase_predicha_idx)
print("Clase predicha (texto):", clase_predicha)

Clase predicha (índice): 18
Clase predicha (texto): love


## Creación de la clase  

In [9]:
import torch.nn as nn

class MiModelo(nn.Module):
    """Pretrained classifier followed by a small trainable classification head.

    Element ``0`` of the base model's output is passed through
    dropout -> linear -> ReLU -> linear to produce ``num_clases`` logits.

    Args:
        modelo_base: Pretrained model, called as
            ``modelo_base(input_ids=..., attention_mask=...)``. Element ``0`` of
            its output is assumed to be the classification logits — confirm when
            swapping in a different base model.
        num_clases: Number of target classes (width of the final layer).
        hidden_dim: Width of the intermediate layer of the head.
        dropout: Dropout probability applied to the base model's output.
    """

    # Head input width used when the base model exposes no ``config.num_labels``.
    DEFAULT_BASE_OUTPUT_DIM = 28

    def __init__(self, modelo_base, num_clases, hidden_dim=14, dropout=0.5):
        super().__init__()

        # Pretrained base model
        self.modelo_base = modelo_base

        # Size the head from the base model's label count rather than a fixed 28,
        # so a base model with a different number of labels still fits.
        base_config = getattr(modelo_base, "config", None)
        base_output_dim = (
            getattr(base_config, "num_labels", None) or self.DEFAULT_BASE_OUTPUT_DIM
        )

        # Dropout on the base model's output to reduce overfitting
        self.dropout = nn.Dropout(dropout)

        # First fully connected layer: base output -> hidden_dim
        self.fc1 = nn.Linear(base_output_dim, hidden_dim)

        # ReLU activation
        self.activacion = nn.ReLU()

        # Second fully connected layer: hidden_dim -> number of classes
        self.fc2 = nn.Linear(hidden_dim, num_clases)

    def forward(self, input_ids, attention_mask):
        """Return the head's logits for a batch of tokenized inputs.

        Args:
            input_ids: Token ids forwarded to the base model.
            attention_mask: Attention mask forwarded to the base model.

        Returns:
            Tensor of logits whose last dimension is ``num_clases``.
        """
        outputs = self.modelo_base(input_ids=input_ids, attention_mask=attention_mask)

        # First element of the base model's output (see the class docstring)
        base_logits = outputs[0]

        hidden = self.fc1(self.dropout(base_logits))
        hidden = self.activacion(hidden)

        return self.fc2(hidden)

    def training_step(self, inputs, targets):
        """Compute the cross-entropy loss for one batch.

        Args:
            inputs: Object exposing ``input_ids`` and ``attention_mask`` attributes.
            targets: Target class indices for the batch.

        Returns:
            Scalar loss tensor.
        """
        logits = self.forward(inputs.input_ids, inputs.attention_mask)

        loss_fn = nn.CrossEntropyLoss()

        return loss_fn(logits, targets)


In [10]:
# Number of target classes for this first wrapper around the pretrained model
num_clases = 7
mi_modelo = MiModelo(modelo_base, num_clases)


### Entrenamiento del modelo 

In [11]:
# Encode the emotion names as integer class ids; `le` keeps the fitted mapping
le = LabelEncoder()
labels = le.fit_transform(df['Emotion'].tolist())

# Tokenize every preprocessed text in one call, padding to the longest sequence
inputs = tokenizer.batch_encode_plus(
    df['Text_processed'].tolist(),
    padding='longest',
    truncation=True,
    max_length=512,
    return_tensors="pt",
)


In [12]:
class MyDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example indexable container
            (tensor or nested list).
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        """Return ``value`` as a freshly allocated tensor."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning on
            # every item; clone().detach() is the recommended way to copy.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``labels`` entry."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the labels sequence."""
        return len(self.labels)

# Build the dataset from the tokenized texts and the encoded labels
dataset = MyDataset(encodings=inputs, labels=labels)


In [13]:
# 70% train, 15% validation, and whatever remains for test
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

# A dedicated seeded generator makes the split identical every time this cell
# runs, rather than depending on how much global RNG state was already consumed
# (re-running the cell would otherwise move test examples into training).
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)


In [14]:
# Batches of 16 for every split; only the training loader shuffles its examples
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)


In [15]:
# One output per distinct label seen by the encoder
num_clases = len(le.classes_)
model = MiModelo(modelo_base, num_clases)

# NOTE: despite their names, these count parameter tensors of the base model
# (as yielded by .parameters()), not layers. Only the last two tensors stay
# trainable; every earlier one is frozen.
total_layers = sum(1 for _ in model.modelo_base.parameters())
unfrozen_layers = total_layers - 2

for i, param in enumerate(model.modelo_base.parameters()):
    param.requires_grad = i >= unfrozen_layers

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()


In [16]:
# Fresh Adam optimizer over the model's parameters plus the cross-entropy loss.
# Running this cell replaces any previously created optimizer and its state.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

In [1]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 300
# Patience: stop after this many consecutive epochs without a better validation
# loss. The loop referenced this name without ever defining it (NameError);
# 5 is a starting value to tune.
early_stop_epochs = 5
best_val_loss = float('inf')
epochs_no_improve = 0

train_losses = []
val_losses = []
val_accuracies = []

model.to(device)


def _split_batch(batch):
    """Move a batch to `device` and split it into model kwargs and labels.

    Only `input_ids` and `attention_mask` are forwarded because those are the
    only arguments `MiModelo.forward` accepts; any other tokenizer field in the
    batch would raise a TypeError.
    """
    model_inputs = {
        "input_ids": batch["input_ids"].to(device),
        "attention_mask": batch["attention_mask"].to(device),
    }
    return model_inputs, batch["labels"].to(device)


for epoch in range(num_epochs):

    # ---- Training pass ----
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        # Batch-local names: the dataset-level `inputs` / `labels` stay intact
        batch_inputs, batch_labels = _split_batch(batch)
        optimizer.zero_grad()
        outputs = model(**batch_inputs)
        loss = loss_fn(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_losses.append(train_loss/len(train_loader))

    # ---- Validation pass ----
    model.eval()
    val_loss = 0
    # Tensor accumulator so `.double()` below works even with zero batches
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    with torch.no_grad():
        for batch in val_loader:
            batch_inputs, batch_labels = _split_batch(batch)
            outputs = model(**batch_inputs)
            loss = loss_fn(outputs, batch_labels)
            val_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            correct_predictions += torch.sum(preds == batch_labels)
        val_accuracy = correct_predictions.double() / len(val_dataset)
        val_accuracies.append(val_accuracy)
        val_losses.append(val_loss/len(val_loader))
    # The printed losses are sums over batches; the stored histories are per-batch means
    print(f'epoch: {epoch} train_loss {train_loss} val_loss {val_loss} val_accuracy {val_accuracy}')

    # ---- Early stopping on the validation loss ----
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        # `>=` also stops if the patience is lowered below the current count
        if epochs_no_improve >= early_stop_epochs:
            print('Early stopping!')
            break


SyntaxError: EOL while scanning string literal (1088397419.py, line 42)

In [None]:
# Two side-by-side panels: loss curves on the left, validation accuracy on the right
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 5))

ax_loss.plot(train_losses, label='Training Loss')
ax_loss.plot(val_losses, label='Validation Loss')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

# Accuracies were stored as tensors, so bring each one to the CPU before plotting
ax_acc.plot([acc.cpu() for acc in val_accuracies], label='Validation Accuracy')
ax_acc.set_xlabel('Epochs')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

fig.tight_layout()
plt.show()

In [None]:
# Hyperparameters for the LoRA fine-tuning run
lr = 1e-3
batch_size = 16
num_epochs = 10


tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# LoRA adapter settings. The base checkpoint is a sequence classifier, so the
# task type is SEQ_CLS, not CAUSAL_LM (which is meant for generative language models).
# NOTE(review): when wrapping a custom nn.Module instead of the Hugging Face
# model itself, explicit `target_modules` / `modules_to_save` may be needed — confirm.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="all",
)



In [None]:
# Upload to the Hugging Face Hub repository "RikoteMaster/Bert_peft".
# NOTE(review): `push_to_hub` is called on the `MiModelo` class itself, which in
# this notebook is a plain `nn.Module` subclass that defines no such method, so
# this call looks like it will raise AttributeError. Confirm the intended target
# (presumably a model instance that provides `push_to_hub`).
MiModelo.push_to_hub(repo_id="RikoteMaster/Bert_peft")

In [None]:
# Wrap the custom model with the adapter configuration and report how many of
# its parameters remain trainable
model = get_peft_model(model=mi_modelo, peft_config=peft_config)
model.print_trainable_parameters()