In [6]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

df= pd.read_csv(
    "/content/smileannotationsfinal.csv",
    names =["id", "text", "category"]
)
#set unique index to Id
df.set_index("id",inplace=True)

#print(df.head())
#print(df.text.iloc[0])
# print(df.category.value_counts())

#sacar nocode (emocion no determinada) y multi emociones con separador sad|disgust
df = df[~df.category.str.contains("\|")]
df = df[df.category != "nocode"]
print(df.category.value_counts())

#marcamos label_dict para emociones de 0 a 5 en este caso y
# lo ponemos en una columna extra y continuacion le cambiamos el nombre a label
possible_labels = df.category.unique()
label_dict= {}
for index, possible_label in enumerate (possible_labels):
    label_dict[possible_label]= index
print(label_dict)

df["label"]= df.category.replace(label_dict)
print(df.head(10))

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: category, dtype: int64
{'happy': 0, 'not-relevant': 1, 'angry': 2, 'disgust': 3, 'sad': 4, 'surprise': 5}
                                                                 text  \
id                                                                      
614484565059596288  Dorian Gray with Rainbow Scarf #LoveWins (from...   
614746522043973632  @SelectShowcase @Tate_StIves ... Replace with ...   
614877582664835073  @Sofabsports thank you for following me back. ...   
611932373039644672  @britishmuseum @TudorHistory What a beautiful ...   
611570404268883969  @NationalGallery @ThePoldarkian I have always ...   
614499696015503361  Lucky @FitzMuseum_UK! Good luck @MirandaStearn...   
613601881441570816  Yr 9 art students are off to the @britishmuseu...   
613696526297210880  @RAMMuseum Please vote for us as @sainsbury #s...   
610746718641102848  #AskTheGalle

In [7]:
#Training/Validation Split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=17,
    stratify=df.label.values #seleccione de cada caracteristica por igual
)
#nueva columna data_type
df["data_type"]=["not_set"]* df.shape[0]
print(df.head(10))

#statify para que saque x_train y X_val por igual de cada caracterisica
df.loc[X_train, "data_type"] = "train"
df.loc[X_val, "data_type"] = "val"

#agrupamos y contamos
df.groupby(["category", "label", "data_type"]).count()
print(df.head(30))

                                                                 text  \
id                                                                      
614484565059596288  Dorian Gray with Rainbow Scarf #LoveWins (from...   
614746522043973632  @SelectShowcase @Tate_StIves ... Replace with ...   
614877582664835073  @Sofabsports thank you for following me back. ...   
611932373039644672  @britishmuseum @TudorHistory What a beautiful ...   
611570404268883969  @NationalGallery @ThePoldarkian I have always ...   
614499696015503361  Lucky @FitzMuseum_UK! Good luck @MirandaStearn...   
613601881441570816  Yr 9 art students are off to the @britishmuseu...   
613696526297210880  @RAMMuseum Please vote for us as @sainsbury #s...   
610746718641102848  #AskTheGallery Have you got plans to privatise...   
612648200588038144               @BarbyWT @britishmuseum so beautiful   

                        category  label data_type  
id                                                 
614484565059596288 

In [8]:
#LOADING TOKENIZER AND ENCODING OUR DATA

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True
)

encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=="train"].text.values,
    add_special_tokens=True,
    #bert indica donde comienza y donde acaba sentence(frase)
    return_attention_mask=True, #para igualar la cantidad de las palabras en una frase
    #igualar la dimencion de todas frases
    pad_to_max_length=True,
    max_length=256, #longitud maxima de palabras
    return_tensors="pt" #pytorch
)
encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=="val"].text.values,
    add_special_tokens=True,
    #bert indica donde comienza y donde acaba sentence(frase)
    return_attention_mask=True, #para igualar la cantidad de las palabras en una frase
    #igualar la dimencion de todas frases
    pad_to_max_length=True,
    max_length=256, #longitud maxima de palabras
    return_tensors="pt" #pytorch
)
input_ids_train = encoded_data_train["input_ids"]#accedemos al diccionario
attention_masks_train = encoded_data_train["attention_mask"]
labels_train = torch.tensor(df[df.data_type =="train"].label.values)

input_ids_val = encoded_data_val["input_ids"]
attention_masks_val = encoded_data_val["attention_mask"]
labels_val = torch.tensor(df[df.data_type=="val"].label.values)

dataset_train = TensorDataset(input_ids_train,
                                attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val,
                                attention_masks_val, labels_val)

len(dataset_train)
print(len(dataset_train))
len(dataset_val)
print(len(dataset_val))


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1258
223


In [25]:

#SETTING UP BERT PRETRAIN MODEL
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
     num_labels = len(label_dict),
     output_attentions=False,
     output_hidden_states=False
)

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size=4 #32
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)
#dataloader_val = DataLoader(
    #dataset_val,
    #sampler=RandomSampler(dataset_train),
    #batch_size=32
#)
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),  # Usar SequentialSampler para el conjunto de validación
    batch_size=32
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
#SETTING UP OPTIMIZER AND SCHEDULE
from transformers  import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(
    model.parameters(),
    lr=1e-5, #2e-5>5e-5
    eps=1e-8 #epsilon
)
epochs = 10

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*epochs

)

In [27]:
#DEFINING OUR PERFORMANCE METRICS

import numpy as np
from sklearn.metrics import f1_score

#preds [1 0 0 0 0 0] predicir la probilidad
#check tutorial

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average="weighted")#para tener en cuenta que los clases estan desbalanceadas

def accurancy_per_class (preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}# key value debe ser separado por :

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]#numpy build indexing. solo escogemos preds_flat donde labels_flat es igual a label
        y_true = labels_flat[labels_flat==label]
        print(f"Class: {label_dict_inverse[label]}")
        print(f"Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n")



In [28]:
import random


seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    print(len(dataloader_val))


    for batch in dataloader_val:


        for tensor in batch:
            print(tensor.shape)



        batch = tuple(b.to(device) for b in batch)
        inputs = {
                    "input_ids":       batch[0],
                    "attention_mask":   batch[1],
                    "labels":           batch[2],
                    } #aqui tenia coma
        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]#use a logits as prediccion
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs["labels"].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc="Epoch {:1d}".format(epoch),
                        leave=False,
                        disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {
                "input_ids"      : batch[0],
                "attention_mask" : batch[1],
                "labels"         : batch[2]
                }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({"training_loss" : "{:.3f}".format(loss.item()/len(batch))})

    torch.save(model.state_dict(), "/content/models/Bert_ft_epoch{epoch}.model")


    tqdm.write("\nEpoch {epoch}")

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f"Training loss: {loss_train_avg}")

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f"Validation loss: {val_loss}")
    tqdm.write(f"F1 score (weighted): {val_f1}")




cuda


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.8098909631726288
7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])
Validation loss: 0.5818264058658055
F1 score (weighted): 0.7757112401676101


Epoch 2:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.4604430984735252
7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])
Validation loss: 0.5113449245691299
F1 score (weighted): 0.8099679545363836


Epoch 3:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.30637795564363757
7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])
Validation loss: 0.705620849771159
F1 score (weighted): 0.8583726690176599


Epoch 4:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.20992188282414442
7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])
Validation loss: 0.6022632909672601
F1 score (weighted): 0.8427508836167272


Epoch 5:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.13442399120309376
7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])
Validation loss: 0.6029221819979804
F1 score (weighted): 0.8632670452826223


Epoch 6:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.08005401790881204
7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])
Validation loss: 0.6456263491085598
F1 score (weighted): 0.8584764146852356


Epoch 7:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.04756818036474879
7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])
Validation loss: 0.7315621695348195
F1 score (weighted): 0.8687745678513286


Epoch 8:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.03499738769790542
7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])
Validation loss: 0.7061194096292768
F1 score (weighted): 0.8621090704018509


Epoch 9:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.02479973791688237
7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])
Validation loss: 0.7188654861279896
F1 score (weighted): 0.8531302527634135


Epoch 10:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.02412171077485832
7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])
Validation loss: 0.723776346870831
F1 score (weighted): 0.8531302527634135


In [29]:
#TRAINING Y EVALUATING OUR MODEL
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                            num_labels=len(label_dict),
                                                            output_attentions=False,
                                                            output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
model.to(device)
pass

In [31]:
model.load_state_dict(
    torch.load("/content/models/Bert_ft_epoch{epoch}.model",
    map_location=torch.device("cpu"))
)

<All keys matched successfully>

In [36]:
_, predictions, true_vals = evaluate(dataloader_val)

7
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([32, 256])
torch.Size([32, 256])
torch.Size([32])
torch.Size([31, 256])
torch.Size([31, 256])
torch.Size([31])


In [37]:
accurancy_per_class(predictions, true_vals)

Class: happy
Accuracy: 161/171

Class: not-relevant
Accuracy: 19/32

Class: angry
Accuracy: 6/9

Class: disgust
Accuracy: 0/1

Class: sad
Accuracy: 2/5

Class: surprise
Accuracy: 3/5



In [None]:
#batch_size=32