# Transfer Learning

In [100]:
import math
import pandas as pd
import numpy as np
import warnings
import spacy
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


import transformers
from tokenizers import BertWordPieceTokenizer
from keras.utils import to_categorical

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertModel, DistilBertTokenizer
import torch.nn as nn
import torch.optim as optim

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


## Load data

In [2]:
df_train = pd.read_csv('data/dados_treino.txt', header=None, delimiter=';')
df_test = pd.read_csv('data/dados_teste.txt', header=None, delimiter=';')

df_train = df_train.rename(columns= {0: 'text', 1: 'feeling'})
df_test = df_test.rename(columns= {0: 'text', 1: 'feeling'})

print(df_train.shape)
print(df_test.shape)

display(df_train.head(3))
display(df_test.head(3))

(16000, 2)
(2000, 2)


Unnamed: 0,text,feeling
0,i am feeling completely overwhelmed i have two strategies that help me to feel grounded pour my heart out in my journal in the form of a letter to god and then end with a list of five things i am most grateful for,fear
1,i have the feeling she was amused and delighted,joy
2,i was able to help chai lifeline with your support and encouragement is a great feeling and i am so glad you were able to help me,joy


Unnamed: 0,text,feeling
0,i feel like my only role now would be to tear your sails with my pessimism and discontent,sadness
1,i feel just bcoz a fight we get mad to each other n u wanna make a publicity n let the world knows about our fight,anger
2,i feel like reds and purples are just so rich and kind of perfect,joy


In [3]:
df_train['feeling'].value_counts()

feeling
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [4]:
df_test['feeling'].value_counts()

feeling
joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64

> The column **text** is going to be the input feature and **feeling** is going to be the output target.

## Data preprocessing using Spacy

[Oficial site](https://spacy.io/).

In [12]:
# !python -m spacy download en_core_web_md -q

In [13]:
# Load the dict
spacy_nlp = spacy.load('en_core_web_md')

In [14]:
# Definition of the 'data_preprocessing' function that receives a text as a parameter
def data_preprocessing(text):

    # Process the text using the dictionary
    doc = spacy_nlp(text)

    # Creates a list of lemmas from the tokens, converted to lowercase and without whitespace,
    # excluding words that are stopwords
    tokens = [token.lemma_.lower().strip() for token in doc if not token.is_stop]

    # Returns the processed tokens as a single string, joining them with spaces
    return ' '.join(tokens)

In [15]:
df_train['transformed_text'] = df_train['text'].apply(data_preprocessing)
df_test['transformed_text'] = df_test['text'].apply(data_preprocessing)

In [16]:
display(df_train.head())
display(df_test.head())

Unnamed: 0,text,feeling,transformed_text
0,i am feeling completely overwhelmed i have two strategies that help me to feel grounded pour my heart out in my journal in the form of a letter to god and then end with a list of five things i am most grateful for,fear,feel completely overwhelmed strategy help feel ground pour heart journal form letter god end list thing grateful
1,i have the feeling she was amused and delighted,joy,feeling amuse delight
2,i was able to help chai lifeline with your support and encouragement is a great feeling and i am so glad you were able to help me,joy,able help chai lifeline support encouragement great feeling glad able help
3,i already feel like i fucked up though because i dont usually eat at all in the morning,anger,feel like fuck not usually eat morning
4,i still love my so and wish the best for him i can no longer tolerate the effect that bm has on our lives and the fact that is has turned my so into a bitter angry person who is not always particularly kind to the people around him when he is feeling stressed,sadness,love wish good long tolerate effect bm life fact turn bitter angry person particularly kind people feel stress


Unnamed: 0,text,feeling,transformed_text
0,i feel like my only role now would be to tear your sails with my pessimism and discontent,sadness,feel like role tear sail pessimism discontent
1,i feel just bcoz a fight we get mad to each other n u wanna make a publicity n let the world knows about our fight,anger,feel bcoz fight mad n u wanna publicity n let world know fight
2,i feel like reds and purples are just so rich and kind of perfect,joy,feel like red purple rich kind perfect
3,im not sure the feeling of loss will ever go away but it may dull to a sweet feeling of nostalgia at what i shared in this life with my dad and the luck i had to have a dad for years,sadness,m sure feeling loss away dull sweet feeling nostalgia share life dad luck dad year
4,i feel like ive gotten to know many of you through comments and emails and for that im appreciative and glad you are a part of this little space,joy,feel like ve get know comment email m appreciative glad little space


---

## Fine Tuning of Pre-Trained Transformer Model

We will use the data processed with SpaCy and then perform the specific processing for the BERT model, just as we did with the model in version 1.

https://huggingface.co/distilbert-base-multilingual-cased

In [18]:
# Function to encode the text into a sequence of integers for input to the BERT model
def encode(texts, tokenizer, chunk_size = 256, max_len = 512):

    # Enable truncation in the tokenizer to a specified max length
    tokenizer.enable_truncation(max_length = max_len)

    # Enable padding in th tokenizer to a specified max length
    tokenizer.enable_padding(length = max_len)

    # Initialize a list to stor the encoded IDs
    all_ids = []

    # Iterate over texts in chunk of size 'chunk_size'
    for i in tqdm(range(0, len(texts), chunk_size)):

        # Create a chunk of text
        text_chunk = texts[i:i+chunk_size].tolist()

        # Encode the chunk of text in batch
        encs = tokenizer.encode_batch(text_chunk)

        # Extend the list 'all_ids' with the encoded IDs
        all_ids.extend([enc.ids for enc in encs])

    # Return the IDs list as an array numpy
    return np.array(all_ids)


## Olhar aqui

In [19]:
# Load the tokenizer from the pretrained model 
tokenizer_bert = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# Save the tokenizer and the vocabulary locally
tokenizer_bert.save_pretrained('.')

# Load a faster tokenizer using the vocabulary of main tokenizer 
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase = False)

# Show the tokenizer
fast_tokenizer

Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=False, wordpieces_prefix=##)

In [23]:
# Divide os dados em treino e validação com amostragem estratificada
X_train, X_valid, y_train, y_valid = train_test_split(df_train['transformed_text'].values,
                                                        df_train['feeling'].values,
                                                        test_size = 0.2,
                                                        random_state = 42,
                                                        stratify = df_train['feeling'])

Stratified sampling is a technique used in statistics to ensure that subgroups (or strata) of a population are adequately represented within a sample. It is particularly useful in situations where the population is heterogeneous and the subgroups have different characteristics that are important to the research.

In [52]:
# Max lenght used in the text
max_length = 100

# Applying the encode in our data, using the faster tokenizer
X_final_train = encode(X_train, fast_tokenizer, max_len = max_length)
X_final_valid = encode(X_valid, fast_tokenizer, max_len = max_length)
X_final_test = encode(df_test['transformed_text'].to_numpy(), fast_tokenizer, max_len = max_length)

X_final_train.shape

100%|██████████| 50/50 [00:00<00:00, 591.66it/s]
100%|██████████| 13/13 [00:00<00:00, 619.18it/s]
100%|██████████| 8/8 [00:00<00:00, 666.62it/s]


(12800, 100)

In [54]:
# Define the encoder of output data
le = LabelEncoder()

# Applying the label encoder (fit_transform only on train data)
y_train_le = le.fit_transform(y_train)
y_valid_le = le.transform(y_valid)
y_test_le = le.transform(df_test['feeling'])

# Convert the output variable to categorical
y_train_encoded = to_categorical(y_train_le)
y_valid_encoded = to_categorical(y_valid_le)
y_test_encoded = to_categorical(y_test_le)


```python
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertModel, DistilBertTokenizer
import torch.nn as nn
import torch.optim as optim
```

In [55]:
# Batch size
BATCH_SIZE = 16

# Prepare the dataset in the expcted format of Pytorch
train_dataset = TensorDataset(torch.tensor(X_final_train), torch.tensor(y_train_encoded))
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

valid_dataset = TensorDataset(torch.tensor(X_final_valid), torch.tensor(y_valid_encoded))
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(torch.tensor(X_final_test), torch.tensor(y_test_encoded))
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Creates an instance of the pre-trained, multilingual DistilBERT model suitable for use with PyTorch
transformer_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

In [68]:
class Model(nn.Module):
    def __init__(self, transformer, num_labels = 6):
        super(Model, self).__init__()
        self.transformer = transformer
        self.classifier = nn.Linear(transformer.config.hidden_size, num_labels)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids):
        # Getting the output of sequence of transformer
        outputs = self.transformer(input_ids)
        sequence_output = outputs.last_hidden_state
        
        # Selecting the fist token of each sequence (token CLS from BERT) to classification
        cls_token = sequence_output[:, 0, :]

        # Adding a dense layer to the output with softmax activation for classification
        logits = self.classifier(cls_token)
        out = self.softmax(logits)

        return out

In [69]:
model = Model(transformer=transformer_model)

for param in list(model.transformer.parameters())[:3]:
    param.requires_grad = False 

# Defines the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# Model summary
print(model)

Model(
  (transformer): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [94]:
num_epochs = 3

train_losses = []
val_losses = []

# Train loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    # 
    all_preds = []
    all_labels = []

    # Progress_bar
    train_loader_tqdm = tqdm(train_loader, desc=f'Training - Epoch {epoch+1}/{num_epochs}')

    for batch in train_loader_tqdm:
        input_ids, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Converting predictions to classes (index)
        preds = torch.argmax(outputs, dim=1)
        train_labels = torch.argmax(labels, dim=1)

        # Store predictions and labels
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(train_labels.cpu().numpy())

        # Update progress bar with current average loss
        train_loader_tqdm.set_postfix({'Loss (batch)': loss.item()})

    # Average loss in training
    train_loss = running_loss / len(train_loader)
    # train_losses.append(train_loss)

    # Calculate training metrics
    train_accuracy = accuracy_score(y_train_le, all_preds)
    train_precision = precision_score(y_train_le, all_preds, average='weighted')
    train_recall = recall_score(y_train_le, all_preds, average='weighted')
    train_f1 = f1_score(y_train_le, all_preds, average='weighted')

    # Validation loop
    model.eval()
    val_running_loss = 0.0
    all_val_preds = []
    all_val_labels = []

    with torch.no_grad():
        for batch in valid_loader:
            input_ids, labels = batch
            outputs = model(input_ids)
            val_loss = criterion(outputs, labels)
            val_running_loss += val_loss.item()

            # Converting predictions to classes (index)
            val_labels = torch.argmax(labels, dim=1)
            val_preds = torch.argmax(outputs, dim=1)

            # Store predictions and labels
            all_val_preds.extend(val_preds.cpu().numpy())
            all_val_labels.extend(val_labels.cpu().numpy())
    # 
    val_loss_avg = val_running_loss / len(valid_loader)
    # val_losses.append(val_loss_avg)

    # Calculate validation metrics  
    val_accuracy = accuracy_score(all_val_labels, all_val_preds)
    val_precision = precision_score(all_val_labels, all_val_preds, average='weighted')
    val_recall = recall_score(all_val_labels, all_val_preds, average='weighted')
    val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted')

    # Print metrics after each epoch
    print(f"\nEpoch [{epoch+1}/{num_epochs}]")
    print(f"Loss of Training: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss_avg:.4f}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation Precision: {val_precision:.4f}")
    print(f"Validation Recall: {val_recall:.4f}")
    print(f"Validation F1-Score: {val_f1:.4f}\n")



Training - Epoch 1/3: 100%|██████████| 800/800 [09:15<00:00,  1.44it/s, Loss (batch)=1.29]



Epoch [1/3]
Loss of Training: 1.3304
Validation Loss: 1.2456
Training Accuracy: 0.2677
Validation Accuracy: 0.7963
Validation Precision: 0.7953
Validation Recall: 0.7963
Validation F1-Score: 0.7684



Training - Epoch 2/3: 100%|██████████| 800/800 [10:47<00:00,  1.24it/s, Loss (batch)=1.06]



Epoch [2/3]
Loss of Training: 1.2211
Validation Loss: 1.1936
Training Accuracy: 0.2619
Validation Accuracy: 0.8503
Validation Precision: 0.8501
Validation Recall: 0.8503
Validation F1-Score: 0.8434



Training - Epoch 3/3: 100%|██████████| 800/800 [10:31<00:00,  1.27it/s, Loss (batch)=1.24]



Epoch [3/3]
Loss of Training: 1.1792
Validation Loss: 1.1805
Training Accuracy: 0.2460
Validation Accuracy: 0.8619
Validation Precision: 0.8662
Validation Recall: 0.8619
Validation F1-Score: 0.8561



In [None]:
plt.plot(train_losses, label='Erro em Treino')
plt.plot(val_losses, label='Erro em Validação')
plt.legend()
plt.show()

In [98]:
# Model evaluation
model.eval()

# Converting X_final_test to a PyTorch tensor
X_test_final_tensor = torch.tensor(X_final_test)

# Predictions
with torch.no_grad():
    predictions = model(X_test_final_tensor)

# Predicted labels (choosing the class index with highest probability)
predicted_labels = torch.argmax(predictions, dim=1).numpy()


In [101]:
print(classification_report(y_test_le, predicted_labels))

print(confusion_matrix(y_test_le, predicted_labels))

print(accuracy_score(y_test_le, predicted_labels))

              precision    recall  f1-score   support

           0       0.88      0.84      0.86       275
           1       0.84      0.87      0.85       224
           2       0.86      0.95      0.90       695
           3       0.91      0.53      0.67       159
           4       0.88      0.92      0.90       581
           5       0.86      0.55      0.67        66

    accuracy                           0.87      2000
   macro avg       0.87      0.78      0.81      2000
weighted avg       0.87      0.87      0.86      2000

[[231   5  10   3  25   1]
 [  9 195   3   1  16   0]
 [  4   7 659   2  18   5]
 [  3   1  61  85   9   0]
 [ 15   8  24   2 532   0]
 [  1  17  10   0   2  36]]
0.869


## Saving the model

### Saving only weights

In [102]:
##Saving
# Path where the model will be saved
PATH = "models/model_v3.pth"

# Saving only model weights
torch.save(model.state_dict(), PATH)


## Loading
# Initializing the model (architecture must be the same as the saved model)
model = Model(transformer_model)

# Loaded the saved weights 
model.load_state_dict(torch.load(PATH))

# Put the model in evaluation mode if it is for inference
model.eval()

Model(
  (transformer): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

### Saving the complete model

In [103]:
torch.save(model, "models/model_v3_complete.pth")

# Loading the complete model
model = torch.load("models/model_v3_complete.pth")

# Putting the model in evaluation mode if it is for inference
model.eval()

Model(
  (transformer): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

## Prediction for new data

In [105]:
# New sentence
sentence = "I'm happy today"

# Create a dataframe with the sentence
df_new = pd.DataFrame({'text': [sentence]})

# Applying the preprocessing function
df_new['transformed_text'] = df_new['text'].apply(data_preprocessing)

new_data = encode(df_new['transformed_text'], fast_tokenizer, max_len = max_length)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 66.67it/s]


In [106]:
# Converting new_data to a PyTorch tensor if it isn't already
new_data_tensor = torch.tensor(new_data)

# Prediction
with torch.no_grad():
    prediction = model(new_data_tensor)

# Predicted labels (choosing the class index with highest probability)
predicted_label = torch.argmax(prediction, dim=1).numpy()

# Get the class name
class_name = le.inverse_transform(predicted_label)
print(class_name)

['joy']


### The end