# Transfer Learning for Sentiment Analysis using Transformers with DistilBERT Base Multilingual Cased

In [1]:
import math
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import DistilBertModel, DistilBertTokenizer

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report which compute device PyTorch can see before any heavy work starts.
cuda_ready = torch.cuda.is_available()
device_message = (
    f"CUDA is available! GPU name: {torch.cuda.get_device_name(0)}"
    if cuda_ready
    else "CUDA is not available. Running on CPU."
)
print(device_message)

CUDA is available! GPU name: NVIDIA GeForce RTX 4060 Ti


## Load data

In [3]:
# Both splits are read as headerless, semicolon-separated files; the two
# positional columns are given readable names right after loading.
column_names = {0: 'text', 1: 'feeling'}

df_train = pd.read_csv('data/train_data.txt', header=None, delimiter=';').rename(columns=column_names)
df_test = pd.read_csv('data/test_data.txt', header=None, delimiter=';').rename(columns=column_names)

# Shapes first, then a short preview of each split
for frame in (df_train, df_test):
    print(frame.shape)

for frame in (df_train, df_test):
    display(frame.head(3))

(16000, 2)
(2000, 2)


Unnamed: 0,text,feeling
0,i am feeling completely overwhelmed i have two strategies that help me to feel grounded pour my heart out in my journal in the form of a letter to god and then end with a list of five things i am most grateful for,fear
1,i have the feeling she was amused and delighted,joy
2,i was able to help chai lifeline with your support and encouragement is a great feeling and i am so glad you were able to help me,joy


Unnamed: 0,text,feeling
0,i feel like my only role now would be to tear your sails with my pessimism and discontent,sadness
1,i feel just bcoz a fight we get mad to each other n u wanna make a publicity n let the world knows about our fight,anger
2,i feel like reds and purples are just so rich and kind of perfect,joy


In [4]:
# Number of training rows per emotion label (shows the class imbalance)
df_train['feeling'].value_counts()

feeling
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [5]:
# Number of test rows per emotion label
df_test['feeling'].value_counts()

feeling
joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64

> The column **text** will be the input feature and **feeling** will be the output target.

In [6]:
!pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [8]:
# Fetch the NLTK resources requested by this notebook
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Shared objects used by the NLTK-based cleaning function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krupc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\krupc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krupc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
def custom_nlp(text):
    """Clean a sentence with the NLTK-based pipeline.

    The text is lowercased and split into word tokens with a regular
    expression. Tokens that are English stop words or that are not purely
    alphanumeric are dropped; the rest are lemmatized with the WordNet
    lemmatizer.

    Returns the remaining lemmas joined by single spaces.
    """
    kept_lemmas = []
    for word in re.findall(r'\b\w+\b', text.lower()):
        if word in stop_words:
            continue
        # `\w` also matches underscores, which isalnum() rejects
        if not word.isalnum():
            continue
        kept_lemmas.append(lemmatizer.lemmatize(word.strip()))
    return ' '.join(kept_lemmas)

In [19]:
# Add the NLTK-cleaned version of the text to both splits
for frame in (df_train, df_test):
    frame['transformed_text_nltk'] = frame['text'].apply(custom_nlp)

In [20]:
# Inspect the training frame, now including the NLTK-processed column
df_train

Unnamed: 0,text,feeling,transformed_text_nltk,transformed_text
0,i am feeling completely overwhelmed i have two strategies that help me to feel grounded pour my heart out in my journal in the form of a letter to god and then end with a list of five things i am most grateful for,fear,feeling completely overwhelmed two strategy help feel grounded pour heart journal form letter god end list five thing grateful,feel completely overwhelmed strategy help feel grounded pour heart journal form letter god end list thing grateful
1,i have the feeling she was amused and delighted,joy,feeling amused delighted,feeling amuse delight
2,i was able to help chai lifeline with your support and encouragement is a great feeling and i am so glad you were able to help me,joy,able help chai lifeline support encouragement great feeling glad able help,able help chai lifeline support encouragement great feeling glad able help
3,i already feel like i fucked up though because i dont usually eat at all in the morning,anger,already feel like fucked though dont usually eat morning,feel like fuck not usually eat morning
4,i still love my so and wish the best for him i can no longer tolerate the effect that bm has on our lives and the fact that is has turned my so into a bitter angry person who is not always particularly kind to the people around him when he is feeling stressed,sadness,still love wish best longer tolerate effect bm life fact turned bitter angry person always particularly kind people around feeling stressed,love wish good long tolerate effect bm life fact turn bitter angry person particularly kind people feel stress
...,...,...,...,...
15995,i just had a very brief time in the beanbag and i said to anna that i feel like i have been beaten up,sadness,brief time beanbag said anna feel like beaten,brief time beanbag say anna feel like beat
15996,i am now turning and i feel pathetic that i am still waiting tables and subbing with a teaching degree,sadness,turning feel pathetic still waiting table subbing teaching degree,turn feel pathetic wait table sub teaching degree
15997,i feel strong and good overall,joy,feel strong good overall,feel strong good overall
15998,i feel like this was such a rude comment and im glad that t,anger,feel like rude comment im glad,feel like rude comment m glad t


## Data preprocessing using Spacy

**Spacy** is an open-source library for advanced Natural Language Processing (NLP) in Python, designed specifically for production use with a focus on performance and ease of use. It provides fast and efficient processing, making it suitable for large datasets and real-time applications. The library includes pre-trained models for various languages, offering capabilities such as tokenization, part-of-speech tagging, named entity recognition, dependency parsing, lemmatization, and text classification. 

The user-friendly API allows you to implement complex NLP tasks with minimal code, and it integrates seamlessly with other libraries like TensorFlow and PyTorch. Additionally, spaCy supports the creation of custom pipelines and model training, ensuring flexibility for specific project needs. Its active community, comprehensive documentation, and ongoing development make it a reliable choice for anyone looking to incorporate robust NLP functionalities into their projects. Overall, spaCy is ideal for applications that require efficient text processing and analysis, making it a popular choice among data scientists and developers. [Official site](https://spacy.io/).

In [None]:
# Download the spaCy `en_core_web_md` pipeline that is loaded in the next cell
!python -m spacy download en_core_web_md -q

In [14]:
# Load the medium English spaCy pipeline. The dependency parser and the
# named-entity recognizer are disabled: the preprocessing function only reads
# lemmas and stop-word flags, so running them over every text is wasted time.
spacy_nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

In [15]:
def data_preprocessing(text):
    """Clean a sentence with the spaCy pipeline.

    The text is run through `spacy_nlp`; every token flagged as a stop word
    is skipped, and the lemma of each remaining token is lowercased and
    stripped of surrounding whitespace.

    Returns the resulting lemmas joined by single spaces.
    """
    lemmas = []
    for token in spacy_nlp(text):
        if token.is_stop:
            continue
        lemmas.append(token.lemma_.lower().strip())
    return ' '.join(lemmas)

In [16]:
# Add the spaCy-cleaned version of the text to both splits
for frame in (df_train, df_test):
    frame['transformed_text'] = frame['text'].apply(data_preprocessing)

In [None]:
# Preview the first rows of both splits after preprocessing
display(df_train.head(), df_test.head())

## HuggingFace Platform

Hugging Face is a prominent AI research organization and platform that focuses on natural language processing (NLP) and machine learning. It is best known for its user-friendly libraries and tools that facilitate the development, training, and deployment of state-of-the-art machine learning models, particularly those based on transformer architectures. Here are some key aspects of the Hugging Face platform:

**Key Features**

1. **Transformers Library**:
   - The `Transformers` library provides access to a wide variety of pre-trained models for NLP tasks, such as text classification, translation, summarization, and named entity recognition. The library supports models like BERT, GPT-2, RoBERTa, T5, and many more.

2. **Datasets Library**:
   - Hugging Face offers the `Datasets` library, which provides easy access to a large collection of datasets for machine learning tasks. This library simplifies loading, processing, and sharing datasets across various projects.

3. **Tokenizers Library**:
   - The `Tokenizers` library is designed for efficient tokenization, providing tools to preprocess text data for various NLP tasks. It supports multiple tokenization algorithms, including Byte Pair Encoding (BPE) and WordPiece.

4. **Model Hub**:
   - Hugging Face hosts a Model Hub where users can find, share, and use thousands of pre-trained models contributed by the community and organizations. Users can easily download and integrate these models into their applications.

5. **Training and Fine-tuning**:
   - The platform provides straightforward APIs for training and fine-tuning models on custom datasets, making it accessible even for those who may not have extensive experience in machine learning.

6. **Infrastructural Support**:
   - Hugging Face offers tools like `Trainer` and `Pipeline` that simplify the process of model training, evaluation, and inference, reducing the complexity of setting up and managing experiments.

7. **Community and Collaboration**:
   - The Hugging Face community is active and supportive, offering forums, documentation, and tutorials to help users get started with machine learning and NLP.

8. **Integration with Other Frameworks**:
   - Hugging Face’s libraries can easily be integrated with popular deep learning frameworks like TensorFlow and PyTorch, allowing users to leverage the strengths of both ecosystems.

**Why Use Hugging Face?**

- **Accessibility**: The user-friendly APIs and extensive documentation make it easy for both beginners and experienced practitioners to work with advanced NLP models.
- **Cutting-edge Models**: Hugging Face keeps its model library updated with the latest advancements in NLP, allowing users to experiment with state-of-the-art techniques.
- **Community-driven**: The collaborative nature of Hugging Face encourages sharing and learning, making it a rich resource for researchers and developers alike.
- **Rapid Development**: With pre-trained models and streamlined tools, Hugging Face accelerates the development process for NLP applications, reducing time-to-market.

**Conclusion**

Overall, Hugging Face has established itself as a leading platform in the field of NLP and machine learning, providing powerful tools and resources that empower developers and researchers to create innovative AI applications. Whether you are working on a research project, a commercial application, or simply exploring NLP, Hugging Face offers the tools to help you succeed.

### Fine Tuning of Pre-Trained Transformer Model using DistilBERT Base Multilingual Cased

We will use the data processed with SpaCy and then perform the specific processing for the BERT model, just as we did with the model in version 1.

DistilBERT Base Multilingual Cased is a transformer-based language model developed by Hugging Face that is designed for multilingual natural language processing tasks. It is a distilled version of the BERT (Bidirectional Encoder Representations from Transformers) model, optimized to be smaller, faster, and more efficient while retaining a significant portion of BERT's performance. 

This model is trained on multiple languages, allowing it to handle over 100 languages, including English, Spanish, French, and German. The "cased" designation means that the model distinguishes between uppercase and lowercase letters, which is important for languages where case sensitivity matters.

DistilBERT Base Multilingual Cased can be used for various NLP tasks such as text classification, named entity recognition, question answering, and sentiment analysis. According to the original DistilBERT paper, distillation yields a model that is approximately 40% smaller and 60% faster than BERT while retaining about 97% of its performance on language understanding tasks, making it an attractive option for developers seeking efficient models without sacrificing much accuracy. The model can be easily integrated into applications using the Hugging Face Transformers library, providing a versatile tool for multilingual text processing.

https://huggingface.co/distilbert-base-multilingual-cased

In [None]:
def encode(texts, tokenizer, chunk_size = 256, max_len = 512):
    """Turn texts into fixed-length token-id rows for the BERT model.

    The tokenizer is configured (in place) to truncate and to pad every
    sequence to `max_len`, then the texts are encoded `chunk_size` items at a
    time while a progress bar is shown.

    Args:
        texts: sliceable container whose slices provide `.tolist()`
            (the notebook passes NumPy arrays and a pandas Series).
        tokenizer: object exposing `enable_truncation`, `enable_padding`
            and `encode_batch`.
        chunk_size: number of texts sent to `encode_batch` per call.
        max_len: length every encoded sequence is truncated/padded to.

    Returns:
        A NumPy array built from the per-text lists of token ids.
    """
    # Truncation and padding are both set to the same target length
    tokenizer.enable_truncation(max_length = max_len)
    tokenizer.enable_padding(length = max_len)

    encoded_rows = []
    chunk_starts = range(0, len(texts), chunk_size)

    for start in tqdm(chunk_starts):
        batch_texts = texts[start:start + chunk_size].tolist()
        for encoding in tokenizer.encode_batch(batch_texts):
            encoded_rows.append(encoding.ids)

    return np.array(encoded_rows)


In [None]:
# Fetch the tokenizer that ships with the pretrained checkpoint
pretrained_name = 'distilbert-base-multilingual-cased'
tokenizer_bert = DistilBertTokenizer.from_pretrained(pretrained_name)

# Write the tokenizer files into the current working directory
tokenizer_bert.save_pretrained('.')

# Build the faster WordPiece tokenizer from the saved vocabulary, keeping case
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase = False)

# Display the tokenizer
fast_tokenizer

In [None]:
# Hold out 20% of the training rows for validation, stratified by label
features = df_train['transformed_text'].values
targets = df_train['feeling'].values

X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    targets,
    test_size = 0.2,
    random_state = 42,
    stratify = df_train['feeling'],
)

Stratified sampling is a technique used in statistics to ensure that subgroups (or strata) of a population are adequately represented within a sample. It is particularly useful in situations where the population is heterogeneous and the subgroups have different characteristics that are important to the research.

In [None]:
# Maximum number of tokens kept per text
max_length = 100

# Encode the three splits with the fast tokenizer (train, validation, test)
X_final_train, X_final_valid, X_final_test = (
    encode(split, fast_tokenizer, max_len = max_length)
    for split in (X_train, X_valid, df_test['transformed_text'].to_numpy())
)

X_final_train.shape

In [None]:
def to_categorical(y, num_classes=None, dtype='float32'):
    """One-hot encode a sequence of integer class labels.

    Args:
        y: array-like of integer class indices.
        num_classes: width of the output; when falsy it is inferred as the
            largest label plus one.
        dtype: dtype of the returned matrix.

    Returns:
        Array of shape (len(y), num_classes) with a single 1 per row, placed
        in the column given by that row's label.
    """
    labels = np.array(y, dtype='int')
    n_samples = labels.shape[0]

    # Fall back to the largest label seen when no width is given
    width = num_classes if num_classes else np.max(labels) + 1

    one_hot = np.zeros((n_samples, width), dtype=dtype)
    one_hot[np.arange(n_samples), labels] = 1
    return one_hot


In [None]:
# Label encoder for the target; it is fitted on the training labels only
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)

# Validation and test labels reuse the mapping learned from the training data
y_valid_le, y_test_le = (le.transform(labels) for labels in (y_valid, df_test['feeling']))

# One-hot encode the integer labels of every split
y_train_encoded, y_valid_encoded, y_test_encoded = map(
    to_categorical, (y_train_le, y_valid_le, y_test_le)
)


---

## Data preprocessing for Pytorch

This code prepares datasets for training, validation, and testing using PyTorch’s `DataLoader` and a pre-trained multilingual DistilBERT model. Here's a breakdown of each part:

1. **Batch Size Setting**
   The batch size for model training and evaluation is set to 16, meaning the model will process data in groups of 16 samples at a time.

2. **Preparing the Training Dataset and DataLoader**
   - `TensorDataset` is used to create a PyTorch dataset from the training features (`X_final_train`) and training labels (`y_train_encoded`). Both `X_final_train` and `y_train_encoded` are converted to tensors for compatibility with PyTorch.
   - `DataLoader` wraps the `train_dataset`, allowing it to be loaded in batches. The `shuffle=True` argument shuffles the data at the start of each epoch, which helps improve training by reducing overfitting.

3. **Preparing the Validation Dataset and DataLoader**
   - This section creates a validation dataset and loader in the same way as the training data, but without shuffling (`shuffle=False`). The `valid_loader` will be used to evaluate model performance after each training epoch.

4. **Preparing the Test Dataset and DataLoader**
   - This code prepares the test dataset and loader from the encoded test features and labels. Evaluation metrics do not depend on sample order, so shuffling the test set is unnecessary. The `test_loader` will allow for evaluation on the test set after training completes.

5. **Loading the Pre-Trained DistilBERT Model**
   - Here, an instance of the `DistilBertModel` is created using the pre-trained multilingual version (`distilbert-base-multilingual-cased`) from Hugging Face. This model will be further fine-tuned on the specific task defined by the prepared datasets.

This code structure allows efficient training, validation, and testing of a model, while the pre-trained DistilBERT model provides a strong starting point for handling multilingual data.

In [None]:
# Number of samples per batch for training and evaluation
BATCH_SIZE = 16

# Wrap the encoded token ids and the one-hot labels as PyTorch datasets.
# Only the training loader shuffles: a new sample order each epoch helps
# optimisation, whereas evaluation results do not depend on order.
train_dataset = TensorDataset(torch.tensor(X_final_train), torch.tensor(y_train_encoded))
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

valid_dataset = TensorDataset(torch.tensor(X_final_valid), torch.tensor(y_valid_encoded))
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

# Keep the test set in its original order so predictions collected from this
# loader stay aligned with `y_test_le`.
test_dataset = TensorDataset(torch.tensor(X_final_test), torch.tensor(y_test_encoded))
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Creates an instance of the pre-trained, multilingual DistilBERT model suitable for use with PyTorch
transformer_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

### Creating a Custom Pytorch model

This code defines a custom PyTorch model class `Model` that uses a pre-trained transformer model (like DistilBERT) as its base and adds a classification layer to output predictions. Here's a breakdown of each part:

1. **Class Definition and Initialization**
   - The `Model` class inherits from `nn.Module`, the base class for all neural network modules in PyTorch.
   - The `__init__` method initializes the model, accepting a pre-trained transformer (like DistilBERT) and setting `num_labels` as the number of output classes (in this case, 6).
   - The `transformer` layer stores the pre-trained transformer model for feature extraction.
   - The `classifier` layer is a fully connected (dense) layer that takes the transformer model's output size (`hidden_size`) and maps it to the number of classes specified by `num_labels`.
   - The `softmax` layer applies the softmax function along dimension 1 (the class dimension), converting the raw scores (logits) from the classifier into probabilities.

2. **Forward Pass**
   - The `forward` method defines the forward pass, taking `input_ids` as input (these are tokenized text sequences).
   - `self.transformer(input_ids)` passes the input sequences through the transformer, producing the `last_hidden_state` for each token in each sequence.
   - The code then selects the first token embedding from `sequence_output` (often representing the `[CLS]` token in BERT-based models), which typically contains the aggregate information for the sequence.

3. **Classification and Output**
   - `self.classifier(cls_token)` passes the `[CLS]` token embedding through the classifier layer, producing raw scores (logits) for each class.
   - `self.softmax(logits)` converts these logits into probabilities across the classes.
   - The final output `out` contains probabilities for each class, allowing the model to make a classification prediction.

In summary, this code defines a model that uses a transformer to process input text and applies a dense layer with a softmax activation to classify the text into one of the six specified classes.

In [None]:
class Model(nn.Module):
    """Pre-trained transformer encoder followed by a linear classification head.

    `forward` returns raw, unnormalised class scores (logits). The notebook
    trains with `nn.CrossEntropyLoss`, which applies log-softmax itself, so
    applying softmax inside the model would normalise twice and flatten the
    gradients. `argmax` over logits selects the same class as `argmax` over
    probabilities; call `predict_proba` when probabilities are needed.

    Args:
        transformer: encoder exposing `config.hidden_size` and returning an
            object with a `last_hidden_state` attribute.
        num_labels: number of output classes.
    """

    def __init__(self, transformer, num_labels = 6):
        super(Model, self).__init__()
        self.transformer = transformer
        self.classifier = nn.Linear(transformer.config.hidden_size, num_labels)
        # Used only by predict_proba; it has no parameters, so the state_dict
        # layout is unaffected.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape (batch, num_labels) for the given token ids.

        Args:
            input_ids: integer tensor of token ids, one row per sequence.
            attention_mask: optional mask with 1 for real tokens and 0 for
                padding. When omitted it is derived from the transformer's
                configured `pad_token_id` (if there is one) so that padding
                positions are not attended to.
        """
        if attention_mask is None:
            pad_id = getattr(self.transformer.config, 'pad_token_id', None)
            if pad_id is not None:
                attention_mask = (input_ids != pad_id).long()

        # Getting the output of sequence of transformer
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        # Selecting the first token of each sequence (token CLS from BERT) for classification
        cls_token = sequence_output[:, 0, :]

        # Dense layer producing one raw score per class
        return self.classifier(cls_token)

    def predict_proba(self, input_ids, attention_mask=None):
        """Return class probabilities (softmax over the logits of `forward`)."""
        return self.softmax(self.forward(input_ids, attention_mask))

### Defining Hyperparameters

This code initializes the custom `Model` class with a pre-trained transformer, selectively freezes some of the transformer's parameters, sets up the optimizer and loss function, and prints a summary of the model structure. Here’s a breakdown:

1. **Model Initialization**
   - An instance of the `Model` class is created, with `transformer_model` (e.g., DistilBERT) passed as the `transformer` parameter.
   - This wraps the pre-trained transformer in the custom model, which includes additional layers for classification.

2. **Freezing Transformer Parameters**
   - The loop iterates over the first three parameters of the `transformer` within the `model`, setting `requires_grad` to `False`.
   - Setting `requires_grad = False` prevents these parameters from being updated during training, effectively "freezing" them. This can be useful when using transfer learning to retain some of the pre-trained transformer weights while reducing the computational load.

3. **Defining the Optimizer and Loss Function**
   - The `optimizer` is set to Adam, a commonly used optimization algorithm for training neural networks. The learning rate (`lr`) is set to \(1 \times 10^{-5}\), which is often appropriate for fine-tuning a pre-trained transformer.
   - The `criterion` or loss function is set to `CrossEntropyLoss`, which is commonly used for multi-class classification tasks. It calculates the difference between the predicted and actual class labels and helps guide the model to improve predictions during training. Note that `CrossEntropyLoss` applies log-softmax internally and therefore expects raw, unnormalised scores (logits) as its input.

4. **Printing Model Summary**
   - This line prints a summary of the model architecture, including the layers within the `Model` class and the pre-trained transformer. This summary provides insights into the model’s structure, parameters, and number of layers, which can help in debugging and verifying the model setup.

In summary, this code sets up the model for transfer learning by partially freezing the transformer parameters, defining the optimizer and loss function for training, and printing a summary to verify the model’s configuration.

In [None]:
# Wrap the pre-trained transformer with the classification head
model = Model(transformer=transformer_model)

# Freeze the first three parameter tensors of the transformer
frozen_params = list(model.transformer.parameters())[:3]
for frozen in frozen_params:
    frozen.requires_grad_(False)

# Loss function and optimizer (Adam over all model parameters, lr = 1e-5)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Model summary
print(model)

### Training the model

This code trains a PyTorch model over multiple epochs using GPU acceleration (if available) and evaluates it on a validation dataset after each epoch. It tracks training and validation losses, along with metrics like accuracy and precision. Here’s a step-by-step explanation:

1. **Epoch Setup and Device Configuration**
   - The number of epochs is set to 10.
   - `train_losses` and `val_losses` lists store the average loss for each epoch in training and validation.
   - The `device` is set to GPU (`cuda`) if available; otherwise, it defaults to CPU.
   - The model is moved to the chosen device.

2. **Training Loop**
   - The loop iterates over each epoch, setting the model to training mode with `model.train()`.
   - `running_loss` keeps track of the cumulative loss within each epoch.
   - `train_loader_tqdm` uses `tqdm` to display a progress bar for each epoch.

3. **Batch Processing and Backpropagation**
   - Each batch of input data (`input_ids` and `labels`) is moved to the appropriate device.
   - `optimizer.zero_grad()` resets gradients to prevent accumulation.
   - The model computes predictions (`outputs`), and the loss is calculated using `criterion`.
   - `loss.backward()` performs backpropagation to compute gradients, followed by `optimizer.step()` to update the model weights.
   - `running_loss` accumulates the batch loss to calculate the epoch’s average loss.

4. **Prediction and Tracking Loss**
   - `preds` represents the predicted class indices, obtained by taking the `argmax` of `outputs` along the class dimension.
   - Predictions are stored on the CPU for further analysis.
   - The progress bar updates with the current batch loss.

5. **Epoch-Level Training Loss Calculation**
   - The average loss for the epoch is calculated by dividing `running_loss` by the number of batches (`len(train_loader)`).
   - `train_loss` is stored for tracking training progress.

6. **Validation Loop**
   - The model is set to evaluation mode (`model.eval()`), and gradient computation is disabled (`torch.no_grad()`).
   - Similar to training, each batch is passed through the model to calculate predictions and the validation loss.
   - `val_running_loss` accumulates the loss across validation batches.

7. **Validation Predictions and Metrics Calculation**
   - `val_labels` and `val_preds` are obtained by taking the `argmax` of `labels` and `outputs`.
   - Predictions and actual labels are stored for accuracy and precision calculation.

8. **Calculating and Printing Metrics**
   - `val_loss_avg` calculates the average validation loss.
   - `val_accuracy` and `val_precision` compute accuracy and weighted precision on the validation set using the stored predictions and labels.
   - Each epoch’s metrics are printed to track the model’s training and validation progress. 

This code provides an efficient way to train and evaluate the model, enabling tracking of key performance metrics across multiple epochs.

In [None]:
num_epochs = 10

# Average loss per epoch; plotted in the evaluation section
train_losses = []
val_losses = []

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device
model.to(device)


# Train loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    # Predicted / true class indices of this epoch, used for training accuracy
    all_preds = []
    all_labels = []

    # Progress bar
    train_loader_tqdm = tqdm(train_loader, desc=f'Training - Epoch {epoch+1}/{num_epochs}')

    for input_ids, labels in train_loader_tqdm:
        # Move the batch to the selected device
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        # `labels` are one-hot rows, which CrossEntropyLoss accepts as class probabilities
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Convert model outputs and one-hot labels to class indices
        preds = torch.argmax(outputs, dim=1)
        true_classes = torch.argmax(labels, dim=1)

        # Store predictions and labels
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(true_classes.cpu().numpy())

        # Show the loss of the current batch in the progress bar
        train_loader_tqdm.set_postfix({'Loss (batch)': loss.item()})

    # Average loss and accuracy over the training batches
    train_loss = running_loss / len(train_loader)
    train_losses.append(train_loss)
    train_accuracy = accuracy_score(all_labels, all_preds)

    # Validation loop
    model.eval()
    val_running_loss = 0.0
    all_val_preds = []
    all_val_labels = []

    with torch.no_grad():
        for input_ids, labels in valid_loader:
            # Move the batch to the selected device
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            outputs = model(input_ids)
            val_loss = criterion(outputs, labels)
            val_running_loss += val_loss.item()

            # Convert one-hot labels and model outputs to class indices
            val_labels = torch.argmax(labels, dim=1)
            val_preds = torch.argmax(outputs, dim=1)

            # Store predictions and labels
            all_val_preds.extend(val_preds.cpu().numpy())
            all_val_labels.extend(val_labels.cpu().numpy())

    # Average loss over the validation batches
    val_loss_avg = val_running_loss / len(valid_loader)
    val_losses.append(val_loss_avg)

    # Calculate validation metrics
    val_accuracy = accuracy_score(all_val_labels, all_val_preds)
    val_precision = precision_score(all_val_labels, all_val_preds, average='weighted', zero_division=0)

    # Print metrics after each epoch
    print(f"\nEpoch [{epoch+1}/{num_epochs}]")
    print(f"Loss of Training: {train_loss:.4f}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Loss of Validation: {val_loss_avg:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation Precision: {val_precision:.4f}")



## Model Evaluation

In [None]:
# Learning curves: average loss per epoch on the training and validation sets
epochs_axis = range(1, len(train_losses) + 1)

fig, ax = plt.subplots()
ax.plot(epochs_axis, train_losses, label='Training Error')
ax.plot(epochs_axis, val_losses, label='Validation Error')
ax.set_xlabel('Epoch')
ax.set_ylabel('Average loss')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()

In [None]:
# Model evaluation
model.eval()

# Converting X_final_test to a PyTorch tensor (kept on the CPU; batches are
# moved to the device one at a time)
X_test_final_tensor = torch.tensor(X_final_test)

# Predict in batches of BATCH_SIZE rather than pushing the whole test set
# through the model in a single forward pass, which can exhaust GPU memory.
batch_outputs = []
with torch.no_grad():
    for input_batch in torch.split(X_test_final_tensor, BATCH_SIZE):
        batch_outputs.append(model(input_batch.to(device)).cpu())

# Model outputs for every test row, in the original row order
predictions = torch.cat(batch_outputs)

# Predicted labels (choosing the class index with the highest score)
predicted_labels = torch.argmax(predictions, dim=1).numpy()


In [None]:
# Report per-class metrics with the emotion names instead of bare integer ids.
# `le.classes_[i]` is the name that the label encoder mapped to class index i.
class_ids = np.arange(len(le.classes_))

print(classification_report(y_test_le, predicted_labels, labels=class_ids, target_names=le.classes_))

# Rows are true classes and columns are predicted classes, in `class_ids` order
print(confusion_matrix(y_test_le, predicted_labels, labels=class_ids))

print(accuracy_score(y_test_le, predicted_labels))

## Saving the model

### Saving only weights

In [None]:
## Saving
# Path where the model will be saved
PATH = "models/model_v3.pth"

# torch.save does not create missing directories, so make sure the target exists
Path(PATH).parent.mkdir(parents=True, exist_ok=True)

# Saving only model weights
torch.save(model.state_dict(), PATH)


## Loading
# Initializing the model (architecture must be the same as the saved model)
model = Model(transformer_model)

# Load the saved weights; map_location lets a checkpoint written on a GPU be
# restored on whatever device this session uses
model.load_state_dict(torch.load(PATH, map_location=device))

# The freshly created classification layer starts on the CPU; move the whole
# model to one device so later forward passes do not mix devices
model.to(device)

# Put the model in evaluation mode if it is for inference
model.eval()

### Saving the complete model

In [None]:
# Pickle the entire module object (architecture + weights)
torch.save(model, "models/model_v3_complete.pth")

# Loading the complete model.
# A fully pickled module can only be restored with weights_only=False (newer
# PyTorch releases default to True and refuse it). Unpickling can execute
# arbitrary code, so only do this with files you created yourself.
model = torch.load("models/model_v3_complete.pth", map_location=device, weights_only=False)

# Putting the model in evaluation mode if it is for inference
model.eval()

## Prediction for new data

In [None]:
# Sentence to classify
sentence = "I'm happy today"

# Single-row frame holding the raw sentence and its spaCy-cleaned version
df_new = pd.DataFrame({'text': [sentence]}).assign(
    transformed_text=lambda frame: frame['text'].apply(data_preprocessing)
)

# Encode with the same tokenizer and sequence length used for training
new_data = encode(df_new['transformed_text'], fast_tokenizer, max_len = max_length)

In [None]:
# Make sure the model and the encoded sentence live on the same device
model = model.to(device)
new_data_tensor = torch.tensor(new_data).to(device)

# Forward pass without gradient tracking
with torch.no_grad():
    prediction = model(new_data_tensor)

# Index of the highest-scoring class for each input row
predicted_label = prediction.argmax(dim=1).cpu().numpy()

# Map the class index back to its emotion name
class_name = le.inverse_transform(predicted_label)
print(class_name)

In [None]:
# Release unused memory held by PyTorch's CUDA caching allocator back to the GPU
torch.cuda.empty_cache()

### The end