# This notebook uses the examined text-embedding libraries (LaBSE and mUSE) to generate embeddings for a text classification task

In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.3.1


In [2]:
import os
import warnings
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchvision import models
from transformers import (
    get_cosine_schedule_with_warmup,
    AutoTokenizer,
    AutoModel,
    DataCollatorWithPadding
)
from sklearn.metrics import mean_absolute_error,f1_score
from sklearn.model_selection import KFold, ParameterGrid
from datasets import load_dataset
import torch.amp
import torch.optim
warnings.simplefilter(action='ignore', category=FutureWarning)
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text  
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch.amp
import torch.optim
from huggingface_hub.utils import are_progress_bars_disabled, disable_progress_bars, enable_progress_bars
disable_progress_bars()
import matplotlib.pyplot as plt

**Load dataset**

In [3]:
# Load the labelled corpus. Later cells read its "text", "label" and "language" columns.
df = pd.read_csv("lng_final_dataset.csv")

**Let's split the data into train and test sets**

In [4]:
# Hold out 20% of the rows for the final test. The seed is fixed so that the
# same rows are held out on every run and reported scores are reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

**Now we load the embedding models**

In [5]:
# Names of the embedding libraries compared in this notebook.
# NOTE: this rebinds the name `models` that the import cell took from torchvision.
models = ['LaBSE', 'mUSE']

In [6]:
# Multilingual Universal Sentence Encoder (large, v2) loaded through TensorFlow Hub;
# embed_text calls this object when m == 'mUSE'.
mUSE = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/multilingual-large/2")

In [7]:
# LaBSE loaded through sentence-transformers; embed_text calls its encode() when m == 'LaBSE'.
labse_mdl = SentenceTransformer('sentence-transformers/LaBSE')

In [8]:
class CustomDataset(Dataset):
    '''
    Dataset of raw texts and their labels for the classification model.

    Items are returned as plain dictionaries with the keys 'text' and 'label';
    no tokenization or embedding happens here.
    '''
    def __init__(self,  dataset:pd.DataFrame, tokenizer_name:str,text_col:str, label_col:str, sample=None):
        '''
        dataset
        -------
            pandas.DataFrame holding the texts and their labels
        tokenizer_name
        --------------
            string stored on the instance as ``tokenizer_name``; not used by this class itself
        text_col
        --------
            name of the column with the texts
        label_col
        ---------
            name of the column with the labels
        sample
        ------
            optional number of rows to draw with ``DataFrame.sample``; a falsy value keeps all rows
        '''
        frame = dataset.sample(sample) if sample else dataset

        self.texts = frame[text_col].values.tolist()
        self.labels = frame[label_col].values.tolist()
        self.tokenizer_name = tokenizer_name

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {'text': self.texts[idx], 'label': self.labels[idx]}
        
        


In [9]:
def embed_text(m:str,text:list):
    '''
    Computes embeddings for the provided texts with the selected library.

    m
    ----
        string that defines the library; either 'mUSE' or 'LaBSE'
    text
    ----
        list of texts to embed

    Returns
    -------
        torch.Tensor built from the embeddings returned by the selected encoder

    Raises
    ------
        ValueError if ``m`` is not one of the supported library names
    '''
    if m == 'mUSE':
        # mUSE is a TensorFlow model: feed it a tf constant and convert the result through numpy.
        return torch.Tensor(mUSE(tf.constant(text)).numpy())

    if m == 'LaBSE':
        return torch.Tensor(labse_mdl.encode(text, show_progress_bar=False))

    # An unknown name used to fall through to `return res` and fail with an
    # UnboundLocalError; fail with a clear message instead.
    raise ValueError(f"Unsupported embedding library {m!r}; expected 'mUSE' or 'LaBSE'")

In [10]:
class CustomModel(nn.Module):
    '''
    This class handles the classification model.
    The model consists of one linear layer with ReLU and dropout, followed by one output layer.
    '''

    def __init__(self,backbone:str,drop_out:float,inner_dim:int=64,output_dim:int=3):
        '''
        The constructor defines the internal dimensions of the model and the inside layers.

        backbone
        --------
            string that defines the name of the library which declares the in dimension
            (512 for 'mUSE', 768 for anything else)
        drop_out
        -------
            float that defines the dropout rate of dropout layer
        inner_dim
        ---------
            int width of the hidden layer; defaults to 64
        output_dim
        ----------
            int number of output units (classes); defaults to 3
        '''
        super().__init__()
        self.input_dim = 512 if backbone == 'mUSE' else 768
        self.inner_dim = inner_dim
        self.output_dim = output_dim

        #layers
        self.fc1 = nn.Linear(self.input_dim, self.inner_dim)
        # `self.dropout` is the Dropout module; the raw rate is available as `self.dropout.p`.
        self.dropout = nn.Dropout(drop_out)
        self.relu = nn.ReLU()
        self.output = nn.Linear(self.inner_dim, self.output_dim)

    def forward(self, x):
        '''
        Passes the data through the network
        x
        ----
            input data whose last dimension equals ``input_dim``

        Returns
        -------
            output of the final linear layer (no softmax is applied)
        '''
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.output(hidden)

In [11]:
def train_loop(train_dl:DataLoader, model:CustomModel, loss_fn, optimizer, scheduler, device:str, m,log_every=1):
    '''
    Trains the model for one pass over ``train_dl`` and performs backpropagation.

    train_dl
    --------
        DataLoader yielding batches with the keys 'text' and 'label'
    model
    -----
        classification model to train
    loss_fn
    -------
        callable computing the loss from (predictions, labels)
    optimizer, scheduler
    --------------------
        optimizer and learning-rate scheduler; both are stepped once per batch
    device
    ------
        device string, e.g. 'cuda' or 'cpu'
    m
    ----
        name of the embedding library passed to embed_text
    log_every
    ---------
        the loss and learning rate are recorded every ``log_every`` batches

    Returns
    -------
        list of recorded losses, list of recorded learning rates
    '''
    losses = []
    lr_values = []
    # autocast expects the bare device type ('cuda' / 'cpu'), not an indexed string such as 'cuda:0'.
    device_type = torch.device(device).type
    model.train()
    # Loss scaling only applies on CUDA; with enabled=False the scaler calls below
    # are plain pass-throughs and no "CUDA is not available" warning is raised.
    scaler = torch.cuda.amp.GradScaler(enabled=(device_type == 'cuda'))
    for batch_idx,batch in enumerate(train_dl):
        x = batch['text']
        y = batch['label'].to(device)
        # The embedding models are frozen feature extractors, so no gradients are tracked for them.
        with torch.no_grad():
            x = embed_text(m,x).to(device)
        with torch.autocast(device_type=device_type, dtype=torch.float16):
            pred = model(x)
            loss = loss_fn(pred, y)

        # Backpropagation
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        # Step the scheduler only after the optimizer/scaler update for this batch is complete.
        scheduler.step()
        if batch_idx % log_every == 0:
            losses.append(loss.item())
            # get_last_lr() is the supported way to read the current rate; get_lr() is
            # meant for the scheduler's internal use and warns when called from outside.
            lr_values.append(scheduler.get_last_lr()[0])

    return losses, lr_values

In [12]:
def inference_loop(model:CustomModel, dataloader:DataLoader, device:str,m:str, **predict_kwargs):
    '''
    Performs the inference of the model

    model
    -----
        classification model; it is moved to ``device`` and switched to eval mode
    dataloader
    ----------
        DataLoader yielding dict batches with the key 'text' and, optionally, 'label'
    device
    ------
        device string, e.g. 'cuda' or 'cpu'
    m
    ----
        name of the embedding library passed to embed_text
    predict_kwargs
    --------------
        extra keyword arguments forwarded to the model call

    Returns
    -------
        list of predicted class indices, list of labels (empty when batches carry no 'label')
    '''
    model.to(device)
    model.eval()

    predictions = []
    labels = []

    with torch.no_grad():
        for batch in dataloader:
            x = embed_text(m, batch['text']).to(device)

            pred = model(x, **predict_kwargs)
            predictions.extend(torch.argmax(pred, dim=1).cpu().tolist())

            # Labels are only collected, never fed to the model, so they do not need to be
            # moved to the device. Looking the key up with .get() makes the None check
            # effective for unlabelled batches.
            y = batch.get('label')
            if y is not None:
                labels.extend(y.tolist() if torch.is_tensor(y) else list(y))
    return predictions, labels

In [13]:
def validation_loop(val_dl:DataLoader, model:CustomModel, val_metrics, device:str,m:str, average:str='micro'):
    '''
    Computes the validation score for the model.

    val_dl
    ------
        DataLoader with the validation data
    model
    -----
        classification model to evaluate
    val_metrics
    -----------
        list of metric callables invoked as ``metric(labels, predictions, average=...)``
    device
    ------
        device string, e.g. 'cuda' or 'cpu'
    m
    ----
        name of the embedding library passed on to inference_loop
    average
    -------
        averaging mode forwarded to every metric; defaults to 'micro', the value that
        was previously hard-coded here. perform_testing in this notebook reports
        average='macro', so pass 'macro' to validate on the same metric.

    Returns
    -------
        dict mapping each metric's ``__name__`` to its score
    '''
    predictions, labels = inference_loop(model, val_dl, device,m)
    return {metric.__name__: metric(labels, predictions, average=average) for metric in val_metrics}

In [14]:
def train(model : CustomModel, train_dl:DataLoader, val_dl:DataLoader, loss, optimizer, scheduler, epochs, val_metrics, device,m):
    '''
    Runs the full training schedule: every epoch is one train_loop pass followed by
    one validation_loop pass.

    Returns
    -------
        the trained model, the list of per-epoch validation score dicts,
        the concatenated recorded losses and the concatenated recorded learning rates
    '''
    model.to(device)
    score_history, loss_history, lr_history = [], [], []

    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}\n-------------------------------")

        batch_losses, batch_lrs = train_loop(train_dl, model, loss, optimizer, scheduler, device,m)
        val_scores = validation_loop(val_dl, model, val_metrics, device,m)

        print(val_scores)
        print(f"Loss {np.mean(batch_losses)}")

        score_history.append(val_scores)
        loss_history += batch_losses
        lr_history += batch_lrs

    print("Done!")
    return model, score_history, loss_history, lr_history


In [15]:
def predict(model:CustomModel, test_dl:DataLoader, device:str,m:str):
    '''
    Returns only the predicted classes for the data in ``test_dl``;
    the labels collected by inference_loop are discarded.
    '''
    model.to(device)
    return inference_loop(model, test_dl, device,m)[0]

In [16]:
def train_only(model:CustomModel, train_dl:DataLoader, loss, optimizer, scheduler, epochs, val_metrics, device,m):
    '''
    Serves for final training on the whole dataset (no validation pass).

    model
    -----
        classification model to train; moved to ``device``
    train_dl
    --------
        DataLoader with the training data
    loss, optimizer, scheduler
    --------------------------
        forwarded unchanged to train_loop
    epochs
    ------
        number of passes over ``train_dl``
    val_metrics
    -----------
        unused; kept so the signature stays parallel to ``train``
    device
    ------
        device string, e.g. 'cuda' or 'cpu'
    m
    ----
        name of the embedding library passed on to train_loop

    Returns
    -------
        the trained model
    '''
    model.to(device)

    for epoch in range(epochs):

        print(f"Epoch {epoch+1}/{epochs}\n-------------------------------")
        # Only the losses are needed here (for the log line); the per-batch learning
        # rates were previously accumulated but never used or returned.
        epoch_losses, _ = train_loop(train_dl, model, loss, optimizer, scheduler, device,m)
        print(f"Loss {np.mean(epoch_losses)}")

    print("Done!")
    return model

In [17]:
# NOTE(review): this repeats the identical `models` list defined in an earlier cell;
# one of the two definitions can likely be removed.
models = ['LaBSE',
          'mUSE']

In [18]:
# Hyperparameter grid searched by perform_training (every combination is trained once).
param_comb = ParameterGrid({
    'learning_rate': [1e-1, 1e-2],
    'dropout': [0.2, 0.5],
    'epochs': [5],
})

In [19]:
# Shared settings; perform_training and perform_testing read these as globals.
batch_size=64  # batch size of every DataLoader
device = "cuda" if torch.cuda.is_available() else 'cpu'  # use the GPU when one is available
val_metrics = [f1_score]  # metrics computed by validation_loop, keyed by metric.__name__
loss = nn.CrossEntropyLoss()  # training criterion handed to train / train_only
sample = None  # NOTE(review): not referenced by any other cell shown here - confirm it is still needed

In [20]:
def perform_training(models:list,train_df:pd.DataFrame)->list:
    '''
    Trains the model and get the right hyperparameters for final evaluation.

    For every library, each combination in the global ``param_comb`` grid is trained
    from scratch and scored on a held-out validation split; the combination with the
    highest last-epoch 'f1_score' is selected.

    models
    ------
        list of embedding libraries to be used in classification
    train_df
    --------
        pandas.DataFrame that stores the training data (columns "text" and "label")

    Returns
    -------
        list of (library name, best hyperparameters) tuples, one per library
    '''

    best_val_per_model = []

    for m in models:
        # One seeded split per library instead of a fresh random split per combination:
        # every hyperparameter combination (and every library) is now scored on the
        # same validation rows, so the scores compared by argmax below are comparable.
        tr_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

        train_ds = CustomDataset(tr_df, m,"text", "label")
        val_ds = CustomDataset(val_df, m,"text", "label")
        train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        # Shuffling has no effect on the validation score, so keep the order fixed.
        val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

        val_per_model = []
        for params in param_comb:
            print(str(params))
            epochs = params['epochs']

            model = CustomModel(m,drop_out=params['dropout'])

            optimizer = torch.optim.AdamW(model.parameters(), lr=params['learning_rate'])
            # The scheduler is stepped once per batch, hence epochs * batches total steps.
            scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=epochs * len(train_dl))
            _, val_scores, _, _ = train(model, train_dl, val_dl, loss, optimizer, scheduler, epochs, val_metrics, device,m)
            # Score of the final epoch.
            val_per_model.append(val_scores[-1]['f1_score'])

        # Cast to a plain int before indexing the grid. The recorded run shows a warning
        # raised at the indexing line - presumably caused by the NumPy integer; TODO confirm.
        best_idx = int(np.argmax(val_per_model))
        best_score = (m,param_comb[best_idx])
        print(str(best_score))
        best_val_per_model.append(best_score)
    return best_val_per_model


In [21]:
# Run the hyperparameter search on the training split; yields one (library, params) tuple per library.
best_val_per_model = perform_training(models, train_df)

{'dropout': 0.2, 'epochs': 5, 'learning_rate': 0.1}
Epoch 1/5
-------------------------------
{'f1_score': 0.6114206128133705}
Loss 0.8918076078097026
Epoch 2/5
-------------------------------
{'f1_score': 0.6176880222841226}
Loss 0.8099873787826962
Epoch 3/5
-------------------------------
{'f1_score': 0.6302228412256268}
Loss 0.7699485262235005
Epoch 4/5
-------------------------------
{'f1_score': 0.6364902506963789}
Loss 0.7289984378549788
Epoch 5/5
-------------------------------
{'f1_score': 0.6371866295264624}
Loss 0.7051521903938718
Done!
{'dropout': 0.2, 'epochs': 5, 'learning_rate': 0.01}
Epoch 1/5
-------------------------------
{'f1_score': 0.6434540389972145}
Loss 0.8288327078024547
Epoch 2/5
-------------------------------
{'f1_score': 0.649025069637883}
Loss 0.7399081150690715
Epoch 3/5
-------------------------------
{'f1_score': 0.6636490250696379}
Loss 0.6988914628823598
Epoch 4/5
-------------------------------
{'f1_score': 0.6657381615598886}
Loss 0.6680390901035733

  best_score = (m,param_comb[best_idx])


{'f1_score': 0.584958217270195}
Loss 0.9144504368305206
Epoch 2/5
-------------------------------
{'f1_score': 0.5814763231197771}
Loss 0.8196359799967872
Epoch 3/5
-------------------------------
{'f1_score': 0.6149025069637883}
Loss 0.7374294989638859
Epoch 4/5
-------------------------------
{'f1_score': 0.6051532033426184}
Loss 0.6596722775035434
Epoch 5/5
-------------------------------
{'f1_score': 0.6030640668523677}
Loss 0.6017037348614799
Done!
{'dropout': 0.2, 'epochs': 5, 'learning_rate': 0.01}
Epoch 1/5
-------------------------------
{'f1_score': 0.6337047353760445}
Loss 0.8782797118028005
Epoch 2/5
-------------------------------
{'f1_score': 0.6344011142061281}
Loss 0.7511732498804728
Epoch 3/5
-------------------------------
{'f1_score': 0.6274373259052924}
Loss 0.6911866698000166
Epoch 4/5
-------------------------------
{'f1_score': 0.6371866295264624}
Loss 0.6460021515687306
Epoch 5/5
-------------------------------
{'f1_score': 0.6392757660167131}
Loss 0.61564947134

  best_score = (m,param_comb[best_idx])


In [22]:
# Display the hyperparameters selected for each library.
best_val_per_model

[('LaBSE', {'learning_rate': 0.01, 'epochs': 5, 'dropout': 0.2}),
 ('mUSE', {'learning_rate': 0.01, 'epochs': 5, 'dropout': 0.2})]

In [23]:
def perform_testing(best_val_per_model:list,train_df:pd.DataFrame,test_df:pd.DataFrame)->tuple:
    '''
    Test the model with their best hyperparameters.

    For every (library, params) entry a fresh model is trained on all of ``train_df``,
    its weights are saved to ``model_weights_<library>.pth`` and it is scored on
    ``test_df`` with the macro-averaged F1 score.

    best_val_per_model
    ------------------
        list that stores the best hyperparameters per library as (library, params) tuples
    train_df
    --------
        pandas.DataFrame that stores the training data

    test_df
    --------
        pandas.DataFrame that stores the test data

    Returns
    -------
        pd.DataFrame indexed by 'model' with the test 'f1_score' of each library
        list of (library, true labels, predictions) tuples, one per library
    '''

    records = []
    compar = []
    for val in best_val_per_model:
        m, params = val[0], val[1]
        epochs = params['epochs']

        train_ds = CustomDataset(train_df, m,"text", "label")
        train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

        model =CustomModel(m,drop_out=params['dropout'])
        optimizer = torch.optim.AdamW(model.parameters(), lr=params['learning_rate'])

        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=epochs * len(train_dl))
        loaded_model = train_only(model, train_dl,loss, optimizer, scheduler, epochs, val_metrics, device,m)
        # One file per library: a single shared file name made every iteration
        # overwrite the weights saved for the previous library.
        torch.save(loaded_model.state_dict(), f'model_weights_{m}.pth')

        test_ds = CustomDataset(test_df,m,"text", "label")
        # Keep the original row order so predictions line up with test_df['label'].
        test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)
        loaded_model.eval()
        preds = predict(loaded_model, test_dl,device,m )

        model_f1_sc = f1_score(test_df['label'], preds, average='macro')
        records.append((m, model_f1_sc))
        compar.append((m,test_df['label'],preds))

    # Build the frame once from the collected rows instead of growing it row by row.
    res_df = pd.DataFrame(records, columns=['model', 'f1_score']).set_index('model')
    return res_df,compar

In [24]:
# Retrain each library with its selected hyperparameters and score it on the held-out test set.
res_df,model_compar = perform_testing(best_val_per_model,train_df,test_df)

Epoch 1/5
-------------------------------
Loss 0.8156385136916574
Epoch 2/5
-------------------------------
Loss 0.7251854544192289
Epoch 3/5
-------------------------------
Loss 0.6940368477222139
Epoch 4/5
-------------------------------
Loss 0.6564069640847434
Epoch 5/5
-------------------------------
Loss 0.6385601811704382
Done!
Epoch 1/5
-------------------------------
Loss 0.8644441176304775
Epoch 2/5
-------------------------------
Loss 0.7589691059779277
Epoch 3/5
-------------------------------
Loss 0.7027081361914103
Epoch 4/5
-------------------------------
Loss 0.6559503416044522
Epoch 5/5
-------------------------------
Loss 0.6285838033773202
Done!


In [25]:
# Test F1 score per library (computed in perform_testing with average='macro').
print(res_df)

       f1_score
model          
LaBSE  0.645179
mUSE   0.601181


In [26]:
def to_short(x:float):
    '''
    Formats a float as a string with exactly two decimal places.
    '''
    return format(x, ".2f")

In [27]:
def save_res(latex_table:str,name)->None:
    '''
    Saves the LaTeX table to the text file ``<name>.txt``.
    An existing file with that name is overwritten.

    latex_table
    -----------
        string that defines created latex table
    name
    ----
        string that defines the name of the file (without the '.txt' suffix)
    Returns
    -------
    None
    '''
    # Write-only mode is sufficient, and the context manager closes the file on exit,
    # so no explicit close() is needed. The encoding is fixed so the output does not
    # depend on the platform default.
    with open(name + '.txt', 'w', encoding='utf-8') as f:
        f.write(latex_table)

In [28]:
# NOTE(review): the next cell starts with this exact statement again before
# post-processing the result, so this cell looks redundant.
latex_table = res_df.style.format({
        "f1_score": to_short,
        }).to_latex()

In [29]:
# Render the results as LaTeX (scores shortened by to_short), then patch the generated
# markup. The replacements run in this order: column spec, index-name row -> \hline,
# header text, and finally the empty first header cell -> 'model'.
latex_table = (
    res_df.style
    .format({"f1_score": to_short})
    .to_latex()
    .replace('{lr}', '{l||c}')
    .replace('model &  \\\\', '\\hline')
    .replace('f1_score', 'f1 score')
    .replace(' & f1 score','model & f1 score')
)

In [30]:
# Writes the table to 'final_score.txt' (save_res appends the '.txt' suffix).
save_res(latex_table,'final_score')

In [31]:
def plot_class_charts(model_compar:list,df:pd.DataFrame)->None:
    '''
    For every model, draws a pie chart with the language distribution of the texts
    it classified incorrectly and saves it as ``<model name>.jpg``.

    model_compar
    ------------
        list of (model name, true labels as pandas.Series, predictions) tuples
    df
    --------
        pd.DataFrame The original dataset; must contain a 'language' column and
        be indexed by the same labels as the true-label Series
    Returns
    -------
        None
    '''
    for mc in model_compar:
        name, truth, preds = mc[0], mc[1], mc[2]

        # Index labels of the rows whose prediction differs from the true label.
        wrong_rows = [
            truth.index[pos]
            for pos, actual in enumerate(truth.values)
            if actual != preds[pos]
        ]
        misclassified = df.loc[wrong_rows]

        plt.figure(figsize=(8, 6))
        language_counts = misclassified['language'].value_counts()
        language_counts.plot.pie(autopct='%1.1f%%', startangle=90, colormap='Pastel1')

        label = str(name)
        plt.title('Incorrectly classified texts with ' + label)
        plt.ylabel('')
        plt.savefig(label + ".jpg")
        # Close the figure so figures do not accumulate across models.
        plt.close()

In [32]:
# Saves one '<library>.jpg' pie chart per library into the working directory.
plot_class_charts(model_compar,df)