# Importing necessary libraries

In [None]:
%env TORCH_USE_CUDA_DSA=1

env: TORCH_USE_CUDA_DSA=1


In [None]:
# !pip install -U transformers
# !pip install -U accelerate

# !pip install transformers[torch]

In [None]:
import pandas as pd
import numpy as np
import random

from google.colab import drive
from os.path import join
import os
from tqdm.auto import tqdm

import accelerate
from transformers import (AutoTokenizer, Trainer, TrainingArguments, pipeline,
                          AutoModelForSequenceClassification, BertForSequenceClassification,
                          BertTokenizer, AdamW, XLNetTokenizer, XLNetForSequenceClassification,
                          RobertaTokenizer, RobertaForSequenceClassification, AutoModelForSeq2SeqLM)
import torch
from torch.utils.data import (Dataset, TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler, WeightedRandomSampler)
from tqdm import tqdm

from sklearn.metrics import (f1_score, classification_report, confusion_matrix,
                             ConfusionMatrixDisplay,accuracy_score,
                             precision_score, recall_score)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns

In [None]:
# Authenticate with the Hugging Face Hub via the interactive notebook widget.
# The training loops further down call `trainer.push_to_hub(...)`.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Finetuning models

## Load manual coded data

In [None]:
# Mount Google Drive (Colab) so the project files are reachable under /content/drive.
drive.mount('/content/drive')
# Root folder of the project data on the mounted drive.
PROJECT_DIR = "/content/drive/MyDrive/Thesis/Data"
# Sub-folder for the feedback data.
FEEDBACK_DIR = join(PROJECT_DIR, "Feedback data")
# Sub-folder where the per-model sample predictions are written.
PROCESSED_DIR = join(PROJECT_DIR, "Processed")

Mounted at /content/drive


Load the manually coded dataset:

In [None]:
# Load the manually coded (majority-vote) annotation file; it is Latin-1 encoded.
df = pd.read_csv(join(PROJECT_DIR,"Annotation data/manual_coding_majority_vote_2000_(emotionality_recoding).csv"), encoding='latin1')

# An empty comment is read back as NA. Fill missing comments by value instead of
# by a hard-coded row label, so the fix keeps working (and never overwrites a
# real comment) if the rows of the file are reordered or extended.
df['comment'] = df['comment'].fillna(" ")

# Drop the columns that are not used as classification targets
col_delete = ['t_other']
df = df.drop(columns=col_delete)

In [None]:
df.head(4)

Unnamed: 0,comment,global_id,t_communication,t_payment,t_refund,t_price,t_value,t_shipping,t_product,t_feedback,t_vendor,t_generic,t_overall,emo_une
0,great vendor,ao1003730,0,0,0,0,0,0,0,0,1,0,0,0
1,good shit. cone pretty small conpared to stock...,ao1005428,0,0,0,0,0,0,1,0,0,0,0,0
2,very nice,ao1007033,0,0,0,0,0,0,0,0,0,0,1,0
3,arrived in 3 days! Top quality product. this...,ao1008851,0,0,0,0,0,1,1,0,1,0,0,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   comment          2000 non-null   object
 1   global_id        2000 non-null   object
 2   t_communication  2000 non-null   int64 
 3   t_payment        2000 non-null   int64 
 4   t_refund         2000 non-null   int64 
 5   t_price          2000 non-null   int64 
 6   t_value          2000 non-null   int64 
 7   t_shipping       2000 non-null   int64 
 8   t_product        2000 non-null   int64 
 9   t_feedback       2000 non-null   int64 
 10  t_vendor         2000 non-null   int64 
 11  t_generic        2000 non-null   int64 
 12  t_overall        2000 non-null   int64 
 13  emo_une          2000 non-null   int64 
dtypes: int64(12), object(2)
memory usage: 218.9+ KB


In [None]:
df.shape

(2000, 14)

Create the summary table

In [None]:
# Tabulate, for every topic label ('t_*') and the emotionality label, how many
# comments are coded 0 and how many are coded 1.
label_names = [name for name in df.columns if name.startswith('t_')]
label_names.append('emo_une')
selected_col = df[label_names]


def _zero_one_counts(series):
    """Return the number of 0s and 1s in `series` (0 if a value never occurs)."""
    counts = series.value_counts()
    return pd.Series([counts.get(0, 0), counts.get(1, 0)], index=['0', '1'])


# One row per label, columns '0' and '1'
summary_table = selected_col.apply(_zero_one_counts).transpose()
summary_table

Unnamed: 0,0,1
t_communication,1646,354
t_payment,1925,75
t_refund,1904,96
t_price,1948,52
t_value,1888,112
t_shipping,984,1016
t_product,1111,889
t_feedback,1940,60
t_vendor,1657,343
t_generic,1874,126


## Preprocessing

Split Dataset to Train-Val-Test

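Note: the split below does not match the original plan. I intended to use 90% of the data for training and 5% each for validation and test. The code instead holds out 10% for validation and then 10% of the remainder for test, which gives roughly 81% / 10% / 9% (1620 / 200 / 180 comments). I kept it as is, because all reported results were produced on these sets.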

In [None]:
# Both splits use the same hold-out fraction and the same seed.
split_options = dict(test_size=0.10, random_state=123)

# First hold out the validation set, then carve the test set out of the remainder.
train_df_, val_df = train_test_split(df, **split_options)
train_df, test_df = train_test_split(train_df_, **split_options)

# Check the sizes of the sets
for set_name, subset in (("training", train_df), ("validation", val_df), ("test", test_df)):
    print(f"Size of {set_name} set:", len(subset))
print("Shape of training set:", train_df.shape)

Size of training set: 1620
Size of validation set: 200
Size of test set: 180
Shape of training set: (1620, 23)


Setting device

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Defining the performance metrics

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    `pred` must expose `label_ids` (the true labels) and `predictions`
    (per-class scores); the predicted class is the argmax over the last axis.

    Returns a dict with the keys 'accuracy' and 'f1'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        # zero_division=1: an undefined per-class score counts as 1 instead of warning
        'f1': f1_score(y_true, y_hat, average='weighted', zero_division=1),
    }


Create MyDataset - preprocessed dataset

In [None]:
class MyDataset(Dataset):
    """Dataset pairing pre-computed tokenizer encodings with optional labels.

    `encodings` is a mapping from field name (it must contain 'input_ids') to an
    indexable collection with one entry per sample; `labels` is an indexable
    collection aligned with the encodings, or None for unlabeled data.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is taken from the 'input_ids' field.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]

        # Always add the 'labels' key to the item, even when no labels exist (None)
        sample['labels'] = None if self.labels is None else self.labels[idx]

        return sample

## Topics & Emotion

In [None]:
# Each label column together with the title used on its confusion-matrix plot.
# The two lists derived below are index-aligned: confusion_matrix_title[i] is the
# title for columns[i].
_category_titles = [
    ('t_communication', 'Topic Communication'),
    ('t_payment', 'Topic Payment / finalise early'),
    ('t_refund', 'Topic Refund'),
    ('t_price', 'Topic Extras / goodies'),
    ('t_value', 'Topic Value for money'),
    ('t_shipping', 'Topic Delivery / stealth'),
    ('t_product', 'Topic Product quality'),
    ('t_feedback', 'Topic Leaving / changing feedback'),
    ('t_vendor', 'Topic Vendor quality'),
    ('t_generic', 'Topic Generic rating'),
    ('t_overall', 'Topic Overall experience'),
    ('emo_une', 'Text Emotionality Assessment'),
]

# Define the column names
columns = [name for name, _ in _category_titles]

confusion_matrix_title = [title for _, title in _category_titles]



In [None]:
# Function to calculate class weights
def calculate_class_weights(labels, num_classes=None):
    """Compute inverse-frequency class weights, ``len(labels) / count(class)``.

    Parameters
    ----------
    labels : array-like of non-negative ints
        Class index of every sample.
    num_classes : int, optional
        Minimum length of the returned vector. Pass the model's number of labels
        to get a full-length vector even when the highest class never occurs in
        `labels`. When omitted, the length is ``max(labels) + 1``.

    Returns
    -------
    torch.Tensor
        1-D float32 tensor; entry ``i`` is the weight of class ``i``. A class
        with no samples gets weight 0.0 rather than a division by zero.
    """
    class_counts = np.bincount(labels, minlength=num_classes or 0)
    total_samples = len(labels)

    # Only divide where the count is non-zero; absent classes keep weight 0.0.
    class_weights = np.zeros(len(class_counts), dtype=np.float64)
    present = class_counts > 0
    class_weights[present] = total_samples / class_counts[present]
    return torch.tensor(class_weights, dtype=torch.float)

In [None]:
# Custom Trainer class to handle weighted loss
class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the model logits.

    Accepts every argument of `Trainer` plus the keyword-only `weight`: a 1-D
    tensor with one weight per class, or None for an unweighted loss.
    """

    def __init__(self, *args, weight=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.weight = weight

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss (and the model outputs if requested).

        `num_items_in_batch` is accepted because recent transformers releases
        pass it to `compute_loss`; it is not used by this loss.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Keep the class weights on the same device as the logits.
        weight = self.weight
        if weight is not None:
            weight = weight.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        # Flatten to (n_samples, num_labels) vs. (n_samples,) as CrossEntropyLoss expects
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

### 1 Baseline

In [None]:
# Random baseline: for every category, draw each test prediction uniformly from
# the label values present in the test set and score it like the real models.
metric_columns = ['Category', 'Accuracy', 'F1-Score', 'F1-Score_0', 'F1-Score_1']

# Random seed for reproducibility
random.seed(12345)

# Collect one record per category and build the DataFrame once afterwards,
# instead of concatenating onto an empty frame inside the loop.
baseline_rows = []

# Iterate through each column
for column in columns:
    print(f"Generating baseline predictions for category: {column}")

    # Get true labels
    true_labels = test_df[column].values
    labels = np.unique(true_labels)

    # Generate random predictions
    baseline_pred = [random.choice(labels) for _ in range(len(test_df))]

    # Calculate metrics
    report = classification_report(true_labels, baseline_pred, output_dict=True, zero_division=1)

    # Collect metrics
    baseline_rows.append({
        'Category': column,
        'Accuracy': report['accuracy'],
        'F1-Score': report['weighted avg']['f1-score'],
        'F1-Score_0': report['0']['f1-score'],
        'F1-Score_1': report['1']['f1-score']
    })

# Initialize metrics dataframe for Baseline
baseline_metrics_df = pd.DataFrame(baseline_rows, columns=metric_columns)

# Calculate average metrics for the Baseline model
avg_accuracy_all = baseline_metrics_df['Accuracy'].mean()
avg_f1_score_all = baseline_metrics_df['F1-Score'].mean()

# Rows of the topic ('t_') categories, selected once and reused below
topic_metrics = baseline_metrics_df[baseline_metrics_df['Category'].str.startswith('t_')]
avg_accuracy_t = topic_metrics['Accuracy'].mean()
avg_f1_score_t = topic_metrics['F1-Score'].mean()

# NOTE: `columns` contains no 'm_' categories, so these two averages are NaN;
# the names are kept so that any code relying on them keeps working.
motive_metrics = baseline_metrics_df[baseline_metrics_df['Category'].str.startswith('m_')]
avg_accuracy_m = motive_metrics['Accuracy'].mean()
avg_f1_score_m = motive_metrics['F1-Score'].mean()

# Append average metrics to the DataFrame for the Baseline model
avg_baseline_metrics_row = pd.DataFrame([{
    'Category': 'Avg_t_Columns',
    'Accuracy': avg_accuracy_t,
    'F1-Score': avg_f1_score_t,
    'F1-Score_0': topic_metrics['F1-Score_0'].mean(),
    'F1-Score_1': topic_metrics['F1-Score_1'].mean()
}])

baseline_metrics_df = pd.concat([baseline_metrics_df, avg_baseline_metrics_row], ignore_index=True)


Generating baseline predictions for category: t_communication
Generating baseline predictions for category: t_payment
Generating baseline predictions for category: t_refund
Generating baseline predictions for category: t_price
Generating baseline predictions for category: t_value
Generating baseline predictions for category: t_shipping
Generating baseline predictions for category: t_product
Generating baseline predictions for category: t_feedback
Generating baseline predictions for category: t_vendor
Generating baseline predictions for category: t_generic
Generating baseline predictions for category: t_overall
Generating baseline predictions for category: emo_une


In [None]:
baseline_metrics_df

Unnamed: 0,Category,Accuracy,F1-Score,F1-Score_0,F1-Score_1
0,t_communication,0.577778,0.61628,0.68595,0.355932
1,t_payment,0.494444,0.641463,0.659176,0.021505
2,t_refund,0.5,0.620381,0.661654,0.042553
3,t_price,0.477778,0.624543,0.641221,0.040816
4,t_value,0.533333,0.650884,0.681818,0.125
5,t_shipping,0.533333,0.534381,0.481481,0.575758
6,t_product,0.561111,0.562553,0.606965,0.503145
7,t_feedback,0.488889,0.635797,0.648855,0.061224
8,t_vendor,0.461111,0.519215,0.580087,0.248062
9,t_generic,0.555556,0.674074,0.703704,0.111111


In [None]:
# Persist the baseline metrics table under the project's Results/metrics folder.
baseline_metrics_df.to_csv(join(PROJECT_DIR, "Results/metrics/Baseline.csv"))

### 2 BERT

In [None]:
# Load tokenizer
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Encode the data: the comments of the train, validation and test splits are
# tokenized with identical settings (padding, truncation, PyTorch tensors).
encoded_data_train, encoded_data_val, encoded_data_test = (
    tokenizer.batch_encode_plus(
        split_df["comment"].values.tolist(),
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    for split_df in (train_df, val_df, test_df)
)

In [None]:
# Initialize metrics dataframe
metrics_df = pd.DataFrame(columns=['Category', 'Accuracy', 'F1-Score',
                                   'F1-Score_0', 'F1-Score_1'])

# Create data frame to save prediction value:
# a copy of the test comments, their ids and the true labels; the training loop
# adds one 'pred_<column>' column per category.
sample_prediciton = test_df[['comment', 'global_id']+ columns].copy()

# Create directory to save confusion matrices if it doesn't exist
# (exist_ok avoids the separate check-then-create step)
conf_matrix_dir = join(PROJECT_DIR, "Results/Confusion_matrix/BERT")
os.makedirs(conf_matrix_dir, exist_ok=True)

In [None]:
# Iterate through each column: fine-tune one binary classifier per category,
# score it on the test split, store its predictions, push it to the Hub and
# save a confusion-matrix plot.
for idx, column in enumerate(columns):
    print(f"Training on category: {column}")

    # Create datasets for the current category (shared encodings, column-specific labels)
    train_dataset = MyDataset(encoded_data_train, train_df[column].values)
    val_dataset = MyDataset(encoded_data_val, val_df[column].values)
    test_dataset = MyDataset(encoded_data_test, test_df[column].values)

    # Calculate class weights from the training labels of this category
    class_weights = calculate_class_weights(train_df[column].values)
    class_weights = class_weights.to(device)

    # Load a fresh pretrained model with a 2-class head for every category
    model = (AutoModelForSequenceClassification
             .from_pretrained(model_ckpt, num_labels=2)
             .to(device))

    # Define training arguments
    batch_size = 32
    logging_steps = len(train_dataset) // batch_size

    model_name = f"{model_ckpt}-finetuned-{column}"
    training_args = TrainingArguments(
        output_dir=model_name,
        num_train_epochs=3,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        eval_strategy="epoch",
        disable_tqdm=False,
        logging_steps=logging_steps,
        push_to_hub=False,
        log_level="error"
    )

    # Initialize Trainer with custom weighted loss
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        weight=class_weights
    )

    # Train the model
    trainer.train()

    # Evaluate on validation dataset
    preds_output = trainer.predict(val_dataset)
    metrics = preds_output.metrics

    # Predict on test dataset (class with the highest logit)
    pred = np.argmax(trainer.predict(test_dataset).predictions, axis=1)
    report = classification_report(test_df[column], pred, output_dict=True, zero_division=1)

    # Collect metrics
    metrics_row = pd.DataFrame([{
        'Category': column,
        'Accuracy': report['accuracy'],
        'F1-Score': report['weighted avg']['f1-score'],
        'F1-Score_0': report['0']['f1-score'],
        'F1-Score_1': report['1']['f1-score']
    }])

    metrics_df = pd.concat([metrics_df, metrics_row], ignore_index=True)

    # Save sample prediction to file
    sample_prediciton[f'pred_{column}'] = pred

    # Push the model to the hub.
    # The first parameter of Trainer.push_to_hub is the commit message, so the
    # value is passed by keyword to make that explicit.
    trainer.push_to_hub(commit_message=model_name)

    # Plot confusion matrix.
    # labels=[0, 1] guarantees a 2x2 matrix even if only one class occurs in the
    # true labels and predictions; otherwise the reshape(2, 2) below would fail.
    conf_matrix = confusion_matrix(test_df[column], pred, labels=[0, 1])
    # Calculate percentages
    total = np.sum(conf_matrix)
    percentages = (conf_matrix / total * 100).flatten()

    # Combine counts and percentages in the labels
    annot = np.array(["{0}\n{1:.1f}%".format(count, pct) if pct != 0 else "{0}\n0%".format(count)
                      for count, pct in zip(conf_matrix.flatten(), percentages)]).reshape(2, 2)

    # Create the heatmap on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(conf_matrix, annot=annot, fmt='', cmap='PuBu',  cbar=False,
                xticklabels=['No', 'Yes'],
                yticklabels=['No', 'Yes'],
                ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0, va='center', ha='center', fontsize=9)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=90, va='center', ha='center', fontsize=9)

    # Title and labels
    ax.set_title(f'BERT: {confusion_matrix_title[idx]}', fontsize=10, fontweight='bold')
    ax.set_xlabel('Predicted Label', fontweight='bold')
    ax.set_ylabel('True Label', fontweight='bold')

    # Add a border around the heatmap
    for _, spine in ax.spines.items():
        spine.set_visible(True)
        spine.set_linewidth(0.35)
        spine.set_color('black')
    fig.savefig(os.path.join(conf_matrix_dir, f'confusion_matrix_{column}.png'))
    # Close the figure so figures do not accumulate across iterations
    plt.close(fig)

Training on category: t_communication


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5398,0.303789,0.855,0.86872
2,0.2611,0.209369,0.945,0.945304
3,0.143,0.193457,0.945,0.945304


events.out.tfevents.1719351303.5d90d51bf3f6.506.12:   0%|          | 0.00/6.95k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1719351197.5d90d51bf3f6.506.11:   0%|          | 0.00/4.88k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Training on category: t_payment


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.644,0.550316,0.88,0.904029
2,0.3376,0.281966,0.98,0.98
3,0.2208,0.214132,0.975,0.975562


training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

events.out.tfevents.1719351378.5d90d51bf3f6.506.13:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

Training on category: t_refund


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5116,0.49926,0.95,0.95
2,0.2407,0.314891,0.965,0.968572
3,0.0987,0.41413,0.975,0.974297


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1719351452.5d90d51bf3f6.506.14:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Training on category: t_price


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6325,0.456471,0.92,0.94541
2,0.4541,0.34398,0.99,0.987525
3,0.2505,0.312603,0.99,0.987525


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

events.out.tfevents.1719351046.5d90d51bf3f6.506.10:   0%|          | 0.00/5.99k [00:00<?, ?B/s]

events.out.tfevents.1719351526.5d90d51bf3f6.506.15:   0%|          | 0.00/6.92k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Training on category: t_value


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6312,0.425797,0.955,0.959593
2,0.3237,0.211161,0.975,0.975625
3,0.1912,0.211066,0.98,0.98


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1719351638.5d90d51bf3f6.506.16:   0%|          | 0.00/6.92k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Training on category: t_shipping


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5765,0.46697,0.805,0.804419
2,0.3081,0.302205,0.885,0.884655
3,0.1677,0.2976,0.88,0.879564


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

events.out.tfevents.1719351728.5d90d51bf3f6.506.17:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

Training on category: t_product


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6107,0.533032,0.74,0.733475
2,0.3951,0.427587,0.83,0.829684
3,0.2774,0.423034,0.825,0.825417


events.out.tfevents.1719351819.5d90d51bf3f6.506.18:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Training on category: t_feedback


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.47,0.13698,1.0,1.0
2,0.3169,0.052518,0.995,0.995494
3,0.1096,0.0255,1.0,1.0


events.out.tfevents.1719351909.5d90d51bf3f6.506.19:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

Training on category: t_vendor


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6159,0.481164,0.785,0.814385
2,0.3706,0.377357,0.85,0.861382
3,0.2443,0.37157,0.855,0.867641


events.out.tfevents.1719352001.5d90d51bf3f6.506.20:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Training on category: t_generic


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.418,0.344593,0.965,0.965653
2,0.1364,0.41741,0.965,0.965653
3,0.0419,0.459234,0.975,0.97449


events.out.tfevents.1719352092.5d90d51bf3f6.506.21:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Training on category: t_overall


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6043,0.523307,0.905,0.90863
2,0.439,0.498894,0.91,0.906595
3,0.3395,0.488536,0.91,0.908381


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

events.out.tfevents.1719352186.5d90d51bf3f6.506.22:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

Training on category: emo_une


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5909,0.490515,0.775,0.788626
2,0.429,0.496863,0.805,0.809475
3,0.3494,0.501724,0.81,0.8138


training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

events.out.tfevents.1719352287.5d90d51bf3f6.506.23:   0%|          | 0.00/6.92k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1719350212.5d90d51bf3f6.506.6:   0%|          | 0.00/4.85k [00:00<?, ?B/s]

In [None]:
# Calculate average metrics for the current model over the topic ('t_') categories;
# the matching rows are selected once and reused for every metric.
topic_rows = metrics_df[metrics_df['Category'].str.startswith('t_')]
avg_accuracy_t = topic_rows['Accuracy'].mean()
avg_f1_score_t = topic_rows['F1-Score'].mean()
avg_f1_score_0_t = topic_rows['F1-Score_0'].mean()
avg_f1_score_1_t = topic_rows['F1-Score_1'].mean()

# Append average metrics to the DataFrame for the current model
avg_record = {
    'Category': 'Avg_t_Columns',
    'Accuracy': avg_accuracy_t,
    'F1-Score': avg_f1_score_t,
    'F1-Score_0': avg_f1_score_0_t,
    'F1-Score_1': avg_f1_score_1_t
}
avg_metrics_row = pd.DataFrame([avg_record])

metrics_df = pd.concat([metrics_df, avg_metrics_row], ignore_index=True)

# Persist the per-category metrics together with the average row
bert_metrics_path = join(PROJECT_DIR, "Results/metrics/BERT.csv")
metrics_df.to_csv(bert_metrics_path)

In [None]:
metrics_df

Unnamed: 0,Category,Accuracy,F1-Score,F1-Score_0,F1-Score_1
0,t_communication,0.927778,0.92936,0.953405,0.839506
1,t_payment,0.972222,0.975307,0.985591,0.615385
2,t_refund,0.977778,0.976833,0.988166,0.818182
3,t_price,0.977778,0.975063,0.988636,0.5
4,t_value,0.966667,0.962694,0.982558,0.625
5,t_shipping,0.877778,0.877931,0.8625,0.89
6,t_product,0.811111,0.809467,0.841121,0.767123
7,t_feedback,0.972222,0.973726,0.985755,0.444444
8,t_vendor,0.888889,0.894243,0.929577,0.736842
9,t_generic,0.977778,0.979666,0.988166,0.818182


Save sample prediction:

In [None]:
# Write the test comments with their true labels and the BERT predictions to the
# processed-data folder (index=False: no index column in the CSV).
sample_prediciton.to_csv(join(PROCESSED_DIR, "SamplePrediction/BERT_sample_predicted_final.csv"), index=False)

### 3 RoBERTa

In [None]:
# Load tokenizer
model_ckpt = "roberta-large"
tokenizer = RobertaTokenizer.from_pretrained(model_ckpt)

# Encode the data: the comments of the train, validation and test splits are
# tokenized with identical settings (padding, truncation, PyTorch tensors).
encoded_data_train, encoded_data_val, encoded_data_test = (
    tokenizer.batch_encode_plus(
        split_df["comment"].values.tolist(),
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    for split_df in (train_df, val_df, test_df)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

In [None]:
# Initialize metrics dataframe
metrics_df = pd.DataFrame(columns=['Category', 'Accuracy', 'F1-Score',
                                   'F1-Score_0', 'F1-Score_1'])

# Create data frame to save prediction value:
# a copy of the test comments, their ids and the true labels; the training loop
# adds one 'pred_<column>' column per category.
sample_prediciton = test_df[['comment', 'global_id']+ columns].copy()

# Create directory to save confusion matrices if it doesn't exist
# (exist_ok avoids the separate check-then-create step)
conf_matrix_dir = join(PROJECT_DIR, "Results/Confusion_matrix/RoBERTa")
os.makedirs(conf_matrix_dir, exist_ok=True)

In [None]:
# Iterate through each column: fine-tune one binary classifier per category,
# score it on the test split, store its predictions, push it to the Hub and
# save a confusion-matrix plot.
for idx, column in enumerate(columns):
    print(f"Training on category: {column}")

    # Create datasets for the current category (shared encodings, column-specific labels)
    train_dataset = MyDataset(encoded_data_train, train_df[column].values)
    val_dataset = MyDataset(encoded_data_val, val_df[column].values)
    test_dataset = MyDataset(encoded_data_test, test_df[column].values)

    # Calculate class weights from the training labels of this category
    class_weights = calculate_class_weights(train_df[column].values)
    class_weights = class_weights.to(device)

    # Load a fresh pretrained model with a 2-class head for every category
    model = (AutoModelForSequenceClassification
             .from_pretrained(model_ckpt, num_labels=2)
             .to(device))

    # Define training arguments
    batch_size = 32
    logging_steps = len(train_dataset) // batch_size

    model_name = f"{model_ckpt}-finetuned-{column}"
    training_args = TrainingArguments(
        output_dir=model_name,
        num_train_epochs=3,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        eval_strategy="epoch",
        disable_tqdm=False,
        logging_steps=logging_steps,
        push_to_hub=False,
        log_level="error"
    )

    # Initialize Trainer with custom weighted loss
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        weight=class_weights
    )

    # Train the model
    trainer.train()

    # Evaluate on validation dataset
    preds_output = trainer.predict(val_dataset)
    metrics = preds_output.metrics

    # Predict on test dataset (class with the highest logit)
    pred = np.argmax(trainer.predict(test_dataset).predictions, axis=1)
    report = classification_report(test_df[column], pred, output_dict=True, zero_division=1)

    # Collect metrics
    metrics_row = pd.DataFrame([{
        'Category': column,
        'Accuracy': report['accuracy'],
        'F1-Score': report['weighted avg']['f1-score'],
        'F1-Score_0': report['0']['f1-score'],
        'F1-Score_1': report['1']['f1-score']
    }])

    metrics_df = pd.concat([metrics_df, metrics_row], ignore_index=True)

    # Save sample prediction to file
    sample_prediciton[f'pred_{column}'] = pred

    # Push the model to the hub.
    # The first parameter of Trainer.push_to_hub is the commit message, so this
    # name only labels the commit; it is passed by keyword to make that explicit.
    # NOTE(review): if the intent was to publish the model under this name, set
    # `hub_model_id` in TrainingArguments instead - confirm.
    model_name = f"RoBERTa_motive_topic_{column}"
    trainer.push_to_hub(commit_message=model_name)

    # Plot confusion matrix.
    # labels=[0, 1] guarantees a 2x2 matrix even if only one class occurs in the
    # true labels and predictions; otherwise the reshape(2, 2) below would fail.
    conf_matrix = confusion_matrix(test_df[column], pred, labels=[0, 1])
    # Calculate percentages
    total = np.sum(conf_matrix)
    percentages = (conf_matrix / total * 100).flatten()

    # Combine counts and percentages in the labels
    annot = np.array(["{0}\n{1:.1f}%".format(count, pct) if pct != 0 else "{0}\n0%".format(count)
                      for count, pct in zip(conf_matrix.flatten(), percentages)]).reshape(2, 2)

    # Create the heatmap on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(conf_matrix, annot=annot, fmt='', cmap='PuBu',  cbar=False,
                xticklabels=['No', 'Yes'],
                yticklabels=['No', 'Yes'],
                ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0, va='center', ha='center', fontsize=9)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=90, va='center', ha='center', fontsize=9)

    # Title and labels
    ax.set_title(f'RoBERTa: {confusion_matrix_title[idx]}', fontsize=10, fontweight='bold')
    ax.set_xlabel('Predicted Label', fontweight='bold')
    ax.set_ylabel('True Label', fontweight='bold')

    # Add a border around the heatmap
    for _, spine in ax.spines.items():
        spine.set_visible(True)
        spine.set_linewidth(0.35)
        spine.set_color('black')
    fig.savefig(os.path.join(conf_matrix_dir, f'confusion_matrix_{column}.png'))
    # Close the figure so figures do not accumulate across iterations
    plt.close(fig)

Training on category: emo_une


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6138,0.444329,0.825,0.830904
2,0.433,0.48098,0.865,0.86343
3,0.3137,0.586446,0.865,0.865489


training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

events.out.tfevents.1719371153.f2cb808dedd8.367.0:   0%|          | 0.00/6.92k [00:00<?, ?B/s]

In [None]:
# Calculate average metrics for the current model over the topic ('t_') categories;
# the matching rows are selected once and reused for every metric.
topic_rows = metrics_df[metrics_df['Category'].str.startswith('t_')]
avg_accuracy_t = topic_rows['Accuracy'].mean()
avg_f1_score_t = topic_rows['F1-Score'].mean()
avg_f1_score_0_t = topic_rows['F1-Score_0'].mean()
avg_f1_score_1_t = topic_rows['F1-Score_1'].mean()

# Append average metrics to the DataFrame for the current model
avg_record = {
    'Category': 'Avg_t_Columns',
    'Accuracy': avg_accuracy_t,
    'F1-Score': avg_f1_score_t,
    'F1-Score_0': avg_f1_score_0_t,
    'F1-Score_1': avg_f1_score_1_t
}
avg_metrics_row = pd.DataFrame([avg_record])

metrics_df = pd.concat([metrics_df, avg_metrics_row], ignore_index=True)

# Persist the per-category metrics together with the average row
roberta_metrics_path = join(PROJECT_DIR, "Results/metrics/RoBERTa.csv")
metrics_df.to_csv(roberta_metrics_path)

In [None]:
metrics_df

Save sample prediction:

In [None]:
# Write the test comments with their true labels and the RoBERTa predictions to
# the processed-data folder (index=False: no index column in the CSV).
sample_prediciton.to_csv(join(PROCESSED_DIR, "SamplePrediction/RoBERTa_sample_predicted_final.csv"), index=False)

### 4 DistilBERT

In [None]:
# Load tokenizer
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Encode the data: the comments of the train, validation and test splits are
# tokenized with identical settings (padding, truncation, PyTorch tensors).
encoded_data_train, encoded_data_val, encoded_data_test = (
    tokenizer.batch_encode_plus(
        split_df["comment"].values.tolist(),
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    for split_df in (train_df, val_df, test_df)
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Initialize metrics dataframe
metrics_df = pd.DataFrame(columns=['Category', 'Accuracy', 'F1-Score',
                                   'F1-Score_0', 'F1-Score_1'])

# Create data frame to save prediction value:
# a copy of the test comments, their ids and the true labels; the training loop
# adds one 'pred_<column>' column per category.
sample_prediciton = test_df[['comment', 'global_id']+ columns].copy()

# Create directory to save confusion matrices if it doesn't exist
# (exist_ok avoids the separate check-then-create step)
conf_matrix_dir = join(PROJECT_DIR, "Results/Confusion_matrix/DistilBERT")
os.makedirs(conf_matrix_dir, exist_ok=True)

In [None]:
# Fine-tune one binary DistilBERT classifier per category, evaluate it on the
# test split, push it to the Hub and save its confusion matrix as a PNG.
for idx, column in enumerate(columns):
    print(f"Training on category: {column}")

    # Create datasets for the current category
    train_dataset = MyDataset(encoded_data_train, train_df[column].values)
    val_dataset = MyDataset(encoded_data_val, val_df[column].values)
    test_dataset = MyDataset(encoded_data_test, test_df[column].values)

    # Class weights for the weighted loss, placed on the training device
    class_weights = calculate_class_weights(train_df[column].values)
    class_weights = class_weights.to(device)

    # Load a fresh pretrained model for every category
    model = (AutoModelForSequenceClassification
             .from_pretrained(model_ckpt, num_labels=2)
             .to(device))

    # Define training arguments
    batch_size = 64
    # Log about once per epoch. Clamp to 1 because TrainingArguments rejects
    # logging_steps == 0, which the integer division yields for tiny train sets.
    logging_steps = max(1, len(train_dataset) // batch_size)

    model_name = f"{model_ckpt}-finetuned-{column}"
    training_args = TrainingArguments(
        output_dir=model_name,
        num_train_epochs=5,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        eval_strategy="epoch",
        disable_tqdm=False,
        logging_steps=logging_steps,
        push_to_hub=False,
        log_level="error"
    )

    # Initialize Trainer with custom weighted loss
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        weight=class_weights
    )

    # Train the model
    trainer.train()

    # Evaluate on validation dataset (kept available for inspection after the loop)
    preds_output = trainer.predict(val_dataset)
    metrics = preds_output.metrics

    # Predict on test dataset
    pred = np.argmax(trainer.predict(test_dataset).predictions, axis=1)
    # labels=[0, 1] guarantees both class entries exist in the report even when
    # one class never occurs in the test labels or in the predictions.
    report = classification_report(test_df[column], pred, labels=[0, 1],
                                   output_dict=True, zero_division=1)

    # Collect metrics
    metrics_row = pd.DataFrame([{
        'Category': column,
        # Computed directly because the report's 'accuracy' key is not always
        # present once `labels` is passed explicitly.
        'Accuracy': accuracy_score(test_df[column], pred),
        'F1-Score': report['weighted avg']['f1-score'],
        'F1-Score_0': report['0']['f1-score'],
        'F1-Score_1': report['1']['f1-score']
    }])

    # Appended per iteration so partial results survive an interrupted run
    metrics_df = pd.concat([metrics_df, metrics_row], ignore_index=True)

    # Save sample prediction to file
    sample_prediciton[f'pred_{column}'] = pred

    # Push the model to the hub
    # NOTE(review): the first positional argument of Trainer.push_to_hub is the
    # commit message, not a repository name, so this string only labels the
    # commit. The target repository is derived from `output_dir` unless
    # `hub_model_id` is set - confirm this is intended.
    model_name = f"DistilBERT_motive_topic_{column}"
    trainer.push_to_hub(commit_message=model_name)

    # Confusion matrix, forced to 2x2 so the reshape below cannot fail
    conf_matrix = confusion_matrix(test_df[column], pred, labels=[0, 1])
    # Calculate percentages
    total = np.sum(conf_matrix)
    percentages = (conf_matrix / total * 100).flatten()

    # Combine counts and percentages in the labels
    labels = np.array(["{0}\n{1:.1f}%".format(count, pct) if pct != 0 else "{0}\n0%".format(count)
                      for count, pct in zip(conf_matrix.flatten(), percentages)]).reshape(2, 2)

    # Create the heatmap on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(conf_matrix, annot=labels, fmt='', cmap='PuBu', cbar=False,
                xticklabels=['No', 'Yes'],
                yticklabels=['No', 'Yes'],
                ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0, va='center', ha='center', fontsize=9)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=90, va='center', ha='center', fontsize=9)

    # Title and labels
    ax.set_title(f'DistilBERT: {confusion_matrix_title[idx]}', fontsize=10, fontweight='bold')
    ax.set_xlabel('Predicted Label', fontweight='bold')
    ax.set_ylabel('True Label', fontweight='bold')

    # Add a border around the heatmap
    for _, spine in ax.spines.items():
        spine.set_visible(True)
        spine.set_linewidth(0.35)
        spine.set_color('black')
    fig.savefig(os.path.join(conf_matrix_dir, f'confusion_matrix_{column}.png'))
    # Close explicitly so figures do not accumulate across categories
    plt.close(fig)

Training on category: emo_une


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6217,0.517574,0.795,0.801916
2,0.4629,0.455523,0.805,0.814197
3,0.3877,0.455798,0.795,0.801916
4,0.3343,0.46404,0.805,0.809475
5,0.3068,0.46189,0.79,0.797601


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

events.out.tfevents.1719371838.f2cb808dedd8.367.1:   0%|          | 0.00/7.96k [00:00<?, ?B/s]

In [None]:
# Calculate average metrics over the topic ('t_') categories for the current model
metric_cols = ['Accuracy', 'F1-Score', 'F1-Score_0', 'F1-Score_1']

# Drop a summary row left over from an earlier run of this cell, so that
# re-running it does not stack duplicate 'Avg_t_Columns' rows.
metrics_df = metrics_df[metrics_df['Category'] != 'Avg_t_Columns']

# Select the topic rows once and average all metric columns together.
# astype(float) because the table may carry object dtype from the empty frame
# it was grown from. The means are NaN when no 't_' category was trained.
topic_rows = metrics_df[metrics_df['Category'].str.startswith('t_')]
topic_means = topic_rows[metric_cols].astype(float).mean()
avg_accuracy_t, avg_f1_score_t, avg_f1_score_0_t, avg_f1_score_1_t = topic_means[metric_cols]

# Append average metrics to the DataFrame for the current model
avg_metrics_row = pd.DataFrame([{
    'Category': 'Avg_t_Columns',
    'Accuracy': avg_accuracy_t,
    'F1-Score': avg_f1_score_t,
    'F1-Score_0': avg_f1_score_0_t,
    'F1-Score_1': avg_f1_score_1_t
}])

metrics_df = pd.concat([metrics_df, avg_metrics_row], ignore_index=True)

# to_csv does not create missing folders, so make sure the target exists
metrics_dir = join(PROJECT_DIR, "Results/metrics")
os.makedirs(metrics_dir, exist_ok=True)
metrics_df.to_csv(join(metrics_dir, "DistilBERT.csv"))

In [None]:
# Display the DistilBERT metrics table: one row per trained category plus the
# appended 'Avg_t_Columns' summary row.
metrics_df

Save sample prediction:

In [None]:
# Persist the test comments, their label columns and the DistilBERT
# `pred_<category>` columns to Drive.
sample_pred_dir = join(PROCESSED_DIR, "SamplePrediction")
os.makedirs(sample_pred_dir, exist_ok=True)  # to_csv does not create missing folders
sample_prediciton.to_csv(join(sample_pred_dir, "DistilBERT_sample_predicted_final.csv"), index=False)

### 5 DeBERTa

In [None]:
# Load tokenizer
model_ckpt = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Encode the train / validation / test comments with identical settings.
# Calling the tokenizer directly replaces the legacy `batch_encode_plus` method
# and returns the same kind of BatchEncoding. Each split is encoded in one call,
# so padding=True pads it to the longest comment of that split, and
# truncation=True cuts comments at the model's maximum input length.
encoded_data_train, encoded_data_val, encoded_data_test = (
    tokenizer(
        split_df["comment"].values.tolist(),
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    for split_df in (train_df, val_df, test_df)
)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [None]:
# Initialize the (empty) per-category metrics table for this model
metrics_df = pd.DataFrame(columns=['Category', 'Accuracy', 'F1-Score',
                                   'F1-Score_0', 'F1-Score_1'])

# Copy of the test comments, ids and label columns; the training loop adds
# one `pred_<category>` column per category to it.
sample_prediciton = test_df[['comment', 'global_id'] + columns].copy()

# Create the directory for the confusion-matrix images.
# exist_ok=True makes this a no-op when the folder is already there.
conf_matrix_dir = join(PROJECT_DIR, "Results/Confusion_matrix/DeBERTa")
os.makedirs(conf_matrix_dir, exist_ok=True)

In [None]:
# Fine-tune one binary DeBERTa-v3 classifier per category, evaluate it on the
# test split, push it to the Hub and save its confusion matrix as a PNG.
for idx, column in enumerate(columns):
    print(f"Training on category: {column}")

    # Create datasets for the current category
    train_dataset = MyDataset(encoded_data_train, train_df[column].values)
    val_dataset = MyDataset(encoded_data_val, val_df[column].values)
    test_dataset = MyDataset(encoded_data_test, test_df[column].values)

    # Class weights for the weighted loss, placed on the training device
    class_weights = calculate_class_weights(train_df[column].values)
    class_weights = class_weights.to(device)

    # Load a fresh pretrained model for every category
    model = (AutoModelForSequenceClassification
             .from_pretrained(model_ckpt, num_labels=2)
             .to(device))

    # Define training arguments
    batch_size = 64
    # Log about once per epoch. Clamp to 1 because TrainingArguments rejects
    # logging_steps == 0, which the integer division yields for tiny train sets.
    logging_steps = max(1, len(train_dataset) // batch_size)

    model_name = f"{model_ckpt}-finetuned-{column}"
    training_args = TrainingArguments(
        output_dir=model_name,
        num_train_epochs=5,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        eval_strategy="epoch",
        disable_tqdm=False,
        logging_steps=logging_steps,
        push_to_hub=False,
        log_level="error"
    )

    # Initialize Trainer with custom weighted loss
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        weight=class_weights
    )

    # Train the model
    trainer.train()

    # Evaluate on validation dataset (kept available for inspection after the loop)
    preds_output = trainer.predict(val_dataset)
    metrics = preds_output.metrics

    # Predict on test dataset
    pred = np.argmax(trainer.predict(test_dataset).predictions, axis=1)
    # labels=[0, 1] guarantees both class entries exist in the report even when
    # one class never occurs in the test labels or in the predictions.
    report = classification_report(test_df[column], pred, labels=[0, 1],
                                   output_dict=True, zero_division=1)

    # Collect metrics
    metrics_row = pd.DataFrame([{
        'Category': column,
        # Computed directly because the report's 'accuracy' key is not always
        # present once `labels` is passed explicitly.
        'Accuracy': accuracy_score(test_df[column], pred),
        'F1-Score': report['weighted avg']['f1-score'],
        'F1-Score_0': report['0']['f1-score'],
        'F1-Score_1': report['1']['f1-score']
    }])

    # Appended per iteration so partial results survive an interrupted run
    metrics_df = pd.concat([metrics_df, metrics_row], ignore_index=True)

    # Save prediction to new column
    sample_prediciton[f'pred_{column}'] = pred

    # Push the model to the hub
    # NOTE(review): the first positional argument of Trainer.push_to_hub is the
    # commit message, not a repository name, so this string only labels the
    # commit. The target repository is derived from `output_dir` unless
    # `hub_model_id` is set - confirm this is intended.
    trainer.push_to_hub(commit_message=model_name)

    # Confusion matrix, forced to 2x2 so the reshape below cannot fail
    conf_matrix = confusion_matrix(test_df[column], pred, labels=[0, 1])
    # Calculate percentages
    total = np.sum(conf_matrix)
    percentages = (conf_matrix / total * 100).flatten()

    # Combine counts and percentages in the labels
    labels = np.array(["{0}\n{1:.1f}%".format(count, pct) if pct != 0 else "{0}\n0%".format(count)
                      for count, pct in zip(conf_matrix.flatten(), percentages)]).reshape(2, 2)

    # Create the heatmap on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(conf_matrix, annot=labels, fmt='', cmap='PuBu', cbar=False,
                xticklabels=['No', 'Yes'],
                yticklabels=['No', 'Yes'],
                ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0, va='center', ha='center', fontsize=9)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=90, va='center', ha='center', fontsize=9)

    # Title and labels
    ax.set_title(f'DeBERTa V3: {confusion_matrix_title[idx]}', fontsize=10, fontweight='bold')
    ax.set_xlabel('Predicted Label', fontweight='bold')
    ax.set_ylabel('True Label', fontweight='bold')

    # Add a border around the heatmap
    for _, spine in ax.spines.items():
        spine.set_visible(True)
        spine.set_linewidth(0.35)
        spine.set_color('black')
    fig.savefig(os.path.join(conf_matrix_dir, f'confusion_matrix_{column}.png'))
    # Close explicitly so figures do not accumulate across categories
    plt.close(fig)

Training on category: t_communication


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6469,0.422156,0.815,0.834358
2,0.3381,0.152035,0.935,0.938293
3,0.1774,0.170187,0.925,0.929336
4,0.1385,0.162509,0.96,0.960836
5,0.102,0.174759,0.945,0.946904


model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1719353690.8619c8f6a36e.841.0:   0%|          | 0.00/8.26k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Training on category: t_payment


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6873,0.704308,0.95,0.925641
2,0.66,0.666924,0.955,0.937111
3,0.5209,0.512937,0.805,0.855604
4,0.2546,0.608935,0.97,0.968412
5,0.1266,0.691632,0.975,0.972892


events.out.tfevents.1719353854.8619c8f6a36e.841.1:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

Training on category: t_refund


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6724,0.62742,0.955,0.933018
2,0.4383,0.340172,0.94,0.949908
3,0.2518,0.344391,0.975,0.976687
4,0.1072,0.479081,0.975,0.975625
5,0.1466,0.4533,0.97,0.971421


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1719354001.8619c8f6a36e.841.2:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Training on category: t_price


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.681,0.612255,0.985,0.977557
2,0.559,0.386951,0.975,0.976754
3,0.2592,0.112973,1.0,1.0
4,0.1632,0.043036,1.0,1.0
5,0.0234,0.011421,1.0,1.0


events.out.tfevents.1719354142.8619c8f6a36e.841.3:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Training on category: t_value


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6946,0.679234,0.395,0.518088
2,0.6304,0.490001,0.82,0.870636
3,0.4186,0.362089,0.97,0.972569
4,0.2977,0.308858,0.97,0.972569
5,0.2423,0.273558,0.97,0.972569


events.out.tfevents.1719354291.8619c8f6a36e.841.4:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Training on category: t_shipping


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6587,0.570415,0.695,0.68127
2,0.3813,0.287708,0.91,0.910036
3,0.163,0.205511,0.945,0.945034
4,0.1083,0.203976,0.945,0.945029
5,0.0811,0.225797,0.94,0.94


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

events.out.tfevents.1719354436.8619c8f6a36e.841.5:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

Training on category: t_product


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6779,0.619417,0.63,0.613621
2,0.512,0.447949,0.825,0.822719
3,0.3284,0.367584,0.865,0.8655
4,0.2196,0.358104,0.86,0.86023
5,0.1563,0.361534,0.865,0.864612


events.out.tfevents.1719354581.8619c8f6a36e.841.6:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

Training on category: t_feedback


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6709,0.569016,0.99,0.985025
2,0.4963,0.496605,0.825,0.895086
3,0.2759,0.260047,0.985,0.986481
4,0.4094,0.105482,0.99,0.991641
5,0.211,0.07297,0.985,0.988157


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

events.out.tfevents.1719354727.8619c8f6a36e.841.7:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

Training on category: t_vendor


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.685,0.647895,0.51,0.56865
2,0.5257,0.356657,0.845,0.863653
3,0.2651,0.297105,0.885,0.895806
4,0.1958,0.333769,0.9,0.906753
5,0.1535,0.339526,0.895,0.902538


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1719354881.8619c8f6a36e.841.8:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Training on category: t_generic


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6483,0.465427,0.79,0.843125
2,0.3702,0.281373,0.985,0.98528
3,0.2238,0.237364,0.98,0.98
4,0.1812,0.177869,0.985,0.98528
5,0.1251,0.145057,0.985,0.98528


events.out.tfevents.1719355022.8619c8f6a36e.841.9:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Training on category: t_overall


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.679,0.619755,0.8,0.823848
2,0.5234,0.424955,0.83,0.85131
3,0.3869,0.424908,0.87,0.88151
4,0.3071,0.461432,0.875,0.884242
5,0.2313,0.48587,0.885,0.892254


events.out.tfevents.1719355168.8619c8f6a36e.841.10:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

Training on category: emo_une


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.68,0.626895,0.585,0.610693
2,0.5312,0.455174,0.86,0.857792
3,0.3854,0.447794,0.84,0.844127
4,0.3005,0.439513,0.86,0.864366
5,0.258,0.450045,0.865,0.868098


spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

events.out.tfevents.1719355314.8619c8f6a36e.841.11:   0%|          | 0.00/8.23k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Calculate average metrics over the topic ('t_') categories for the current model
metric_cols = ['Accuracy', 'F1-Score', 'F1-Score_0', 'F1-Score_1']

# Drop a summary row left over from an earlier run of this cell, so that
# re-running it does not stack duplicate 'Avg_t_Columns' rows.
metrics_df = metrics_df[metrics_df['Category'] != 'Avg_t_Columns']

# Select the topic rows once and average all metric columns together.
# astype(float) because the table may carry object dtype from the empty frame
# it was grown from. The means are NaN when no 't_' category was trained.
topic_rows = metrics_df[metrics_df['Category'].str.startswith('t_')]
topic_means = topic_rows[metric_cols].astype(float).mean()
avg_accuracy_t, avg_f1_score_t, avg_f1_score_0_t, avg_f1_score_1_t = topic_means[metric_cols]

# Append average metrics to the DataFrame for the current model
avg_metrics_row = pd.DataFrame([{
    'Category': 'Avg_t_Columns',
    'Accuracy': avg_accuracy_t,
    'F1-Score': avg_f1_score_t,
    'F1-Score_0': avg_f1_score_0_t,
    'F1-Score_1': avg_f1_score_1_t
}])

metrics_df = pd.concat([metrics_df, avg_metrics_row], ignore_index=True)

# to_csv does not create missing folders, so make sure the target exists
metrics_dir = join(PROJECT_DIR, "Results/metrics")
os.makedirs(metrics_dir, exist_ok=True)
metrics_df.to_csv(join(metrics_dir, "DeBERTa.csv"))

In [None]:
# Display the DeBERTa metrics table: one row per trained category plus the
# appended 'Avg_t_Columns' summary row.
metrics_df

Save sample prediction:

In [None]:
# Persist the test comments, their label columns and the DeBERTa
# `pred_<category>` columns to Drive.
sample_pred_dir = join(PROCESSED_DIR, "SamplePrediction")
os.makedirs(sample_pred_dir, exist_ok=True)  # to_csv does not create missing folders
sample_prediciton.to_csv(join(sample_pred_dir, "DeBERTa_sample_predicted_final.csv"), index=False)

### 6 XLM-RoBERTa

In [None]:
# Load tokenizer
model_ckpt = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Encode the train / validation / test comments with identical settings.
# Calling the tokenizer directly replaces the legacy `batch_encode_plus` method
# and returns the same kind of BatchEncoding. Each split is encoded in one call,
# so padding=True pads it to the longest comment of that split, and
# truncation=True cuts comments at the model's maximum input length.
encoded_data_train, encoded_data_val, encoded_data_test = (
    tokenizer(
        split_df["comment"].values.tolist(),
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    for split_df in (train_df, val_df, test_df)
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
# Initialize the (empty) per-category metrics table for this model
metrics_df = pd.DataFrame(columns=['Category', 'Accuracy', 'F1-Score',
                                   'F1-Score_0', 'F1-Score_1'])

# Copy of the test comments, ids and label columns; the training loop adds
# one `pred_<category>` column per category to it.
sample_prediciton = test_df[['comment', 'global_id'] + columns].copy()

# Create the directory for the confusion-matrix images.
# exist_ok=True makes this a no-op when the folder is already there.
conf_matrix_dir = join(PROJECT_DIR, "Results/Confusion_matrix/XLM-RoBERTa")
os.makedirs(conf_matrix_dir, exist_ok=True)

In [None]:
# Fine-tune one binary XLM-RoBERTa classifier per category, evaluate it on the
# test split, push it to the Hub and save its confusion matrix as a PNG.
for idx, column in enumerate(columns):
    print(f"Training on category: {column}")

    # Create datasets for the current category
    train_dataset = MyDataset(encoded_data_train, train_df[column].values)
    val_dataset = MyDataset(encoded_data_val, val_df[column].values)
    test_dataset = MyDataset(encoded_data_test, test_df[column].values)

    # Class weights for the weighted loss, placed on the training device
    class_weights = calculate_class_weights(train_df[column].values)
    class_weights = class_weights.to(device)

    # Load a fresh pretrained model for every category
    model = (AutoModelForSequenceClassification
             .from_pretrained(model_ckpt, num_labels=2)
             .to(device))

    # Define training arguments
    batch_size = 64
    # Log about once per epoch. Clamp to 1 because TrainingArguments rejects
    # logging_steps == 0, which the integer division yields for tiny train sets.
    logging_steps = max(1, len(train_dataset) // batch_size)

    model_name = f"{model_ckpt}-finetuned-{column}"
    training_args = TrainingArguments(
        output_dir=model_name,
        num_train_epochs=5,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        weight_decay=0.01,
        eval_strategy="epoch",
        disable_tqdm=False,
        logging_steps=logging_steps,
        push_to_hub=False,
        log_level="error"
    )

    # Initialize Trainer with custom weighted loss
    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        weight=class_weights
    )

    # Train the model
    trainer.train()

    # Evaluate on validation dataset (kept available for inspection after the loop)
    preds_output = trainer.predict(val_dataset)
    metrics = preds_output.metrics

    # Predict on test dataset
    pred = np.argmax(trainer.predict(test_dataset).predictions, axis=1)
    # labels=[0, 1] guarantees both class entries exist in the report even when
    # one class never occurs in the test labels or in the predictions.
    report = classification_report(test_df[column], pred, labels=[0, 1],
                                   output_dict=True, zero_division=1)

    # Collect metrics
    metrics_row = pd.DataFrame([{
        'Category': column,
        # Computed directly because the report's 'accuracy' key is not always
        # present once `labels` is passed explicitly.
        'Accuracy': accuracy_score(test_df[column], pred),
        'F1-Score': report['weighted avg']['f1-score'],
        'F1-Score_0': report['0']['f1-score'],
        'F1-Score_1': report['1']['f1-score']
    }])

    # Appended per iteration so partial results survive an interrupted run
    metrics_df = pd.concat([metrics_df, metrics_row], ignore_index=True)

    # Save sample prediction to file
    sample_prediciton[f'pred_{column}'] = pred

    # Push the model to the hub
    # NOTE(review): the first positional argument of Trainer.push_to_hub is the
    # commit message, not a repository name, so this string only labels the
    # commit. The target repository is derived from `output_dir` unless
    # `hub_model_id` is set - confirm this is intended.
    model_name = f"XLM-RoBERTa_motive_topic_{column}"
    trainer.push_to_hub(commit_message=model_name)

    # Confusion matrix, forced to 2x2 so the reshape below cannot fail
    conf_matrix = confusion_matrix(test_df[column], pred, labels=[0, 1])
    # Calculate percentages
    total = np.sum(conf_matrix)
    percentages = (conf_matrix / total * 100).flatten()

    # Combine counts and percentages in the labels
    labels = np.array(["{0}\n{1:.1f}%".format(count, pct) if pct != 0 else "{0}\n0%".format(count)
                      for count, pct in zip(conf_matrix.flatten(), percentages)]).reshape(2, 2)

    # Create the heatmap on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.heatmap(conf_matrix, annot=labels, fmt='', cmap='PuBu', cbar=False,
                xticklabels=['No', 'Yes'],
                yticklabels=['No', 'Yes'],
                ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0, va='center', ha='center', fontsize=9)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=90, va='center', ha='center', fontsize=9)

    # Title and labels
    ax.set_title(f'XLM-RoBERTa: {confusion_matrix_title[idx]}', fontsize=10, fontweight='bold')
    ax.set_xlabel('Predicted Label', fontweight='bold')
    ax.set_ylabel('True Label', fontweight='bold')

    # Add a border around the heatmap
    for _, spine in ax.spines.items():
        spine.set_visible(True)
        spine.set_linewidth(0.35)
        spine.set_color('black')
    fig.savefig(os.path.join(conf_matrix_dir, f'confusion_matrix_{column}.png'))
    # Close explicitly so figures do not accumulate across categories
    plt.close(fig)

Training on category: emo_une


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6981,0.647407,0.565,0.593748
2,0.6379,0.54727,0.765,0.777799
3,0.5064,0.469615,0.805,0.810565
4,0.4314,0.458369,0.825,0.832531
5,0.416,0.450161,0.84,0.844127


events.out.tfevents.1719371962.f2cb808dedd8.367.2:   0%|          | 0.00/8.14k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

In [None]:
# Calculate average metrics over the topic ('t_') categories for the current model
metric_cols = ['Accuracy', 'F1-Score', 'F1-Score_0', 'F1-Score_1']

# Drop a summary row left over from an earlier run of this cell, so that
# re-running it does not stack duplicate 'Avg_t_Columns' rows.
metrics_df = metrics_df[metrics_df['Category'] != 'Avg_t_Columns']

# Select the topic rows once and average all metric columns together.
# astype(float) because the table may carry object dtype from the empty frame
# it was grown from. The means are NaN when no 't_' category was trained.
topic_rows = metrics_df[metrics_df['Category'].str.startswith('t_')]
topic_means = topic_rows[metric_cols].astype(float).mean()
avg_accuracy_t, avg_f1_score_t, avg_f1_score_0_t, avg_f1_score_1_t = topic_means[metric_cols]

# Append average metrics to the DataFrame for the current model
avg_metrics_row = pd.DataFrame([{
    'Category': 'Avg_t_Columns',
    'Accuracy': avg_accuracy_t,
    'F1-Score': avg_f1_score_t,
    'F1-Score_0': avg_f1_score_0_t,
    'F1-Score_1': avg_f1_score_1_t
}])

metrics_df = pd.concat([metrics_df, avg_metrics_row], ignore_index=True)

# to_csv does not create missing folders, so make sure the target exists
metrics_dir = join(PROJECT_DIR, "Results/metrics")
os.makedirs(metrics_dir, exist_ok=True)
metrics_df.to_csv(join(metrics_dir, "XLM-RoBERTa.csv"))

In [None]:
# Display the XLM-RoBERTa metrics table: one row per trained category plus the
# appended 'Avg_t_Columns' summary row.
metrics_df

Save sample prediction:

In [None]:
# Persist the test comments, their label columns and the XLM-RoBERTa
# `pred_<category>` columns to Drive.
sample_pred_dir = join(PROCESSED_DIR, "SamplePrediction")
os.makedirs(sample_pred_dir, exist_ok=True)  # to_csv does not create missing folders
sample_prediciton.to_csv(join(sample_pred_dir, "XLM-RoBERTA_sample_predicted_final.csv"), index=False)

# Prediction

Load data

In [None]:
# Mount Google Drive and (re)define the data paths used by the prediction cells.
# This repeats the setup from the fine-tuning section - presumably so the
# prediction part can be run in a fresh session without the training cells.
drive.mount('/content/drive')
PROJECT_DIR = "/content/drive/MyDrive/Thesis/Data"
PROCESSED_DIR = join(PROJECT_DIR, "Processed")

Mounted at /content/drive


In [None]:
# Read the full prediction table from the processed-data folder
df_predict = pd.read_csv(
    join(PROCESSED_DIR, "data_for_prediction.csv"),
    encoding='latin1',
    low_memory=False,
)

# Keep only rows of the 'ab' dataset whose comment is not symbols-only,
# drop the helper columns, then discard rows without a category.
is_ab_dataset = df_predict['dataset'] == 'ab'
has_text = df_predict['symbols_only'] == 'no'
feedback_for_prediction = (
    df_predict[is_ab_dataset & has_text]
    .drop(columns=['symbols_only', 'dataset', 'buyer'])
    .dropna(subset=['category'])
)

In [None]:
# Preview the first rows of the filtered feedback table that will be classified
feedback_for_prediction.head()

Unnamed: 0,seller,date_left,comment,itemid,price,rating,category,global_id
178101,HumboldtFarms,2016-07-15,"Amazing product, awesome stealth; 5dd; 2 pills...",67532.0,72.0,Positive,Cannabis & Hashish,ao977151
178102,HumboldtFarms,2017-04-10,Fast Shipping!!!,162780.0,45.0,Positive,Cannabis & Hashish,ao972046
178103,HumboldtFarms,2017-01-15,FE'd,30011.0,31.0,Positive,Cannabis & Hashish,ao1004619
178104,HumboldtFarms,2015-11-29,Great quality and a great strain. Came overwei...,50252.0,97.0,Positive,Cannabis & Hashish,ao996416
178105,HumboldtFarms,2016-06-08,"Love these, best way to smoke in public",30009.0,167.0,Positive,Cannabis & Hashish,ao980512


In [None]:
# (rows, columns) of the filtered feedback table, i.e. the number of comments to classify
feedback_for_prediction.shape

(1217944, 8)

Load the fine-tuned DeBERTa models from the Hugging Face Hub

In [None]:
# load models for topic and motives
model_names = [
    't_communication',
    't_payment',
    't_refund',
    't_price',
    't_value',
    't_shipping',
    't_product',
    't_feedback',
    't_vendor',
    't_generic',
    't_overall',
    'emo_une'
]

for model_name in model_names:
    var_name = f"{model_name}_pipeline"
    model_path = f"Gregorig/deberta-v3-base-finetuned-{model_name}"
    globals()[var_name] = pipeline("text-classification", model=model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Classify every comment with each category model

Ensure all models are in evaluation mode and moved to the GPU (falling back to the CPU when no GPU is available)

In [None]:
# Run inference on the GPU when one is available, otherwise on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# List of all pipelines and their corresponding column names
pipelines = [
      (emo_une_pipeline, 'emo_une'),
      (t_communication_pipeline, 't_communication'),
      (t_payment_pipeline, 't_payment'),
      (t_refund_pipeline, 't_refund'),
      (t_price_pipeline, 't_price'),
      (t_value_pipeline, 't_value'),
      (t_shipping_pipeline, 't_shipping'),
      (t_product_pipeline, 't_product'),
      (t_feedback_pipeline, 't_feedback'),
      (t_vendor_pipeline, 't_vendor'),
      (t_generic_pipeline, 't_generic'),
      (t_overall_pipeline, 't_overall')
]

# Move every underlying model to the device and switch it to evaluation mode.
# The loop variable is deliberately not called `pipeline`: that name is the
# transformers factory function imported at the top of the notebook, and
# rebinding it would break any later re-run of the model-loading cell.
for clf_pipeline, _ in pipelines:
    clf_pipeline.model.to(device)
    clf_pipeline.model.eval()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Process all comments for each pipeline


In [None]:
def process_comments_in_batches(pipeline, comments, batch_size, device, desc, max_length=128):
    """Predict a class index for every comment using the model behind `pipeline`.

    The comments are tokenized with the notebook-level `tokenizer` and fed to
    `pipeline.model` batch by batch; the arg-max over the logits is taken as
    the predicted class.

    Args:
        pipeline: Object exposing the classification model as `.model`; the
            model must already live on `device`.
        comments: List of comment strings to classify.
        batch_size: Number of comments per forward pass.
        device: Device the tokenized inputs are moved to (e.g. 'cuda' or 'cpu').
        desc: Label shown on the progress bar.
        max_length: Token limit per comment; longer comments are truncated.
            Defaults to 128, the value that was previously hard-coded.
            NOTE(review): the training cells truncate at the tokenizer default
            instead, so long comments may be cut differently here - confirm.

    Returns:
        List of predicted class indices (plain ints), one per comment and in
        the same order; empty when `comments` is empty.
    """
    results = []
    for start in tqdm(range(0, len(comments), batch_size), desc=desc):
        batch = comments[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True,
                           padding=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Inference only: skip gradient tracking
        with torch.no_grad():
            logits = pipeline.model(**inputs).logits

        results.extend(torch.argmax(logits, dim=1).cpu().tolist())

    return results

# Comments to classify, as a plain list of strings. Missing values are replaced
# by empty strings first because the tokenizer only accepts text input.
feedback_pred_list = feedback_for_prediction['comment'].fillna("").astype(str).tolist()

# Classify all comments with one category model after the other.
# The loop variable is deliberately not called `pipeline`, so the transformers
# `pipeline` factory imported at the top of the notebook is not overwritten.
for clf_pipeline, column_name in pipelines:
    print(f"Starting processing for {column_name} model")
    feedback_for_prediction[column_name] = process_comments_in_batches(
        clf_pipeline,
        feedback_pred_list,
        batch_size=512,
        device=device,
        desc=f"Processing comments for {column_name} model",
    )

    # Save intermediate results by overwriting the existing file, so finished
    # categories are kept on disk if a later model fails
    feedback_for_prediction.to_csv(join(PROCESSED_DIR, "ab_predicted_final.csv"), index=False)
    print(f"Finished processing for {column_name} model and saved results.")


Starting processing for emo_une model


Processing comments for emo_une model:   0%|          | 0/2379 [00:00<?, ?it/s]

Finished processing for emo_une model and saved results.
