# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report





# Functions

### Read Dataset Function

In [2]:
# Read a single Dataset File
def read_dataset(file_path):
    """Load one labelled dataset file and split it into train and test parts.

    The file is read with pandas (``.csv`` or ``.xlsx``, matched
    case-insensitively on the extension), converted to a NumPy array and split
    with ``test_size=0.2`` and a fixed ``random_state`` of 100. In every row the
    last column is taken as the label and all preceding columns as features.

    Args:
        file_path: Path to a ``.csv`` or ``.xlsx`` file.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``. The ``x_*`` values are
        lists of rows (each row a list of ``str``); the ``y_*`` values are
        lists of ``int``.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith('.csv'):
        frame = pd.read_csv(file_path)
    elif lowered_path.endswith('.xlsx'):
        frame = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a .csv or .xlsx file.")

    train_rows, test_rows = train_test_split(np.array(frame), test_size=0.2, random_state=100)

    def _features_and_labels(rows):
        # Everything but the last column is a feature; the last column is the label.
        features = rows[:, :-1].astype(str).tolist()
        labels = rows[:, -1].astype("int32").tolist()
        return features, labels

    x_train, y_train = _features_and_labels(train_rows)
    x_test, y_test = _features_and_labels(test_rows)

    return x_train, x_test, y_train, y_test


# Read Train and Test Datasets that are stored in separate files
def read_train_test_dataset(train_data, test_data):
    """Load a train file and a test file that are already split.

    Each path may independently be a ``.csv`` or an ``.xlsx`` file (extension
    matched case-insensitively); the two files no longer have to share the same
    format. Both extensions are validated before either file is read. In every
    row the last column is taken as the label and all preceding columns as
    features.

    Args:
        train_data: Path to the training file.
        test_data: Path to the test file.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``. The ``x_*`` values are
        lists of rows (each row a list of ``str``); the ``y_*`` values are
        lists of ``int``.

    Raises:
        ValueError: If either path has an extension other than ``.csv`` or
            ``.xlsx``.
    """
    readers = {'.csv': pd.read_csv, '.xlsx': pd.read_excel}

    def _pick_reader(path):
        # Choose the pandas reader from the file extension.
        lowered = path.lower()
        for extension, reader in readers.items():
            if lowered.endswith(extension):
                return reader
        raise ValueError("Unsupported file format. Please provide a .csv or .xlsx file.")

    def _features_and_labels(frame):
        # Everything but the last column is a feature; the last column is the label.
        rows = np.array(frame)
        return rows[:, :-1].astype(str).tolist(), rows[:, -1].astype("int32").tolist()

    # Resolve both readers first so a bad path fails before any file is read.
    train_reader = _pick_reader(train_data)
    test_reader = _pick_reader(test_data)

    x_train, y_train = _features_and_labels(train_reader(train_data))
    x_test, y_test = _features_and_labels(test_reader(test_data))

    return x_train, x_test, y_train, y_test

### Tokenization Function

In [3]:
def tokenizer(train_texts, test_texts):
    """Encode train and test texts with the pre-trained DistilBERT tokenizer.

    Only the first element of each row is tokenized. Both splits are encoded
    with ``truncation=True`` and ``padding=True``, each in its own call.

    Args:
        train_texts: Sequence of rows; ``row[0]`` is the text to encode.
        test_texts: Sequence of rows; ``row[0]`` is the text to encode.

    Returns:
        Tuple ``(train_encodings, test_encodings)`` as returned by the
        tokenizer.
    """
    # Local is named differently from this function so it does not shadow it.
    bert_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    def _encode(rows):
        first_columns = [row[0] for row in rows]
        return bert_tokenizer(first_columns, truncation=True, padding=True)

    return _encode(train_texts), _encode(test_texts)


### Custom Dataset Class


In [4]:
class HumourDataset(Dataset):
    """Dataset pairing tokenizer encodings with their integer labels.

    ``encodings`` must offer ``.items()`` yielding ``(name, values)`` pairs
    where ``values`` can be indexed by sample position; ``labels`` is an
    indexable sequence whose length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels is the number of samples.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with its label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # One-element LongTensor (shape (1,)), not a 0-d scalar.
        sample['labels'] = torch.LongTensor([self.labels[idx]])

        return sample

### Trainer Function

In [19]:
def train_parameters(output_dir, logging_dir, num_labels, train_dataset, test_dataset,
                     num_train_epochs=5, per_device_train_batch_size=8,
                     per_device_eval_batch_size=64, warmup_steps=500,
                     weight_decay=0.01, logging_steps=10,
                     model_name="distilbert-base-uncased"):
    """Build a DistilBERT sequence classifier and a ``Trainer`` for it.

    The hyperparameters that used to be hard-coded are now keyword arguments
    whose defaults are the previous values, so existing five-argument calls
    behave exactly as before.

    Args:
        output_dir: Directory passed to ``TrainingArguments(output_dir=...)``.
        logging_dir: Directory passed to ``TrainingArguments(logging_dir=...)``.
        num_labels: Number of target classes for the classification head.
        train_dataset: Dataset handed to the trainer as ``train_dataset``.
        test_dataset: Dataset handed to the trainer as ``eval_dataset``.
        num_train_epochs: Number of training epochs.
        per_device_train_batch_size: Batch size per device during training.
        per_device_eval_batch_size: Batch size per device during evaluation.
        warmup_steps: Warm-up steps for the learning-rate scheduler.
        weight_decay: Strength of weight decay.
        logging_steps: Log every this many steps.
        model_name: Checkpoint name passed to ``from_pretrained``.

    Returns:
        Tuple ``(model, trainer)``; ``trainer`` wraps that same ``model``.
    """
    # NOTE(review): in the runs recorded in this notebook the progress bars show
    # 192 and 245 total steps, fewer than the default warmup_steps=500, and the
    # logged learning rate is still climbing late in training. The schedule
    # therefore appears never to leave warm-up; consider passing a smaller
    # warmup_steps for datasets of this size.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        logging_dir=logging_dir,
        logging_steps=logging_steps,
    )

    model = DistilBertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )

    trainer = Trainer(
        model=model,                  # the instantiated Transformers model to be trained
        args=training_args,           # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=test_dataset,    # evaluation dataset
    )

    return model, trainer


### Evaluation Function

In [6]:
def evaluate_model(trained_model, custom_x_test):
    """Predict on a dataset and build a text classification report.

    Args:
        trained_model: Object exposing ``evaluate()`` and ``predict(dataset)``
            (the notebook passes a ``Trainer``).
        custom_x_test: Dataset to predict on; its ``labels`` attribute supplies
            the true labels.

    Returns:
        Tuple ``(predictions, report)``: the argmax over axis 1 of the raw
        prediction scores, and the ``classification_report`` output for them.
    """
    # The value returned by evaluate() is not used here.
    trained_model.evaluate()

    # Highest-scoring class per sample.
    prediction_output = trained_model.predict(custom_x_test)
    predicted_classes = prediction_output.predictions.argmax(axis=1)

    report = classification_report(custom_x_test.labels, predicted_classes)

    return predicted_classes, report


### Save Model Results Function

In [7]:
def save_results(true_label, predicted):
    """Turn a classification report into a tidy DataFrame.

    The report dictionary is transposed so that each report entry (one per
    class, plus 'accuracy' and the averages) becomes a row, and values are
    rounded to three decimals. On the 'accuracy' row only 'f1-score' is kept;
    'precision', 'recall' and 'support' are blanked.

    'support' is returned as pandas' nullable ``Int64`` dtype. Previously it was
    cast to ``int`` and a ``None`` was then written into the same column, which
    undid the integer conversion.

    The row index carries the report keys, so keep the index when writing the
    frame to disk if the rows must stay identifiable.

    Args:
        true_label: Ground-truth labels.
        predicted: Predicted labels.

    Returns:
        The formatted report as a ``pandas.DataFrame``.
    """
    report_dict = classification_report(true_label, predicted, output_dict=True)

    # One row per report entry, rounded for readability.
    save_report = pd.DataFrame(report_dict).transpose().round(3)

    # The scalar accuracy is broadcast across the whole row; blank the cells
    # where it is meaningless. Guarded so a report without an 'accuracy' entry
    # does not gain an empty row.
    if 'accuracy' in save_report.index:
        save_report.loc['accuracy', ['precision', 'recall', 'support']] = float('nan')

    # Convert after blanking: the nullable dtype keeps whole-number counts
    # alongside the missing accuracy cell.
    save_report['support'] = save_report['support'].astype('float64').astype('Int64')

    return save_report

# Five Class Classification

In [8]:
# Five-class pipeline: load -> tokenize -> wrap as datasets -> fine-tune
humor_5class_path = "datasets/Humour_style.xlsx"
output_dir = 'DistilBERT_Models/distilBERT_5classes'
logging_dir = 'DistilBERT_Models/distilBERT_logs_5classes'
num_labels = 5

x_train_5, x_test_5, y_train_5, y_test_5 = read_dataset(file_path=humor_5class_path)
train_encodings5, test_encodings5 = tokenizer(train_texts=x_train_5, test_texts=x_test_5)
train_dataset5 = HumourDataset(encodings=train_encodings5, labels=y_train_5)
test_dataset5 = HumourDataset(encodings=test_encodings5, labels=y_test_5)

model, trainer = train_parameters(
    output_dir=output_dir,
    logging_dir=logging_dir,
    num_labels=num_labels,
    train_dataset=train_dataset5,
    test_dataset=test_dataset5,
)
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/192 [00:00<?, ?it/s]

{'loss': 1.6175, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.31}
{'loss': 1.6167, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.62}
{'loss': 1.6015, 'learning_rate': 3e-06, 'epoch': 0.94}
{'loss': 1.596, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.25}
{'loss': 1.5793, 'learning_rate': 5e-06, 'epoch': 1.56}
{'loss': 1.5603, 'learning_rate': 6e-06, 'epoch': 1.88}
{'loss': 1.532, 'learning_rate': 7.000000000000001e-06, 'epoch': 2.19}
{'loss': 1.4593, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.5}
{'loss': 1.3572, 'learning_rate': 9e-06, 'epoch': 2.81}
{'loss': 1.2215, 'learning_rate': 1e-05, 'epoch': 3.12}
{'loss': 1.0806, 'learning_rate': 1.1000000000000001e-05, 'epoch': 3.44}
{'loss': 0.9917, 'learning_rate': 1.2e-05, 'epoch': 3.75}
{'loss': 0.9669, 'learning_rate': 1.3000000000000001e-05, 'epoch': 4.06}
{'loss': 0.7603, 'learning_rate': 1.4000000000000001e-05, 'epoch': 4.38}
{'loss': 0.759, 'learning_rate': 1.5e-05, 'epoch': 4.69}
{'loss': 0.7044, 'learning_

TrainOutput(global_step=192, training_loss=1.1628583334386349, metrics={'train_runtime': 7152.6063, 'train_samples_per_second': 0.847, 'train_steps_per_second': 0.027, 'train_loss': 1.1628583334386349, 'epoch': 6.0})

In [9]:
# Save and evaluate the five-class model.
# The model is written to disk first; evaluate_model then returns the predicted
# class ids for test_dataset5 together with a text classification report.
model.save_pretrained('DistilBERT_Models/SavedModel_5classes/')
predictions5, result5 = evaluate_model(trainer, test_dataset5) 
print(result5)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.73      0.97      0.83        59
           1       0.87      0.71      0.78        48
           2       0.67      0.32      0.43        44
           3       0.66      0.80      0.73        46
           4       0.95      1.00      0.97        56

    accuracy                           0.78       253
   macro avg       0.78      0.76      0.75       253
weighted avg       0.78      0.78      0.76       253



In [10]:
# Save Report
distilBERT_result_5 = save_results(y_test_5, predictions5)
# Keep the index: it holds the class labels and the 'accuracy' / 'macro avg' /
# 'weighted avg' row names, without which the rows of the CSV are unlabelled.
distilBERT_result_5.to_csv('models_results/distilBERT_5classes.csv', index_label='class')

# Four Class Classification

In [12]:
# Four-class pipeline: load -> tokenize -> wrap as datasets -> fine-tune
humor_4class_path = "datasets/Humour_style_4classes.xlsx"
output_dir4 = 'DistilBERT_Models/distilBERT_4classes'
logging_dir4 = 'DistilBERT_Models/distilBERT_logs_4classes'
num_labels4 = 4

x_train_4, x_test_4, y_train_4, y_test_4 = read_dataset(file_path=humor_4class_path)
train_encodings4, test_encodings4 = tokenizer(train_texts=x_train_4, test_texts=x_test_4)
train_dataset4 = HumourDataset(encodings=train_encodings4, labels=y_train_4)
test_dataset4 = HumourDataset(encodings=test_encodings4, labels=y_test_4)

model4, trainer4 = train_parameters(
    output_dir=output_dir4,
    logging_dir=logging_dir4,
    num_labels=num_labels4,
    train_dataset=train_dataset4,
    test_dataset=test_dataset4,
)
trainer4.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/192 [00:00<?, ?it/s]

{'loss': 1.4054, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.31}
{'loss': 1.3979, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.62}
{'loss': 1.3829, 'learning_rate': 3e-06, 'epoch': 0.94}
{'loss': 1.3568, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.25}
{'loss': 1.3377, 'learning_rate': 5e-06, 'epoch': 1.56}
{'loss': 1.2886, 'learning_rate': 6e-06, 'epoch': 1.88}
{'loss': 1.2337, 'learning_rate': 7.000000000000001e-06, 'epoch': 2.19}
{'loss': 1.1299, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.5}
{'loss': 0.965, 'learning_rate': 9e-06, 'epoch': 2.81}
{'loss': 0.8672, 'learning_rate': 1e-05, 'epoch': 3.12}
{'loss': 0.7191, 'learning_rate': 1.1000000000000001e-05, 'epoch': 3.44}
{'loss': 0.6951, 'learning_rate': 1.2e-05, 'epoch': 3.75}
{'loss': 0.6336, 'learning_rate': 1.3000000000000001e-05, 'epoch': 4.06}
{'loss': 0.4676, 'learning_rate': 1.4000000000000001e-05, 'epoch': 4.38}
{'loss': 0.5455, 'learning_rate': 1.5e-05, 'epoch': 4.69}
{'loss': 0.4331, 'learnin

TrainOutput(global_step=192, training_loss=0.8859955507020155, metrics={'train_runtime': 5829.8027, 'train_samples_per_second': 1.039, 'train_steps_per_second': 0.033, 'train_loss': 0.8859955507020155, 'epoch': 6.0})

In [13]:
# Save and evaluate the four-class model.
# The model is written to disk first; evaluate_model then returns the predicted
# class ids for test_dataset4 together with a text classification report.
model4.save_pretrained('DistilBERT_Models/SavedModel_4classes/')
predictions4, result4 = evaluate_model(trainer4, test_dataset4) 
print(result4)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.85      0.85      0.85        59
           1       0.86      0.67      0.75        48
           2       0.82      0.90      0.86        90
           3       0.97      1.00      0.98        56

    accuracy                           0.87       253
   macro avg       0.87      0.85      0.86       253
weighted avg       0.87      0.87      0.86       253



In [14]:
# Save Report
distilBERT_result_4 = save_results(y_test_4, predictions4)
# Keep the index: it holds the class labels and the 'accuracy' / 'macro avg' /
# 'weighted avg' row names, without which the rows of the CSV are unlabelled.
distilBERT_result_4.to_csv('models_results/distilBERT_4classes.csv', index_label='class')

# Two Class Classification

In [20]:
# Two-class pipeline: the train and test splits come from separate files
train_2class_path = "datasets/af_ag_train.xlsx"
test_2class_path = "datasets/af_ag_test.xlsx"
output_dir2 = 'DistilBERT_Models/distilBERT_2classes'
logging_dir2 = 'DistilBERT_Models/distilBERT_logs_2classes'
num_labels2 = 2

x_train_2, x_test_2, y_train_2, y_test_2 = read_train_test_dataset(
    train_data=train_2class_path,
    test_data=test_2class_path,
)

train_encodings2, test_encodings2 = tokenizer(train_texts=x_train_2, test_texts=x_test_2)
train_dataset2 = HumourDataset(encodings=train_encodings2, labels=y_train_2)
test_dataset2 = HumourDataset(encodings=test_encodings2, labels=y_test_2)

model2, trainer2 = train_parameters(
    output_dir=output_dir2,
    logging_dir=logging_dir2,
    num_labels=num_labels2,
    train_dataset=train_dataset2,
    test_dataset=test_dataset2,
)
trainer2.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/245 [00:00<?, ?it/s]

{'loss': 0.719, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.2}
{'loss': 0.7139, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.41}
{'loss': 0.7177, 'learning_rate': 3e-06, 'epoch': 0.61}
{'loss': 0.7009, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.82}
{'loss': 0.6845, 'learning_rate': 5e-06, 'epoch': 1.02}
{'loss': 0.6695, 'learning_rate': 6e-06, 'epoch': 1.22}
{'loss': 0.6688, 'learning_rate': 7.000000000000001e-06, 'epoch': 1.43}
{'loss': 0.6387, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.63}
{'loss': 0.6104, 'learning_rate': 9e-06, 'epoch': 1.84}
{'loss': 0.5901, 'learning_rate': 1e-05, 'epoch': 2.04}
{'loss': 0.5448, 'learning_rate': 1.1000000000000001e-05, 'epoch': 2.24}
{'loss': 0.5418, 'learning_rate': 1.2e-05, 'epoch': 2.45}
{'loss': 0.5187, 'learning_rate': 1.3000000000000001e-05, 'epoch': 2.65}
{'loss': 0.4754, 'learning_rate': 1.4000000000000001e-05, 'epoch': 2.86}
{'loss': 0.4006, 'learning_rate': 1.5e-05, 'epoch': 3.06}
{'loss': 0.3862, 'learnin

TrainOutput(global_step=245, training_loss=0.4612898773076583, metrics={'train_runtime': 6441.1833, 'train_samples_per_second': 0.299, 'train_steps_per_second': 0.038, 'train_loss': 0.4612898773076583, 'epoch': 5.0})

In [21]:
# Save and evaluate the two-class model.
# The model is written to disk first; evaluate_model then returns the predicted
# class ids for test_dataset2 together with a text classification report.
model2.save_pretrained('DistilBERT_Models/SavedModel_2classes/')
predictions2, result2 = evaluate_model(trainer2, test_dataset2) 
print(result2)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.93      0.64      0.76        44
           1       0.73      0.96      0.83        46

    accuracy                           0.80        90
   macro avg       0.83      0.80      0.79        90
weighted avg       0.83      0.80      0.79        90



In [22]:
# Save Report
distilBERT_result_2 = save_results(y_test_2, predictions2)
# Keep the index: it holds the class labels and the 'accuracy' / 'macro avg' /
# 'weighted avg' row names, without which the rows of the CSV are unlabelled.
distilBERT_result_2.to_csv('models_results/distilBERT_2classes.csv', index_label='class')