# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


import torch
from torch.utils.data import DataLoader, Dataset

from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments




# Functions

### Read Dataset Function

In [2]:
# Read a single Dataset File
def read_dataset(file_path):
    if file_path.lower().endswith('.csv'):
        dataset = pd.read_csv(file_path)
    elif file_path.lower().endswith('.xlsx'):
        dataset = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a .csv or .xlsx file.")
    
    dataset   = np.array(dataset)
    data_train, data_test     = train_test_split(dataset, test_size=0.2, random_state=100)

    x_train, y_train   = (data_train[:,:-1]).astype(str).tolist(), (data_train[:,-1]).astype("int32").tolist()
    x_test, y_test     = (data_test[:,:-1]).astype(str).tolist(), (data_test[:,-1]).astype("int32").tolist()           
    #x_train, x_test    = x_train.squeeze(), x_test.squeeze()

    return x_train, x_test, y_train, y_test


# If you have Train and Test Datasets separate
def read_train_test_dataset(train_data, test_data):
    if train_data.lower().endswith('.csv') and test_data.lower().endswith('.csv'):
        train_data = pd.read_csv(train_data)
        test_data = pd.read_csv(test_data)
    elif train_data.lower().endswith('.xlsx') and test_data.lower().endswith('.xlsx'):
        train_data = pd.read_excel(train_data)
        test_data = pd.read_excel(test_data)
    else:
        raise ValueError("Unsupported file format. Please provide a .csv or .xlsx file.")
    
    train_data, test_data   = np.array(train_data), np.array(test_data)

    x_train, y_train   = (train_data[:,:-1]).astype(str).tolist(), (train_data[:,-1]).astype("int32").tolist()
    x_test, y_test     = (test_data[:,:-1]).astype(str).tolist(), (test_data[:,-1]).astype("int32").tolist()           
    #x_train, x_test    = x_train.squeeze(), x_test.squeeze()

    return x_train, x_test, y_train, y_test

### Tokenization Function

In [3]:
def tokenizer(train_texts, test_texts):
    #Using DistilBert Pre-trained Model
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    train_encodings = tokenizer([text[0] for text in train_texts], truncation=True, padding=True)
    test_encodings  = tokenizer([text[0] for text in test_texts], truncation=True, padding=True)

    return train_encodings, test_encodings

def test_tokenizer(testdata):
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    test_encodings = tokenizer([text[0] for text in testdata], truncation=True, padding=True)
    return test_encodings


### Custom Dataset Function


In [4]:
class HumourDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.LongTensor([self.labels[idx]])  # Convert to LongTensor

        return item  

### Trainer Function

In [5]:
def train_parameters(output_dir, logging_dir, num_labels, train_dataset, eval_dataset):
    #Check if GPU is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Training on: {device}")
    
    training_args = TrainingArguments(
        output_dir=output_dir,                # output directory
        num_train_epochs=5,              
        per_device_train_batch_size=8,        # batch size per device during training
        per_device_eval_batch_size=64,        # batch size for evaluation
        warmup_steps=500,                     # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                    # strength of weight decay
        logging_dir=logging_dir,              # directory for storing logs
        logging_steps=10,
    )

    model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
    )

    trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=eval_dataset            # evaluation dataset
    )

    return model, trainer


### Evaluation Function

In [6]:
def evaluate_model(trained_model, custom_x_test):
    
    # Make predictions on the validation dataset
    predictions = trained_model.predict(custom_x_test)
    predictions = predictions.predictions.argmax(axis=1)

    # Extract labels from the validation dataset
    labels = custom_x_test.labels
    report = classification_report(labels, predictions)

    return predictions, report, labels 


### Cross-Validation Function

In [7]:
def cross_validate(model_name, logging_dir, num_labels, x_data, y_data, n_splits=5):
    # Initialization and KFold Setup
    kf = KFold(n_splits=n_splits)
    fold = 0
    all_predictions = []
    all_labels = []
    all_reports = []

    # Splitting the Data
    for train_index, test_index in kf.split(x_data):
        fold += 1
        print(f"Training fold {fold}/{n_splits}")

        x_train, x_test = x_data[train_index], x_data[test_index]
        y_train, y_test = y_data[train_index], y_data[test_index]

        # Tokenizing the Data
        train_encodings, test_encodings = tokenizer(x_train, x_test)

        # Creating Pytorch Datasets
        train_dataset = HumourDataset(train_encodings, y_train)
        test_dataset = HumourDataset(test_encodings, y_test)

        # Training the Model
        output_dir = f"{model_name}_fold_{fold}"
        model, trainer = train_parameters(output_dir, logging_dir, num_labels, train_dataset, test_dataset)
        trainer.train()

        #Evaluating the Model
        predictions, report, labels = evaluate_model(trainer, test_dataset)
        all_predictions.extend(predictions)
        all_labels.extend(y_test)
        all_reports.append(report)
    
    return all_predictions, all_labels, all_reports


### Save Model results Function

In [8]:
def save_results(true_label, predicted):
    report_dict = classification_report(true_label,predicted,output_dict=True)

    # Save Result Report
    save_report = pd.DataFrame(report_dict).transpose()  # Convert the report dictionary to a DataFrame
    save_report = save_report.round(3)                   # Round the values to a specific number of decimal places
    save_report = save_report.astype({'support': int})   # Convert the 'support' column to integers
    save_report.loc['accuracy', ['precision', 'recall', 'support']] = [None, None, None] # Set the accuracy row to None

    return save_report

### Save the Trained Model

In [9]:
def save_trained_model(model):
    return torch.save(model.state_dict(), 'model.pth')

### Affiliative and Aggressive Dataset Seperation

In [10]:
def aff_agg_data_seperation(x_train, x_test, y_train, y_test):
    # get Train and test set for aff agg seperation
    x_train    = np.array(x_train);  y_train = np.array(y_train)
    d2_y_data  = np.expand_dims(y_train,axis=1) #add dimension  # np.squeeze() reduce dimension
    train_1463 = np.concatenate((x_train,d2_y_data), axis=1)

    x_test     = np.array(x_test);  y_test = np.array(y_test)
    d2_y_test  = np.expand_dims(y_test,axis=1) #add dimension  # np.squeeze() reduce dimension
    test_1463 = np.concatenate((x_test,d2_y_test), axis=1)

    train_1463 = pd.DataFrame(train_1463, columns=(["jokes", "labels"]))
    test_1463  = pd.DataFrame(test_1463, columns=(["jokes", "labels"]))

    return train_1463, test_1463

def select_wanted_labels(train_data, test_data):
    train = []
    test  = []
    train_data = pd.read_csv(train_data)
    test_data  = pd.read_csv(test_data)
    train_data = np.array(train_data);  test_data = np.array(test_data)

    for example in train_data:
        if int(example[-1]) ==2 or int(example[-1]) ==3:
            train.append(example)    

    for example in test_data:
        if int(example[-1]) ==2 or int(example[-1]) ==3:
            test.append(example)    
    return train,test

# Example Usage
train_path = "datasets/train_1463.csv"
test_path  = "datasets/test_1463.csv"

training, testing  = select_wanted_labels(train_path, test_path)
train_1463 = pd.DataFrame(training, columns=(["jokes", "labels"]))
test_1463  = pd.DataFrame(testing, columns=(["jokes", "labels"]))

# Replace all occurrences of 2 with 0 in 'labels'
train_1463['labels'] = train_1463['labels'].replace(2, 0); train_1463['labels'] = train_1463['labels'].replace(3, 1)
test_1463['labels']  = test_1463['labels'].replace(2, 0);  test_1463['labels']  = test_1463['labels'].replace(3, 1)
train_1463.to_csv("datasets/af_ag_train_1463.csv", index=False)
test_1463.to_csv("datasets/af_ag_test_1463.csv", index=False)

# Five Class Classification

#### Cross-validation Usage

In [11]:
#Cross-Validation
humor_5class_path = "datasets/Humour_style.xlsx"
x_train_5, x_test_5, y_train_5, y_test_5 = read_dataset(humor_5class_path)

x_data = np.array(x_train_5)
y_data = np.array(y_train_5)

num_labels = 5
len(x_test_5)

293

In [13]:
from sklearn.metrics import confusion_matrix
#cross_validation_conMatrix = confusion_matrix(labels, predictions)
#cross_validation_conMatrix

#### Final Model Training

In [14]:
# Train final model on the entire dataset and evaluate on the test dataset
train_encodings5, test_encodings5 = tokenizer(x_train_5, x_test_5)
train_dataset5 = HumourDataset(train_encodings5, y_train_5)
test_dataset5  = HumourDataset(test_encodings5, y_test_5)  
output_dir  = 'DistilBERT_Models/distilBERT_5final'
logging_dir = 'DistilBERT_Models/distilBERT_logs_5final'
model, trainer = train_parameters(output_dir, logging_dir, num_labels, train_dataset5, test_dataset5)
trainer.train()  

Training on: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/735 [00:00<?, ?it/s]

{'loss': 1.6293, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 1.624, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 1.6243, 'learning_rate': 3e-06, 'epoch': 0.2}
{'loss': 1.6074, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.27}
{'loss': 1.595, 'learning_rate': 5e-06, 'epoch': 0.34}
{'loss': 1.5778, 'learning_rate': 6e-06, 'epoch': 0.41}
{'loss': 1.5527, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.48}
{'loss': 1.5259, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.54}
{'loss': 1.5047, 'learning_rate': 9e-06, 'epoch': 0.61}
{'loss': 1.4584, 'learning_rate': 1e-05, 'epoch': 0.68}
{'loss': 1.3555, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.75}
{'loss': 1.3375, 'learning_rate': 1.2e-05, 'epoch': 0.82}
{'loss': 1.1792, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.88}
{'loss': 1.1809, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.95}
{'loss': 1.0963, 'learning_rate': 1.5e-05, 'epoch': 1.02}
{'loss': 0.9722, 'learning

Checkpoint destination directory DistilBERT_Models/distilBERT_5final\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.292, 'learning_rate': 5e-05, 'epoch': 3.4}
{'loss': 0.3192, 'learning_rate': 4.787234042553192e-05, 'epoch': 3.47}
{'loss': 0.5123, 'learning_rate': 4.574468085106383e-05, 'epoch': 3.54}
{'loss': 0.4286, 'learning_rate': 4.3617021276595746e-05, 'epoch': 3.61}
{'loss': 0.4066, 'learning_rate': 4.148936170212766e-05, 'epoch': 3.67}
{'loss': 0.397, 'learning_rate': 3.936170212765958e-05, 'epoch': 3.74}
{'loss': 0.3332, 'learning_rate': 3.723404255319149e-05, 'epoch': 3.81}
{'loss': 0.2339, 'learning_rate': 3.5106382978723407e-05, 'epoch': 3.88}
{'loss': 0.2273, 'learning_rate': 3.2978723404255317e-05, 'epoch': 3.95}
{'loss': 0.226, 'learning_rate': 3.085106382978723e-05, 'epoch': 4.01}
{'loss': 0.0606, 'learning_rate': 2.8723404255319154e-05, 'epoch': 4.08}
{'loss': 0.2074, 'learning_rate': 2.6595744680851064e-05, 'epoch': 4.15}
{'loss': 0.1014, 'learning_rate': 2.446808510638298e-05, 'epoch': 4.22}
{'loss': 0.1397, 'learning_rate': 2.2340425531914894e-05, 'epoch': 4.29}
{'loss

TrainOutput(global_step=735, training_loss=0.6898482188176946, metrics={'train_runtime': 155.8926, 'train_samples_per_second': 37.526, 'train_steps_per_second': 4.715, 'train_loss': 0.6898482188176946, 'epoch': 5.0})

#### Final model Evaluation

In [15]:
# Evaluate the final model on the Test dataset
predictions5, result5, labels5 = evaluate_model(trainer, test_dataset5) 
print(result5)

# Save Model 
save_trained_model(model)
model.save_pretrained('DistilBERT_Models/SavedModel_5classes/')

  0%|          | 0/5 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.75      0.92      0.83        60
           1       0.74      0.78      0.76        45
           2       0.68      0.65      0.66        62
           3       0.76      0.71      0.73        58
           4       0.93      0.82      0.87        68

    accuracy                           0.77       293
   macro avg       0.77      0.77      0.77       293
weighted avg       0.78      0.77      0.77       293



In [40]:
new_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
new_tokenizer.save_pretrained('DistilBERT_Models/Tokenizer_5classes/')


('DistilBERT_Models/Tokenizer_5classes/tokenizer_config.json',
 'DistilBERT_Models/Tokenizer_5classes/special_tokens_map.json',
 'DistilBERT_Models/Tokenizer_5classes/vocab.txt',
 'DistilBERT_Models/Tokenizer_5classes/added_tokens.json',
 'DistilBERT_Models/Tokenizer_5classes/tokenizer.json')

In [16]:
test_conMatrix = confusion_matrix(labels5, predictions5)
test_conMatrix

array([[55,  4,  1,  0,  0],
       [ 5, 35,  0,  4,  1],
       [10,  5, 40,  6,  1],
       [ 3,  2, 10, 41,  2],
       [ 0,  1,  8,  3, 56]], dtype=int64)

In [17]:
# Save Report and write to CSV
distilBERT_result_5= save_results(y_test_5,predictions5)
distilBERT_result_5.to_csv('models_results/distilBERT_5classes.csv', index=False)

# Four Class Classification

#### Cross validation Usage

In [18]:
# Example Usage for Cross-Validation
humor_4class_path = "datasets/Humour_style_4classes.xlsx" 
x_train_4, x_test_4, y_train_4, y_test_4 = read_dataset(humor_4class_path)

x_data4 = np.array(x_train_4)
y_data4 = np.array(y_train_4)

num_labels4= 4

In [19]:
#predictions4, labels4, reports4 = cross_validate('DistilBERT_Models/distilBERT_4classes', 'DistilBERT_Models/distilBERT_logs_4classes', num_labels4, x_data4, y_data4)

# Save the cross-validation results
#final_report4 = save_results(labels4, predictions4)
#final_report4.to_csv('cross_validation_results/distilBERT_4classes.csv', index=True)
#print(final_report4) 

#### Final Training and Evaluation

In [20]:
# Example Usage
train_encodings4, test_encodings4 = tokenizer(x_train_4, x_test_4)
train_dataset4 = HumourDataset(train_encodings4, y_train_4)
test_dataset4  = HumourDataset(test_encodings4, y_test_4)  
output_dir4  = 'DistilBERT_Models/distilBERT_4classes'
logging_dir4 = 'DistilBERT_Models/distilBERT_logs_4classes'
num_labels4= 4
model4, trainer4 = train_parameters(output_dir4, logging_dir4, num_labels4, train_dataset4, test_dataset4)
trainer4.train()  

Training on: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/735 [00:00<?, ?it/s]

{'loss': 1.4016, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 1.377, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 1.4026, 'learning_rate': 3e-06, 'epoch': 0.2}
{'loss': 1.391, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.27}
{'loss': 1.345, 'learning_rate': 5e-06, 'epoch': 0.34}
{'loss': 1.3434, 'learning_rate': 6e-06, 'epoch': 0.41}
{'loss': 1.3274, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.48}
{'loss': 1.3083, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.54}
{'loss': 1.2608, 'learning_rate': 9e-06, 'epoch': 0.61}
{'loss': 1.1827, 'learning_rate': 1e-05, 'epoch': 0.68}
{'loss': 1.0956, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.75}
{'loss': 1.0676, 'learning_rate': 1.2e-05, 'epoch': 0.82}
{'loss': 0.9518, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.88}
{'loss': 0.8785, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.95}
{'loss': 0.8851, 'learning_rate': 1.5e-05, 'epoch': 1.02}
{'loss': 0.7282, 'learning_

Checkpoint destination directory DistilBERT_Models/distilBERT_4classes\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.1334, 'learning_rate': 5e-05, 'epoch': 3.4}
{'loss': 0.2118, 'learning_rate': 4.787234042553192e-05, 'epoch': 3.47}
{'loss': 0.4177, 'learning_rate': 4.574468085106383e-05, 'epoch': 3.54}
{'loss': 0.4331, 'learning_rate': 4.3617021276595746e-05, 'epoch': 3.61}
{'loss': 0.4618, 'learning_rate': 4.148936170212766e-05, 'epoch': 3.67}
{'loss': 0.2501, 'learning_rate': 3.936170212765958e-05, 'epoch': 3.74}
{'loss': 0.3192, 'learning_rate': 3.723404255319149e-05, 'epoch': 3.81}
{'loss': 0.1795, 'learning_rate': 3.5106382978723407e-05, 'epoch': 3.88}
{'loss': 0.2005, 'learning_rate': 3.2978723404255317e-05, 'epoch': 3.95}
{'loss': 0.2668, 'learning_rate': 3.085106382978723e-05, 'epoch': 4.01}
{'loss': 0.0646, 'learning_rate': 2.8723404255319154e-05, 'epoch': 4.08}
{'loss': 0.1874, 'learning_rate': 2.6595744680851064e-05, 'epoch': 4.15}
{'loss': 0.1186, 'learning_rate': 2.446808510638298e-05, 'epoch': 4.22}
{'loss': 0.1348, 'learning_rate': 2.2340425531914894e-05, 'epoch': 4.29}
{'l

TrainOutput(global_step=735, training_loss=0.547748986753274, metrics={'train_runtime': 154.6499, 'train_samples_per_second': 37.827, 'train_steps_per_second': 4.753, 'train_loss': 0.547748986753274, 'epoch': 5.0})

In [21]:
# Save and Evaluate Model 
predictions4, result4, labels4 = evaluate_model(trainer4, test_dataset4) 

model4.save_pretrained('DistilBERT_Models/SavedModel_4finalmodel/')
print(result4)

# Save Report
distilBERT_result_4= save_results(y_test_4,predictions4)

# Confusion matrix
test_conMatrix4 = confusion_matrix(labels4, predictions4)
test_conMatrix4

  0%|          | 0/5 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.74      0.88      0.80        60
           1       0.84      0.69      0.76        45
           2       0.84      0.82      0.83       120
           3       0.89      0.87      0.88        68

    accuracy                           0.83       293
   macro avg       0.83      0.82      0.82       293
weighted avg       0.83      0.83      0.83       293



array([[53,  3,  4,  0],
       [ 6, 31,  7,  1],
       [13,  2, 99,  6],
       [ 0,  1,  8, 59]], dtype=int64)

In [22]:
# Save Report to CSV
distilBERT_result_4.to_csv('models_results/distilBERT_4classes.csv', index=True)

# Two Class Classification

#### Cross validation Usage

In [23]:
# Example Usage for Cross-Validation
# train_2class_path = "datasets/af_ag_train.xlsx" ; test_2class_path  = "datasets/af_ag_test.xlsx" 
train_2class_path = "datasets/af_ag_train_1463.csv" ; test_2class_path  = "datasets/af_ag_test_1463.csv" 
x_train_2, x_test_2, y_train_2, y_test_2 = read_train_test_dataset(train_2class_path, test_2class_path)

x_data2 = np.array(x_train_2)
y_data2 = np.array(y_train_2)

num_labels2= 2

In [24]:
#predictions2, labels2, reports2 = cross_validate('DistilBERT_Models/distilBERT_2classes', 'DistilBERT_Models/distilBERT_logs_2classes', num_labels2, x_data2, y_data2)

# Save the cross-validation results
#final_report2 = save_results(labels2, predictions2)
#final_report2.to_csv('cross_validation_results/distilBERT_2classes.csv', index=True)
#print(final_report2) 

#### Final Training of two class model

In [25]:
# Example Usage
train_encodings2, test_encodings2 = tokenizer(x_train_2, x_test_2)
train_dataset2 = HumourDataset(train_encodings2, y_train_2)
test_dataset2  = HumourDataset(test_encodings2, y_test_2)  
output_dir2  = 'DistilBERT_Models/distilBERT_2classes'
logging_dir2 = 'DistilBERT_Models/distilBERT_logs_2classes'
model2, trainer2 = train_parameters(output_dir2, logging_dir2, num_labels2, train_dataset2, test_dataset2)
trainer2.train()  

Training on: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/280 [00:00<?, ?it/s]

{'loss': 0.6918, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.18}
{'loss': 0.6861, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.36}
{'loss': 0.69, 'learning_rate': 3e-06, 'epoch': 0.54}
{'loss': 0.6879, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.71}
{'loss': 0.6937, 'learning_rate': 5e-06, 'epoch': 0.89}
{'loss': 0.6654, 'learning_rate': 6e-06, 'epoch': 1.07}
{'loss': 0.6704, 'learning_rate': 7.000000000000001e-06, 'epoch': 1.25}
{'loss': 0.6641, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.43}
{'loss': 0.6274, 'learning_rate': 9e-06, 'epoch': 1.61}
{'loss': 0.6478, 'learning_rate': 1e-05, 'epoch': 1.79}
{'loss': 0.6395, 'learning_rate': 1.1000000000000001e-05, 'epoch': 1.96}
{'loss': 0.5688, 'learning_rate': 1.2e-05, 'epoch': 2.14}
{'loss': 0.539, 'learning_rate': 1.3000000000000001e-05, 'epoch': 2.32}
{'loss': 0.4938, 'learning_rate': 1.4000000000000001e-05, 'epoch': 2.5}
{'loss': 0.4454, 'learning_rate': 1.5e-05, 'epoch': 2.68}
{'loss': 0.3763, 'learning_

TrainOutput(global_step=280, training_loss=0.46031890639236994, metrics={'train_runtime': 58.3511, 'train_samples_per_second': 38.388, 'train_steps_per_second': 4.799, 'train_loss': 0.46031890639236994, 'epoch': 5.0})

In [26]:
# Save and Evaluate Model 
model2.save_pretrained('DistilBERT_Models/SavedModel_2classes/')
predictions2, result2, labels2 = evaluate_model(trainer2, test_dataset2) 
print(result2)

  0%|          | 0/2 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.80      0.79      0.80        62
           1       0.78      0.79      0.79        58

    accuracy                           0.79       120
   macro avg       0.79      0.79      0.79       120
weighted avg       0.79      0.79      0.79       120



In [27]:
# Save Report
distilBERT_result_2= save_results(y_test_2,predictions2)
distilBERT_result_2.to_csv('models_results/distilBERT_2classes.csv', index=True)

# Individual Predictions

In [30]:
import torch
from transformers import DistilBertTokenizerFast
new_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_prediction(text):
    encoding = new_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    encoding = {k: v.to(device) for k,v in encoding.items()}  # Ensure tensors are on the correct device

    # Forward pass through the model
    outputs = model(**encoding)

    # Get the logits from the output
    logits = outputs.logits

    # Apply Softmax to get probabilities
    softmax = torch.nn.Softmax(dim=0)  # Assuming the logits are along dim=0 (num_labels)
    probs = softmax(logits.squeeze())

    # Convert probabilities to numpy array
    probs = probs.detach().cpu().numpy()

    # Get the label with highest probability
    label = np.argmax(probs)

    # Define the style based on the label
    if label == 0:
        return {
            'style': 'Self-enhancing',
            'probability': probs[0]
        }
    elif label == 1:
        return {
            'style': 'Self-deprecating',
            'probability': probs[1]
        }
    elif label == 2 and probs[2] > 0.65:
        return {
            'style': 'Affiliative',
            'probability': probs[2]
        }
    elif label == 2 and probs[2] <= 0.65:
        return {
            'style': 'Aggressive',
            'probability': probs[2]
        }
    elif label == 3:
        return {
            'style': 'Aggressive',
            'probability': probs[3]
        }
    else:
        return {
            'style': 'Neutral',
            'probability': probs[4]
        }

In [38]:
# Example usage:
print(get_prediction("Q: Why do Jewish men get circumcised? A: Because Jewish women won't touch anything unless it's 20 percent off."))
print(get_prediction("Q: Why are all black people fast? A: The slow ones are in jail."))
print(get_prediction("How can you tell a black guy has been on your computer? It's not there."))
print(get_prediction("Q: Why can't Mexicans play Uno? A: They always steal the green cards."))
print(get_prediction("Hitler calls a meeting of his best soldiers and commanders and tells them ,Alright I want to order the assassination of one thousand jews and four hedgehogs.Then one of his generals stands and says, But... Mein furhur why four hedgehogs? Hitler then smiles and says See? No one gives a f*ck about the jews."))
print(get_prediction("Mary is Fat"))

{'style': 'Aggressive', 'probability': 0.7611798}
{'style': 'Aggressive', 'probability': 0.9223928}
{'style': 'Aggressive', 'probability': 0.7374662}
{'style': 'Aggressive', 'probability': 0.56487465}
{'style': 'Affiliative', 'probability': 0.97560364}
{'style': 'Neutral', 'probability': 0.99481237}


# Two Model pipeline

In [32]:
import torch
from transformers import DistilBertTokenizerFast
new_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_prediction_2(text):
    encoding = new_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    encoding = {k: v.to(device) for k,v in encoding.items()}  # Ensure tensors are on the correct device

    # Forward pass through the model
    outputs = model2(**encoding)

    # Get the logits from the output
    logits = outputs.logits

    # Apply Softmax to get probabilities
    softmax = torch.nn.Softmax(dim=0)  # Assuming the logits are along dim=0 (num_labels)
    probs = softmax(logits.squeeze())

    # Convert probabilities to numpy array
    probs = probs.detach().cpu().numpy()

    # Get the label with highest probability
    label = np.argmax(probs)

    return label

In [33]:
print(type(predictions4))
model4_pred = predictions4
aff_agg_dataset = []
agg_agg_label = []

# Identify instances classified as affiliative/aggressive
aff_agg_mask = (model4_pred == 2)

for i in range(len(predictions4)):
    if predictions4[i] == 2:
        aff_agg_dataset.append(x_test_4[i])
        agg_agg_label.append(predictions4[i])

#for i in aff_agg_dataset:
#    i = str(i)
#    print(type(i))

<class 'numpy.ndarray'>


In [34]:
two_model_predictions = []
for i in aff_agg_dataset:
    i = str(i)
    two_model_predictions.append(get_prediction_2(i))

# Combine results
final_pred = [4 if pred == 3 else pred for pred in model4_pred]
model2_pred = [2 if p == 0 else 3 for p in two_model_predictions]

# Update final_pred with model2_pred results
pred2_index = 0
for i, mask in enumerate(aff_agg_mask):
    if mask:
        final_pred[i] = model2_pred[pred2_index]
        pred2_index += 1

In [35]:
print(two_model_predictions)
print(final_pred)

[0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1]
[2, 0, 1, 3, 2, 0, 4, 0, 2, 1, 1, 4, 4, 0, 1, 1, 1, 2, 0, 0, 0, 2, 4, 3, 2, 2, 0, 1, 3, 2, 4, 4, 4, 2, 0, 0, 0, 3, 4, 0, 1, 4, 1, 0, 2, 0, 4, 4, 3, 2, 2, 0, 0, 2, 2, 1, 0, 0, 4, 2, 4, 3, 3, 3, 2, 2, 4, 2, 0, 0, 0, 0, 4, 1, 0, 2, 1, 2, 1, 4, 4, 1, 1, 1, 3, 0, 1, 4, 3, 1, 1, 1, 2, 3, 0, 0, 2, 4, 0, 0, 2, 0, 2, 4, 4, 1, 3, 0, 0, 4, 2, 0, 2, 3, 2, 0, 3, 0, 4, 4, 0, 2, 4, 4, 3, 1, 2, 2, 2, 0, 3, 1, 0, 3, 4, 4, 2, 4, 3, 4, 2, 3, 4, 2, 1, 4, 0, 0, 2, 0, 4, 0, 1, 4, 4, 1, 2, 0, 1, 2, 0, 1, 4, 0, 1, 3, 4, 4, 3, 0, 3, 0, 3, 2, 2, 4, 2, 4, 0, 0, 3, 4, 2, 3, 3, 4, 2, 1, 2, 2, 0, 4, 4, 3, 2, 3, 0, 2, 0, 4, 0, 2, 0, 0, 0, 2, 4, 3, 1, 4, 3, 4, 2, 0, 2,

In [36]:
two_model_result= save_results(y_test_5,final_pred)

print(type(aff_agg_dataset))
print(type(x_test_2))
print(type(x_test_4))
print(two_model_result)

<class 'list'>
<class 'list'>
<class 'list'>
              precision  recall  f1-score  support
0                 0.736   0.883     0.803     60.0
1                 0.838   0.689     0.756     45.0
2                 0.569   0.661     0.612     62.0
3                 0.804   0.638     0.712     58.0
4                 0.894   0.868     0.881     68.0
accuracy            NaN     NaN     0.754      NaN
macro avg         0.768   0.748     0.753    293.0
weighted avg      0.767   0.754     0.755    293.0


In [37]:
two_model_result.to_csv("two_model_pipeline_results/distilBERT.csv", index=True)