# Import Libraries

In [1]:
import time

from FlagEmbedding import FlagModel
from sentence_transformers import SentenceTransformer,util

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt




# Functions

### Read datasets function

In [2]:
# Read a single Dataset File
def read_dataset(file_path):
    """Load one labelled table and split it 80/20 into train and test parts.

    The last column is taken as the integer class label; every column before
    it is treated as model input.

    Args:
        file_path: Path to a ``.csv`` or ``.xlsx`` file (the extension is
            matched case-insensitively).

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)`` of numpy arrays. Inputs
        are 1-D when the table has exactly one input column, otherwise 2-D
        ``(rows, input_columns)``. Labels are ``int32``.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith('.csv'):
        table = pd.read_csv(file_path)
    elif lowered_path.endswith('.xlsx'):
        table = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a .csv or .xlsx file.")

    def _inputs_and_labels(rows):
        inputs, labels = rows[:, :-1], rows[:, -1].astype("int32")
        # Drop only the column axis. A bare squeeze() would also collapse a
        # one-row split into a 0-d scalar that can no longer be iterated.
        if inputs.shape[1] == 1:
            inputs = inputs[:, 0]
        return inputs, labels

    # Fixed seed so the split, and every score computed from it, is reproducible.
    data_train, data_test = train_test_split(np.array(table), test_size=0.2, random_state=100)

    x_train, y_train = _inputs_and_labels(data_train)
    x_test, y_test = _inputs_and_labels(data_test)

    return x_train, x_test, y_train, y_test


# If you have Train and Test Datasets separate
def read_train_test_dataset(train_data, test_data):
    """Load a train table and a test table that are stored in separate files.

    In each table the last column is taken as the integer class label and
    every column before it is treated as model input. The two files are read
    independently, so they do not have to share the same format.

    Args:
        train_data: Path to the training ``.csv`` or ``.xlsx`` file.
        test_data: Path to the test ``.csv`` or ``.xlsx`` file.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)`` of numpy arrays. Inputs
        are 1-D when a table has exactly one input column, otherwise 2-D
        ``(rows, input_columns)``. Labels are ``int32``.

    Raises:
        ValueError: If either path has an extension other than ``.csv`` or
            ``.xlsx`` (matched case-insensitively).
    """
    def _load_table(path):
        lowered_path = path.lower()
        if lowered_path.endswith('.csv'):
            return pd.read_csv(path)
        if lowered_path.endswith('.xlsx'):
            return pd.read_excel(path)
        raise ValueError("Unsupported file format. Please provide a .csv or .xlsx file.")

    def _inputs_and_labels(table):
        rows = np.array(table)
        inputs, labels = rows[:, :-1], rows[:, -1].astype("int32")
        # Drop only the column axis. A bare squeeze() would also collapse a
        # one-row table into a 0-d scalar that can no longer be iterated.
        if inputs.shape[1] == 1:
            inputs = inputs[:, 0]
        return inputs, labels

    # Validate/load both files before any processing so a bad test path fails fast.
    train_table, test_table = _load_table(train_data), _load_table(test_data)

    x_train, y_train = _inputs_and_labels(train_table)
    x_test, y_test = _inputs_and_labels(test_table)

    return x_train, x_test, y_train, y_test

### BGE Embedding
Note: to switch to a different embedding model, uncomment its `model = ...` line inside the function below and comment out the currently active one.

In [3]:
def BGE(sentences):
    """Embed text with the currently selected "BGE slot" sentence-transformer.

    The model is loaded on the first call and reused afterwards, so repeated
    calls (train split, test split, single-joke prediction) do not pay the
    load cost again. Re-running this cell redefines the function and thereby
    drops the cached model, which is what makes switching models by
    (un)commenting a line below still work.

    Args:
        sentences: Text or collection of texts, passed straight to
            ``model.encode``.

    Returns:
        The L2-normalised embeddings returned by ``model.encode``.
    """
    dimensions = 512  # only consumed by the commented-out mxbai alternative (truncate_dim)

    model = getattr(BGE, "_model", None)
    if model is None:
        #model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')  # 17th Multilingual-E5-large-instruct
        #model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=dimensions) # 12th
        # NOTE(review): the '-zh-' checkpoint name suggests the Chinese BGE variant;
        # confirm this is intended for this dataset.
        model = SentenceTransformer('BAAI/bge-large-zh-v1.5')  # 20
        BGE._model = model

    dataset_embedding = model.encode(sentences, normalize_embeddings=True)

    return dataset_embedding

### GTE Embedding
Note: to switch to a different embedding model, uncomment its `model = ...` line inside the function below and comment out the currently active one.

In [4]:
def GTE(sentences):
    """Embed text with the currently selected "GTE slot" sentence-transformer.

    The model is loaded on the first call and reused afterwards, so repeated
    calls (train split, test split, single-joke prediction) do not pay the
    load cost again. Re-running this cell redefines the function and thereby
    drops the cached model, which is what makes switching models by
    (un)commenting a line below still work.

    Args:
        sentences: Text or collection of texts, passed straight to
            ``model.encode``.

    Returns:
        The L2-normalised embeddings returned by ``model.encode``.
    """
    model = getattr(GTE, "_model", None)
    if model is None:
        model = SentenceTransformer("WhereIsAI/UAE-Large-V1")  # 13th UAE
        #model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True) # 9 MTEB rank
        # model = SentenceTransformer('thenlper/gte-large')  # 29 MTEB Rank
        GTE._model = model

    dataset_embedding = model.encode(sentences, normalize_embeddings=True)

    return dataset_embedding

### Save model results

In [5]:
def save_results(true_label, predicted):
    """Turn true/predicted labels into a classification-report table.

    Args:
        true_label: Ground-truth class labels.
        predicted: Labels predicted by a classifier, aligned with ``true_label``.

    Returns:
        pandas.DataFrame with one row per report entry (each class plus the
        summary rows) and the columns ``precision``, ``recall``, ``f1-score``
        and ``support``. Metrics are rounded to 3 decimals. ``support`` uses
        the nullable ``Int64`` dtype so counts stay integers even though the
        ``accuracy`` row has no support value.
    """
    report_dict = classification_report(true_label, predicted, output_dict=True)

    # Rows = report entries, columns = metrics; rounded for readability.
    save_report = pd.DataFrame(report_dict).transpose().round(3)

    # The accuracy entry is a single number, so only its f1-score cell is kept.
    # Blank the other cells while every column is still float, which avoids
    # writing a missing value into an integer column.
    if 'accuracy' in save_report.index:
        save_report.loc['accuracy', ['precision', 'recall', 'support']] = np.nan

    # Nullable integer dtype: whole-number counts plus <NA> for the blank cell.
    save_report = save_report.astype({'support': 'Int64'})

    return save_report

## Classifiers 

### Random Forest 

In [6]:
def randomforest(x_train, y_train):
    """Cross-validate and then fit a random forest on the training data.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels aligned with ``x_train``.

    Returns:
        Tuple ``(fitted_classifier, mean_cv_score)`` where the score is the
        mean of the 10 cross-validation scores.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)

    # cv=10 scores the (still unfitted) estimator on ten train/validation folds.
    fold_scores = cross_val_score(forest, x_train, y_train, cv=10)
    mean_cv_score = fold_scores.mean()

    # Final model uses the whole training set.
    forest.fit(x_train, y_train)

    return forest, mean_cv_score

### XGBoost 

In [7]:
def XGBoost(x_train, y_train, num_class):
    """Cross-validate and then fit an XGBoost classifier on the training data.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels aligned with ``x_train``.
        num_class: Number of target classes, forwarded to ``XGBClassifier``.

    Returns:
        Tuple ``(fitted_classifier, mean_cv_accuracy)`` where the score is the
        mean accuracy over the 10 cross-validation folds.
    """
    booster = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=num_class,
        random_state=100,
    )

    # Accuracy on ten train/validation folds of the training data.
    fold_scores = cross_val_score(booster, x_train, y_train, cv=10, scoring='accuracy')
    mean_cv_accuracy = fold_scores.mean()

    # Final model uses the whole training set.
    booster.fit(x_train, y_train)

    return booster, mean_cv_accuracy

### Predict Function

In [8]:
def prediction(classifier, x_test, y_test):
    """Predict labels for the test inputs and build a text evaluation report.

    Args:
        classifier: Fitted model exposing ``predict``.
        x_test: Test feature matrix.
        y_test: Ground-truth labels aligned with ``x_test``.

    Returns:
        Tuple ``(predicted_labels, report)`` where ``report`` is the string
        produced by ``classification_report``.
    """
    predicted_labels = classifier.predict(x_test)
    return predicted_labels, classification_report(y_test, predicted_labels)

# Five class Classification

In [9]:
# Read dataset 
# (`time` is imported once in the imports cell at the top of the notebook.)
humor_5class_path = "datasets/Humour_style.xlsx" 
x_train_5, x_test_5, y_train_5, y_test_5 = read_dataset(humor_5class_path)

In [10]:
# Embed the five-class train/test texts with both embedding models and time each.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() differences can.

# BGE Embedding Model
start_time_bge = time.perf_counter()
x_train_5_bge, x_test_5_bge = BGE(x_train_5), BGE(x_test_5)
execution_time_bge = time.perf_counter() - start_time_bge
print(f"Time in seconds BGE: {execution_time_bge:.6f}")

# GTE Embedding Model
start_time_gte = time.perf_counter()
x_train_5_gte, x_test_5_gte = GTE(x_train_5), GTE(x_test_5)
execution_time_gte = time.perf_counter() - start_time_gte
print(f"Time in seconds GTE: {execution_time_gte:.6f}")

Time in seconds BGE: 222.633631
Time in seconds GTE: 202.641098


In [11]:
# Random Forest
RF_bge_classifier5, RF_bge_cv_scores5 = randomforest(x_train_5_bge,y_train_5)
RF_bge_y_pred5, RF_bge_class_report5  = prediction(RF_bge_classifier5,x_test_5_bge, y_test_5)

RF_gteclassifier5, RF_gte_cv_scores5 = randomforest(x_train_5_gte, y_train_5)
RF_gte_y_pred5, RF_gte_class_report5  = prediction(RF_gteclassifier5, x_test_5_gte, y_test_5)

# Save Report
# The index holds the row labels (class ids, 'accuracy', 'macro avg',
# 'weighted avg'); it must be written or the saved rows cannot be identified.
# NOTE(review): the file names say "multilingual"/"UAE" while BGE() currently
# loads a bge checkpoint; confirm the names match the model actually used.
RF_bge_result_5= save_results(y_test_5,RF_bge_y_pred5)
RF_bge_result_5.to_csv('models_results/RF_multilingual_5classes.csv')

RF_gte_result_5 = save_results(y_test_5,RF_gte_y_pred5)
RF_gte_result_5.to_csv('models_results/RF_UAE_5classes.csv')

# Evaluation Report
print(f'RF BGE: Cross Val {RF_bge_cv_scores5}\n {RF_bge_class_report5}')
print(f'RF GTE: Cross Val {RF_gte_cv_scores5}\n {RF_gte_class_report5}')

RF BGE: Cross Val 0.805940594059406
               precision    recall  f1-score   support

           0       0.77      0.93      0.85        59
           1       0.80      0.81      0.80        48
           2       0.64      0.36      0.46        44
           3       0.67      0.72      0.69        46
           4       0.95      1.00      0.97        56

    accuracy                           0.79       253
   macro avg       0.77      0.77      0.76       253
weighted avg       0.78      0.79      0.77       253

RF GTE: Cross Val 0.7752475247524753
               precision    recall  f1-score   support

           0       0.88      0.90      0.89        59
           1       0.74      0.71      0.72        48
           2       0.61      0.32      0.42        44
           3       0.59      0.78      0.67        46
           4       0.87      0.98      0.92        56

    accuracy                           0.76       253
   macro avg       0.74      0.74      0.73       253
we

In [12]:
# XGBOOST
xg_bge_classifier5, xg_bge_cv_scores5 = XGBoost(x_train_5_bge,y_train_5,5)
xg_bge_y_pred5, xg_bge_class_report5  = prediction(xg_bge_classifier5, x_test_5_bge, y_test_5)

xg_gte_classifier5, xg_gte_cv_scores5 = XGBoost(x_train_5_gte,y_train_5,5)
xg_gte_y_pred5, xg_gte_class_report5  = prediction(xg_gte_classifier5, x_test_5_gte, y_test_5)

# Save Report
# The index holds the row labels (class ids, 'accuracy', 'macro avg',
# 'weighted avg'); it must be written or the saved rows cannot be identified.
# NOTE(review): the file names say "multilingual"/"UAE" while BGE() currently
# loads a bge checkpoint; confirm the names match the model actually used.
xg_bge_result_5= save_results(y_test_5, xg_bge_y_pred5)
xg_bge_result_5.to_csv('models_results/XG_multilingual_5classes.csv')

xg_gte_result_5 = save_results(y_test_5, xg_gte_y_pred5)
xg_gte_result_5.to_csv('models_results/XG_UAE_5classes.csv')

# Evaluation Report
print(f'XGBoost BGE: Cross Val {xg_bge_cv_scores5}\n {xg_bge_class_report5}')
print(f'XGBoost GTE: Cross Val {xg_gte_cv_scores5}\n {xg_gte_class_report5}')

XGBoost BGE: Cross Val 0.8297029702970298
               precision    recall  f1-score   support

           0       0.79      0.93      0.85        59
           1       0.75      0.79      0.77        48
           2       0.65      0.45      0.53        44
           3       0.65      0.61      0.63        46
           4       0.97      1.00      0.98        56

    accuracy                           0.78       253
   macro avg       0.76      0.76      0.75       253
weighted avg       0.77      0.78      0.77       253

XGBoost GTE: Cross Val 0.7782178217821782
               precision    recall  f1-score   support

           0       0.81      0.88      0.85        59
           1       0.79      0.77      0.78        48
           2       0.60      0.48      0.53        44
           3       0.67      0.63      0.65        46
           4       0.83      0.95      0.88        56

    accuracy                           0.76       253
   macro avg       0.74      0.74      0.74  

# Four Class Classification

In [13]:
# Read dataset 
# (`time` is imported once in the imports cell at the top of the notebook.)
humor_4class_path = "datasets/Humour_style_4classes.xlsx" 
x_train_4, x_test_4, y_train_4, y_test_4 = read_dataset(humor_4class_path)

# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() differences can.

# BGE Embedding Model
start_time_bge4 = time.perf_counter()
x_train_4_bge, x_test_4_bge = BGE(x_train_4), BGE(x_test_4)
execution_time_bge4 = time.perf_counter() - start_time_bge4
print(f"Time in seconds BGE: {execution_time_bge4:.6f}")

# GTE Embedding Model
start_time_gte4 = time.perf_counter()
x_train_4_gte, x_test_4_gte = GTE(x_train_4), GTE(x_test_4)
execution_time_gte4 = time.perf_counter() - start_time_gte4
print(f"Time in seconds GTE: {execution_time_gte4:.6f}")

Time in seconds BGE: 221.041124
Time in seconds GTE: 207.955368


In [14]:
# Random Forest
RF_bge_classifier4, RF_bge_cv_scores4 = randomforest(x_train_4_bge, y_train_4)
RF_bge_y_pred4, RF_bge_class_report4  = prediction(RF_bge_classifier4, x_test_4_bge, y_test_4)

RF_gte_classifier4, RF_gte_cv_scores4 = randomforest(x_train_4_gte, y_train_4)
RF_gte_y_pred4, RF_gte_class_report4  = prediction(RF_gte_classifier4, x_test_4_gte, y_test_4)

# Save Report
# The index holds the row labels (class ids, 'accuracy', 'macro avg',
# 'weighted avg'); it must be written or the saved rows cannot be identified.
# NOTE(review): the file names say "multilingual"/"UAE" while BGE() currently
# loads a bge checkpoint; confirm the names match the model actually used.
RF_bge_result_4= save_results(y_test_4,RF_bge_y_pred4)
RF_bge_result_4.to_csv('models_results/RF_multilingual_4classes.csv')

RF_gte_result_4 = save_results(y_test_4,RF_gte_y_pred4)
RF_gte_result_4.to_csv('models_results/RF_UAE_4classes.csv')

# Evaluation Report
print(f'RF BGE: Cross Val {RF_bge_cv_scores4}\n {RF_bge_class_report4}')
print(f'RF GTE: Cross Val {RF_gte_cv_scores4}\n {RF_gte_class_report4}')

RF BGE: Cross Val 0.8386138613861386
               precision    recall  f1-score   support

           0       0.90      0.88      0.89        59
           1       0.81      0.44      0.57        48
           2       0.72      0.92      0.81        90
           3       1.00      0.96      0.98        56

    accuracy                           0.83       253
   macro avg       0.86      0.80      0.81       253
weighted avg       0.84      0.83      0.82       253

RF GTE: Cross Val 0.7930693069306931
               precision    recall  f1-score   support

           0       0.88      0.73      0.80        59
           1       0.87      0.56      0.68        48
           2       0.70      0.96      0.81        90
           3       0.98      0.88      0.92        56

    accuracy                           0.81       253
   macro avg       0.86      0.78      0.80       253
weighted avg       0.84      0.81      0.81       253



In [15]:
# XGBOOST
xg_bge_classifier4, xg_bge_cv_scores4 = XGBoost(x_train_4_bge, y_train_4, 4)
xg_bge_y_pred4,  xg_bge_class_report4 = prediction(xg_bge_classifier4, x_test_4_bge, y_test_4)

xg_gte_classifier4, xg_gte_cv_scores4 = XGBoost(x_train_4_gte, y_train_4, 4)
xg_gte_y_pred4, xg_gte_class_report4  = prediction(xg_gte_classifier4, x_test_4_gte, y_test_4)


# Save Report
# The index holds the row labels (class ids, 'accuracy', 'macro avg',
# 'weighted avg'); it must be written or the saved rows cannot be identified.
# NOTE(review): the file names say "multilingual"/"UAE" while BGE() currently
# loads a bge checkpoint; confirm the names match the model actually used.
xg_bge_result_4= save_results(y_test_4, xg_bge_y_pred4)
xg_bge_result_4.to_csv('models_results/XG_multilingual_4classes.csv')

xg_gte_result_4 = save_results(y_test_4, xg_gte_y_pred4)
xg_gte_result_4.to_csv('models_results/XG_UAE_4classes.csv')

# Evaluation Report
print(f'XGBoost BGE: Cross Val {xg_bge_cv_scores4}\n {xg_bge_class_report4}')
print(f'XGBoost GTE: Cross Val {xg_gte_cv_scores4}\n {xg_gte_class_report4}')

XGBoost BGE: Cross Val 0.8792079207920793
               precision    recall  f1-score   support

           0       0.82      0.90      0.85        59
           1       0.85      0.71      0.77        48
           2       0.88      0.90      0.89        90
           3       1.00      1.00      1.00        56

    accuracy                           0.89       253
   macro avg       0.89      0.88      0.88       253
weighted avg       0.89      0.89      0.88       253

XGBoost GTE: Cross Val 0.8356435643564357
               precision    recall  f1-score   support

           0       0.86      0.86      0.86        59
           1       0.89      0.69      0.78        48
           2       0.78      0.89      0.83        90
           3       0.95      0.93      0.94        56

    accuracy                           0.85       253
   macro avg       0.87      0.84      0.85       253
weighted avg       0.86      0.85      0.85       253



# Two Class Classification

In [16]:
# Read dataset 
# (`time` is imported once in the imports cell at the top of the notebook.)
# The two-class data ships as separate, pre-split train and test files.
train_2class_path = "datasets/af_ag_train.xlsx" 
test_2class_path  = "datasets/af_ag_test.xlsx" 

x_train_2, x_test_2, y_train_2, y_test_2 = read_train_test_dataset(train_2class_path, test_2class_path)

In [17]:
# Embed the two-class train/test texts with both embedding models and time each.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() differences can.

# BGE Embedding Model
start_time_bge2 = time.perf_counter()
x_train_2_bge, x_test_2_bge = BGE(x_train_2), BGE(x_test_2)
execution_time_bge2 = time.perf_counter() - start_time_bge2
print(f"Time in seconds BGE: {execution_time_bge2:.6f}")

# GTE Embedding Model
start_time_gte2 = time.perf_counter()
x_train_2_gte, x_test_2_gte = GTE(x_train_2), GTE(x_test_2)
execution_time_gte2= time.perf_counter() - start_time_gte2
print(f"Time in seconds GTE: {execution_time_gte2:.6f}")

Time in seconds BGE: 125.130854
Time in seconds GTE: 109.577378


In [18]:
# Random Forest
RF_bge_classifier2, RF_bge_cv_scores2 = randomforest(x_train_2_bge,y_train_2)
RF_bge_y_pred2, RF_bge_class_report2  = prediction(RF_bge_classifier2, x_test_2_bge,y_test_2)


RF_gte_classifier2, RF_gte_cv_scores2 = randomforest(x_train_2_gte,y_train_2)
RF_gte_y_pred2, RF_gte_class_report2  = prediction(RF_gte_classifier2,x_test_2_gte, y_test_2)

# Save Report
# The index holds the row labels (class ids, 'accuracy', 'macro avg',
# 'weighted avg'); it must be written or the saved rows cannot be identified.
# NOTE(review): the file names say "multilingual"/"UAE" while BGE() currently
# loads a bge checkpoint; confirm the names match the model actually used.
RF_bge_result_2= save_results(y_test_2,RF_bge_y_pred2)
RF_bge_result_2.to_csv('models_results/RF_multilingual_2classes.csv')

RF_gte_result_2 = save_results(y_test_2,RF_gte_y_pred2)
RF_gte_result_2.to_csv('models_results/RF_UAE_2classes.csv')

# Evaluation Report
print(f'RF BGE: Cross Val {RF_bge_cv_scores2}\n {RF_bge_class_report2}')
print(f'RF GTE: Cross Val {RF_gte_cv_scores2}\n {RF_gte_class_report2}')

RF BGE: Cross Val 0.8235492577597843
               precision    recall  f1-score   support

           0       0.77      0.77      0.77        44
           1       0.78      0.78      0.78        46

    accuracy                           0.78        90
   macro avg       0.78      0.78      0.78        90
weighted avg       0.78      0.78      0.78        90

RF GTE: Cross Val 0.8078272604588393
               precision    recall  f1-score   support

           0       0.79      0.61      0.69        44
           1       0.70      0.85      0.76        46

    accuracy                           0.73        90
   macro avg       0.75      0.73      0.73        90
weighted avg       0.74      0.73      0.73        90



In [19]:
# XGBOOST
xg_bge_classifier2, xg_bge_cv_scores2 = XGBoost(x_train_2_bge,y_train_2,2)
xg_bge_y_pred2, xg_bge_class_report2  = prediction(xg_bge_classifier2, x_test_2_bge, y_test_2)

xg_gte_classifier2, xg_gte_cv_scores2 = XGBoost(x_train_2_gte,y_train_2,2)
xg_gte_y_pred2, xg_gte_class_report2  = prediction(xg_gte_classifier2,x_test_2_gte, y_test_2)

# Save Report
# The index holds the row labels (class ids, 'accuracy', 'macro avg',
# 'weighted avg'); it must be written or the saved rows cannot be identified.
# NOTE(review): the file names say "multilingual"/"UAE" while BGE() currently
# loads a bge checkpoint; confirm the names match the model actually used.
xg_bge_result_2= save_results(y_test_2, xg_bge_y_pred2)
xg_bge_result_2.to_csv('models_results/XG_multilingual_2classes.csv')

xg_gte_result_2 = save_results(y_test_2, xg_gte_y_pred2)
xg_gte_result_2.to_csv('models_results/XG_UAE_2classes.csv')

# Evaluation Report
print(f'XGBoost BGE: Cross Val {xg_bge_cv_scores2}\n {xg_bge_class_report2}')
print(f'XGBoost GTE: Cross Val {xg_gte_cv_scores2}\n {xg_gte_class_report2}')

XGBoost BGE: Cross Val 0.7950742240215923
               precision    recall  f1-score   support

           0       0.78      0.70      0.74        44
           1       0.74      0.80      0.77        46

    accuracy                           0.76        90
   macro avg       0.76      0.75      0.75        90
weighted avg       0.76      0.76      0.75        90

XGBoost GTE: Cross Val 0.8133603238866396
               precision    recall  f1-score   support

           0       0.73      0.61      0.67        44
           1       0.68      0.78      0.73        46

    accuracy                           0.70        90
   macro avg       0.70      0.70      0.70        90
weighted avg       0.70      0.70      0.70        90



# Individual Joke Prediction

In [20]:
def single_predict(example, embed_model, class_model, num_label):
    """Classify a single text with one embedding/classifier/label-set combination.

    The classifier is chosen from all three selectors, so the model that
    receives the embedding is always the one that was trained on embeddings
    from the same embedding function.

    Args:
        example: The text to classify (expected to be a single string, since
            its embedding is given a leading batch axis of size one).
        embed_model: ``"bge"`` or ``"gte"``; selects ``BGE`` or ``GTE``.
        class_model: ``"rf"`` (random forest) or ``"xgb"`` (XGBoost).
        num_label: ``5``, ``4`` or ``2``; selects which trained label set to use.

    Returns:
        The array returned by the selected classifier's ``predict``.

    Raises:
        ValueError: If the combination of ``embed_model``, ``class_model`` and
            ``num_label`` is not one of the supported ones.
    """
    # Each entry is a thunk so that only the requested classifier has to exist
    # in the notebook namespace; the others may not have been trained yet.
    classifiers = {
        ("bge", "rf", 5): lambda: RF_bge_classifier5,
        ("gte", "rf", 5): lambda: RF_gteclassifier5,  # sic: defined without the underscore
        ("bge", "xgb", 5): lambda: xg_bge_classifier5,
        ("gte", "xgb", 5): lambda: xg_gte_classifier5,
        ("bge", "rf", 4): lambda: RF_bge_classifier4,
        ("gte", "rf", 4): lambda: RF_gte_classifier4,
        ("bge", "xgb", 4): lambda: xg_bge_classifier4,
        ("gte", "xgb", 4): lambda: xg_gte_classifier4,
        ("bge", "rf", 2): lambda: RF_bge_classifier2,
        ("gte", "rf", 2): lambda: RF_gte_classifier2,
        ("bge", "xgb", 2): lambda: xg_bge_classifier2,
        ("gte", "xgb", 2): lambda: xg_gte_classifier2,
    }

    selection = (embed_model, class_model, num_label)
    if selection not in classifiers:
        # Fail before the (slow) embedding step instead of predicting on an
        # empty vector or hitting an unbound result later.
        raise ValueError(
            f"Unsupported combination {selection!r}: embed_model must be 'bge' or 'gte', "
            "class_model must be 'rf' or 'xgb', and num_label must be 5, 4 or 2."
        )

    # Embedding
    embedding = BGE(example) if embed_model == "bge" else GTE(example)
    # predict() expects a batch, so add a leading axis of size one.
    embedding = np.expand_dims(embedding, axis=0)

    # Classifiers
    return classifiers[selection]().predict(embedding)


In [21]:
user_input = "What’s the smartest insect? A spelling bee!"
predict_5 = single_predict(user_input, "gte", "xgb", 5)
predict_4 = single_predict(user_input, "gte", "xgb", 4)

# The two-class model is only consulted when the four-class model predicts
# class 2.
# NOTE(review): class 2 is presumably the merged affiliative/aggressive class;
# confirm against the four-class dataset's label mapping.
predict_2 = "None"
# predict() returns a one-element array; index it explicitly because calling
# int() on an array with ndim > 0 is deprecated in recent NumPy releases.
if int(predict_4[0]) == 2:
    predict_2 = single_predict(user_input, "gte", "xgb", 2) #2 class model

# Print the predicted label
print(f'User Input: "{user_input}"\n 5 class Model prediction : {predict_5}\n 4 class Model prediction : {predict_4}')
print(f' 2 class Model prediction : {predict_2}; (Where 0-Affiliative, 1-Aggressive)')


User Input: "What’s the smartest insect? A spelling bee!
 5 class Model prediction : [2]
 4 class Model prediction : [2]
 2 class Model prediction : [0]; (Where 0-Affiliative, 1-Aggressive)
