# Import Libraries

In [49]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from data_preprocessing import lemmatize_text_with_pos, tokens,lemmatize_spacy
from sklearn.metrics import classification_report

# Functions/Methods

#### Dataset Function

In [50]:
# Read a single Dataset File
def read_dataset(file_path):
    if file_path.lower().endswith('.csv'):
        dataset = pd.read_csv(file_path)
    elif file_path.lower().endswith('.xlsx'):
        dataset = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Please provide a .csv or .xlsx file.")
    
    dataset   = np.array(dataset)
    data_train, data_test     = train_test_split(dataset, test_size=0.2, random_state=100)

    x_train, y_train   = (data_train[:,:-1]), (data_train[:,-1]).astype("int32")
    x_test, y_test     = (data_test[:,:-1]), (data_test[:,-1]).astype("int32")           
    x_train, x_test    = x_train.squeeze(), x_test.squeeze()

    return x_train, x_test, y_train, y_test


# If you have Train and Test Datasets separate
def read_train_test_dataset(train_data, test_data):
    if train_data.lower().endswith('.csv') and test_data.lower().endswith('.csv'):
        train_data = pd.read_csv(train_data)
        test_data = pd.read_csv(test_data)
    elif train_data.lower().endswith('.xlsx') and test_data.lower().endswith('.xlsx'):
        train_data = pd.read_excel(train_data)
        test_data = pd.read_excel(test_data)
    else:
        raise ValueError("Unsupported file format. Please provide a .csv or .xlsx file.")
    
    train_data, test_data   = np.array(train_data), np.array(test_data)

    x_train, y_train   = (train_data[:,:-1]), (train_data[:,-1]).astype("int32")
    x_test, y_test     = (test_data[:,:-1]), (test_data[:,-1]).astype("int32")           
    x_train, x_test    = x_train.squeeze(), x_test.squeeze()

    return x_train, x_test, y_train, y_test

# Example usage:
#file_path = "datasets/Humour_style.xlsx" 
#train_path = "datasets/af_ag_train.xlsx" 
#test_path  =  "datasets/af_ag_test.xlsx" 
#print(read_dataset(file_path))
#print(read_train_test_dataset(train_path,test_path))

#### Vocabulary Function

In [46]:
def vocab(x):
    all_word = set()
    for i in x:
        all_word.update(tokens(i))
    return all_word

# Example usage
#print(len(vocab(x_train)))

#### Training Function

In [51]:
def naive_bayes_multi(x, y, smoothing=1):
    classes = np.unique(y)
    vocabulary = vocab(x)
    N_doc = x.shape[0]

    log_probs = {}
    ex_dics = {}
    prob_words = {}

    for class_label in classes:
        N_cat = sum(y == class_label)
        examples = " ".join(x[y == class_label])

        log_prob = np.log(N_cat / N_doc)
        log_probs[class_label] = log_prob

        ex_dic = {}
        prob_word = {}

        for word in vocabulary:
            escaped_word = re.escape(word)

            word_count = len(re.findall(escaped_word, examples))
            ex_dic[word] = word_count

            prob_word[word] = np.round(np.log((word_count + smoothing) / (len(tokens(examples)) + len(vocabulary))), 5)

        ex_dics[class_label] = ex_dic
        prob_words[class_label] = prob_word

    return log_probs, ex_dics, prob_words

# Example usage
#log_probs, ex_dics, prob_words = naive_bayes_multi(x_train,y_train)

#### Prediction Function

In [52]:
def predict_naive_bayes(examples, log_probs, prob_words):
    predictions = []

    for example in examples:
        # Tokenize the example
        example_tokens = tokens(example)

        # Calculate the log likelihoods for each class
        class_likelihoods = {}
        for class_label, log_prob in log_probs.items():
            class_likelihood = log_prob + sum(prob_words[class_label].get(word, 0) for word in example_tokens)
            class_likelihoods[class_label] = class_likelihood

        # Make a prediction based on the class with the highest likelihood
        prediction = max(class_likelihoods, key=class_likelihoods.get)
        predictions.append(prediction)

    return predictions


#example usage
#log_probs, ex_dics, prob_words = naive_bayes_multi(x_train, y_train)
#predicted_labels = predict_naive_bayes(x_test, log_probs, prob_words)

# Evaluation Report
#report = classification_report(y_test,predicted_labels)
#print(report)

#### Save Reports Function

In [53]:
def save_results(true_label, predicted):
    report_dict = classification_report(true_label,predicted,output_dict=True)

    # Save Result Report
    save_report = pd.DataFrame(report_dict).transpose()  # Convert the report dictionary to a DataFrame
    save_report = save_report.round(3)                   # Round the values to a specific number of decimal places
    save_report = save_report.astype({'support': int})   # Convert the 'support' column to integers
    save_report.loc['accuracy', ['precision', 'recall', 'support']] = [None, None, None] # Set the accuracy row to None

    return save_report

# Example Usage 
#save_report = save_results(y_test,predicted_labels)
#save_report.to_csv('models_results/Naive_Bayes_5classes.csv', index=False)  # Save the DataFrame to a CSV file

# Five Class Classification

In [54]:
# Read dataset 
humor_5class_path = "datasets/Humour_style.xlsx" 
x_train_5, x_test_5, y_train_5, y_test_5 = read_dataset(humor_5class_path)

# Lemmmatize (Spacy)
x_train_5 = [' '.join(lemmatize_spacy(example)) for example in x_train_5]
x_test_5  = [' '.join(lemmatize_spacy(example)) for example in x_test_5]

# Convert to numpy array
x_train_5, x_test_5 = np.array(x_train_5), np.array(x_test_5)

# Train Naive Bayes Model
log_probs_5, ex_dics_5, prob_words_5 = naive_bayes_multi(x_train_5,y_train_5)

# Test/Predict using Naive Bayes Model
predicted_labels_5 = predict_naive_bayes(x_test_5, log_probs_5, prob_words_5)

# Save Report
result_5= save_results(y_test_5,predicted_labels_5)
result_5.to_csv('models_results/Naive_Bayes_5classes.csv', index=False)

# Evaluation Report
print(classification_report(y_test_5,predicted_labels_5))

              precision    recall  f1-score   support

           0       0.69      0.85      0.76        59
           1       0.66      0.65      0.65        48
           2       0.60      0.41      0.49        44
           3       0.57      0.74      0.64        46
           4       1.00      0.79      0.88        56

    accuracy                           0.70       253
   macro avg       0.70      0.69      0.68       253
weighted avg       0.72      0.70      0.70       253



# Four Class Classification

In [56]:
# Read dataset 
humor_4class_path = "datasets/Humour_style_4classes.xlsx" 
x_train_4, x_test_4, y_train_4, y_test_4 = read_dataset(humor_4class_path)

# Lemmmatize (Spacy)
x_train_4 = [' '.join(lemmatize_spacy(example)) for example in x_train_4]
x_test_4  = [' '.join(lemmatize_spacy(example)) for example in x_test_4]

# Convert to numpy array
x_train_4, x_test_4 = np.array(x_train_4), np.array(x_test_4)

# Train Naive Bayes Model
log_probs_4, ex_dics_4, prob_words_4 = naive_bayes_multi(x_train_4,y_train_4)

# Test/Predict using Naive Bayes Model
predicted_labels_4 = predict_naive_bayes(x_test_4, log_probs_4, prob_words_4)

# Save Report
result_4= save_results(y_test_4,predicted_labels_4)
result_4.to_csv('models_results/Naive_Bayes_4classes.csv', index=False)

# Evaluation Report
print(classification_report(y_test_4,predicted_labels_4))

              precision    recall  f1-score   support

           0       0.76      0.58      0.65        59
           1       0.74      0.54      0.63        48
           2       0.62      0.91      0.74        90
           3       1.00      0.71      0.83        56

    accuracy                           0.72       253
   macro avg       0.78      0.69      0.71       253
weighted avg       0.76      0.72      0.72       253



# Two Class Classification

In [57]:
# Read dataset 
train_2class_path = "datasets/af_ag_train.xlsx" 
test_2class_path  = "datasets/af_ag_test.xlsx" 

x_train_2, x_test_2, y_train_2, y_test_2 = read_train_test_dataset(train_2class_path, test_2class_path)

# Lemmmatize (Spacy)
x_train_2 = [' '.join(lemmatize_spacy(example)) for example in x_train_2]
x_test_2  = [' '.join(lemmatize_spacy(example)) for example in x_test_2]

# Convert to numpy array
x_train_2, x_test_2 = np.array(x_train_2), np.array(x_test_2)

# Train Naive Bayes Model
log_probs_2, ex_dics_2, prob_words_2 = naive_bayes_multi(x_train_2,y_train_2)

# Test/Predict using Naive Bayes Model
predicted_labels_2 = predict_naive_bayes(x_test_2, log_probs_2, prob_words_2)

# Save Report
result_2= save_results(y_test_2,predicted_labels_2)
result_2.to_csv('models_results/Naive_Bayes_2classes.csv', index=False)

# Evaluation Report
print(classification_report(y_test_2,predicted_labels_2))

              precision    recall  f1-score   support

           0       0.83      0.66      0.73        44
           1       0.73      0.87      0.79        46

    accuracy                           0.77        90
   macro avg       0.78      0.76      0.76        90
weighted avg       0.78      0.77      0.76        90



# Individual Jokes Predictions

In [58]:
# Function
def predict_naive_bayes_single(example, log_probs, prob_words):
    # Tokenize the example
    example_tokens = tokens(example)

    # Calculate the log likelihoods for each class
    class_likelihoods = {}
    for class_label, log_prob in log_probs.items():
        class_likelihood = log_prob + sum(prob_words[class_label].get(word, 0) for word in example_tokens)
        class_likelihoods[class_label] = class_likelihood

    # Make a prediction based on the class with the highest likelihood
    prediction = max(class_likelihoods, key=class_likelihoods.get)

    return prediction


#### Example usage of Single prediction function

In [85]:
# User input
#user_input = "I am living my best live in a cage of sadness and loneliness"
user_input = "What’s the smartest insect? A spelling bee!"

# Preprocess the user input
preprocessed_input = lemmatize_spacy(user_input)  # Replace with your actual preprocessing function
preprocessed_input = ", ".join(preprocessed_input)

# Make predictions
user_prediction_5 = predict_naive_bayes_single(preprocessed_input, log_probs_5, prob_words_5) # class model
user_prediction_4 = predict_naive_bayes_single(preprocessed_input, log_probs_4, prob_words_4) #4 class model

user_prediction_2 = "None"
if int(user_prediction_4) == 2:
    user_prediction_2 = predict_naive_bayes_single(preprocessed_input, log_probs_2, prob_words_2) #2 class model

# Print the predicted label
print(f'User Input: "{user_input}\n 5 class Model prediction : {user_prediction_5}\n 4 class Model prediction : {user_prediction_4}')
print(f' 2 class Model prediction : {user_prediction_2}; (Where 0-Affiliative, 1-Aggressive)')



User Input: "What’s the smartest insect? A spelling bee!
 5 class Model prediction : 0
 4 class Model prediction : 2
 2 class Model prediction : 0; (Where 0-Affiliative, 1-Aggressive)
