In [1]:
import sys

# Make the local `models` directory importable for the model wrappers imported
# below. The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
if 'models' not in sys.path:
    sys.path.append('models')

# from google.colab import drive
# drive.mount('/content/drive')
# sys.path.append('/content/drive/MyDrive/Dissertation/models')
# sys.path.append('/content/drive/MyDrive/Dissertation')

In [2]:
# !pip install dask_ml
# !pip install scikeras
# import nltk
# nltk.download('stopwords')
# import nltk
# nltk.download('punkt')

In [3]:
# importing libraries
import joblib
import os
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from collections import defaultdict
import matplotlib.pyplot as plt

from logstic_regression import Logistic_Regression
from XGBoost import XGBoost
from naive_bayes import Naive_Bayes
from rnn import RNN
from cnn import CNN
# from models.bert import BERT
from bilstm import BiLSTM

# Load the TextPreprocessor class (assumed to be defined already)
from textpreprocessor import TextPreprocessor

import warnings
warnings.filterwarnings("ignore")

# Query the visible GPUs once and reuse the list for both the report and the
# memory-growth configuration.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))

try:
    # Set memory growth on every GPU (no-op when the list is empty).
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Best effort: report the problem and carry on with TensorFlow's default
    # allocation behaviour instead of aborting the notebook.
    print(e)

2024-10-23 13:32:33.608474: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-23 13:32:33.616999: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-23 13:32:33.619508: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-23 13:32:33.626026: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  1


I0000 00:00:1729683155.133688  355331 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1729683155.137303  355331 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1729683155.137342  355331 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


In [4]:
# --- Data ---------------------------------------------------------------------
NUM_SAMPLE = 10000        # training rows to sample; also part of the cache dir name
TEST_RATIO = 0.2          # test rows sampled = int(NUM_SAMPLE * TEST_RATIO)
USE_TEST_DATA = True      # True: load the already-cleaned CSVs instead of re-cleaning

# --- Text preprocessing (passed to TextPreprocessor) --------------------------
MAX_WORD_COUNT = 5000
MAX_LENGTH = 100

# --- Neural-network training (passed to the CNN and BiLSTM wrappers) ----------
BATCH_SIZE = 32
EPOCHS = 5

# --- Output locations ---------------------------------------------------------
OUTPUT_RESULT_DIR = "Output/result"
OUTPUT_MODELS_DIR = "Output/models"

# Create both output folders up front so later cells can write into them.
for _output_dir in (OUTPUT_RESULT_DIR, OUTPUT_MODELS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [5]:
# Define a function to plot training history
def plot_training_history(history, title="Model Training History"):
    """Plot training/validation accuracy and loss for a history object.

    Args:
        history: Object whose ``history`` attribute is a mapping with the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss', each holding one
            value per epoch.
        title: Prefix used in both subplot titles.
    """
    # The drawing code is the same as for a plain mapping, so unwrap the
    # mapping and reuse plot_training_history_from_dict instead of keeping a
    # second copy of the plotting logic.
    plot_training_history_from_dict(history.history, title=title)

# Function to plot training history from defaultdict data
def plot_training_history_from_dict(history, title="Model Training History"):
    """Draw side-by-side accuracy and loss curves from a history mapping.

    Args:
        history: Mapping that provides per-epoch sequences under the keys
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
        title: Prefix placed in front of " - Accuracy" / " - Loss" in the
            subplot titles.
    """
    # (metric label, training series, validation series) for each panel
    panels = (
        ('Accuracy', history['accuracy'], history['val_accuracy']),
        ('Loss', history['loss'], history['val_loss']),
    )

    # One x position per recorded epoch, counted from 1
    epochs = range(1, len(panels[0][1]) + 1)

    plt.figure(figsize=(14, 5))
    for position, (metric, train_values, val_values) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_values, 'b', label=f'Training {metric}')
        plt.plot(epochs, val_values, 'r', label=f'Validation {metric}')
        plt.title(f"{title} - {metric}")
        plt.xlabel('Epochs')
        plt.ylabel(metric)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Accumulator for the model-comparison table: one (initially empty) list per
# output column. _evaluate_model appends one value to every list per model.
results = {
    column: []
    for column in (
        'Model',
        'Training-Time',
        'Accuracy',
        'Precision (Class 0)',
        'Precision (Class 1)',
        'Recall (Class 0)',
        'Recall (Class 1)',
        'F1-Score (Class 0)',
        'F1-Score (Class 1)',
    )
}

# Function to calculate accuracy and classification report
def _evaluate_model(training_time, model_name, y_test, y_pred):
    """Score one set of predictions and append a row to the global ``results``.

    Args:
        training_time: Value stored as-is in the 'Training-Time' column.
        model_name: Label stored in the 'Model' column.
        y_test: True labels.
        y_pred: Predicted labels; the report is read under the class keys
            '0' and '1'.
    """
    accuracy = accuracy_score(y_test, y_pred) * 100
    report = classification_report(y_test, y_pred, output_dict=True)

    # Identification and headline columns
    results['Model'].append(model_name)
    results['Training-Time'].append(training_time)
    results['Accuracy'].append(accuracy)

    # Per-class metrics: the column name is "<Metric> (Class <label>)"
    metric_columns = (
        ('precision', 'Precision'),
        ('recall', 'Recall'),
        ('f1-score', 'F1-Score'),
    )
    for report_key, column_prefix in metric_columns:
        for class_label in ('0', '1'):
            column = f'{column_prefix} (Class {class_label})'
            results[column].append(report[class_label][report_key])

def _predict_model(model,X):
    y_pred_prob = model.predict(X)
    return [1 if prob > 0.5 else 0 for prob in y_pred_prob]

def evaluate_model_class(model_class, X_test, y_test):
    """Evaluate the four fitted variants of a model wrapper and save the table.

    Predictions are made with ``model_class.model``, the best estimators of
    ``random_search_cv`` and ``grid_search_cv``, and ``model_class.best_model``.
    Each variant adds one row to the global ``results`` (named after
    ``model_class.model_name`` plus a suffix), then the whole accumulated table
    is rewritten to ``Model_Compare.xlsx`` in ``OUTPUT_RESULT_DIR``.

    Args:
        model_class: Wrapper exposing the fitted models, their timing
            attributes and ``model_name``.
        X_test: Test features in the form the fitted models accept.
        y_test: True test labels.
    """
    y_pred = _predict_model(model_class.model, X_test)
    y_pred_random = _predict_model(model_class.random_search_cv.best_estimator_, X_test)
    y_pred_grid = _predict_model(model_class.grid_search_cv.best_estimator_, X_test)
    y_pred_best = _predict_model(model_class.best_model, X_test)

    _evaluate_model(model_class.training_time, model_class.model_name, y_test, y_pred)
    _evaluate_model(model_class.random_search_time,  model_class.model_name + '_random_search', y_test, y_pred_random)
    _evaluate_model(model_class.grid_search_time,  model_class.model_name + '_grid_search', y_test, y_pred_grid)
    _evaluate_model(model_class.best_training_time,  model_class.model_name + '_best', y_test, y_pred_best)

    df_results = pd.DataFrame(results)
    # index=False matches evaluate_xgboost_model_class, which writes the same
    # file: without it the sheet gains an extra row-number column depending on
    # which evaluator ran last.
    df_results.to_excel(os.path.join(OUTPUT_RESULT_DIR,'Model_Compare.xlsx'), index=False)

def evaluate_xgboost_model_class(model_class, X_test, y_test):
    """Evaluate the four fitted variants of the XGBoost wrapper and save the table.

    ``model_class.model`` and ``model_class.best_model`` are fed the output of
    ``model_class.convert_to_dmatrix(X_test, y_test)``; the two search
    estimators are fed ``X_test.toarray()``. One row per variant is appended to
    the global ``results`` and the accumulated table is written to
    ``Model_Compare.xlsx`` in ``OUTPUT_RESULT_DIR`` without the index column.
    """
    # All predictions are computed first, in the same order as the rows below.
    predictions = (
        _predict_model(model_class.model, model_class.convert_to_dmatrix(X_test, y_test)),
        _predict_model(model_class.random_search_cv.best_estimator_, X_test.toarray()),
        _predict_model(model_class.grid_search_cv.best_estimator_, X_test.toarray()),
        _predict_model(model_class.best_model, model_class.convert_to_dmatrix(X_test, y_test)),
    )

    # (timing value, row label) for each variant
    rows = (
        (model_class.training_time, model_class.model_name),
        (model_class.random_search_time, model_class.model_name + '_random_search'),
        (model_class.grid_search_time, model_class.model_name + '_grid_search'),
        (model_class.best_training_time, model_class.model_name + '_best'),
    )
    for (elapsed, row_label), y_pred in zip(rows, predictions):
        _evaluate_model(elapsed, row_label, y_test, y_pred)

    comparison_table = pd.DataFrame(results)
    comparison_path = os.path.join(OUTPUT_RESULT_DIR,'Model_Compare.xlsx')
    comparison_table.to_excel(comparison_path, index=False)

def compare_models_accuracy_and_get_best_params(models, X_test, y_test):
    """Pick the most accurate candidate and return its name and parameters.

    Args:
        models: Mapping of display name -> fitted object with ``predict``.
            The entry named 'Original' must provide ``get_params()``; every
            other entry must provide ``best_params_``.
        X_test: Features passed to each candidate's ``predict``.
        y_test: True labels the thresholded predictions are scored against.

    Returns:
        Tuple ``(best_model_name, best_params)`` for the candidate with the
        highest accuracy (the first one wins on ties). Both are ``None`` only
        when ``models`` is empty.

    NOTE(review): the callers in this notebook pass the test split here and
    later report metrics on that same split, so the selection has already seen
    the evaluation data - confirm this is intended.
    """
    best_accuracy = 0
    best_params = None
    best_model_name = None

    for model_name, model_class in models.items():
        y_pred_prob = model_class.predict(X_test)

        # The parameter source is decided by the entry's name, not by
        # inspecting the object.
        if model_name != 'Original':
            params = model_class.best_params_
        else:
            params = model_class.get_params()

        # Convert probabilities to binary predictions
        pred = [1 if prob > 0.5 else 0 for prob in y_pred_prob]

        # Calculate accuracy
        accuracy = accuracy_score(y_test, pred) * 100
        print(f'{model_name} Accuracy: {accuracy}')

        # Always accept the first candidate so that a result is returned even
        # when every model scores 0%; afterwards only a strictly better
        # accuracy replaces the current best.
        if best_model_name is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
            best_model_name = model_name

    print(f'Best Model: {best_model_name} with Accuracy: {best_accuracy}')
    print(f'Best Parameters: {best_params}')
    return best_model_name, best_params


00. Text Pre-Processing

In [6]:
# Preprocessor shared by all models; vocabulary size and sequence length come
# from the configuration cell.
processor = TextPreprocessor(MAX_WORD_COUNT, MAX_LENGTH)

# Folder holding the cleaned train/test CSVs for this sample size.
INPUT_DIR = f"Output/proto_models_rev2_{NUM_SAMPLE}"
train_csv_path = os.path.join(INPUT_DIR, 'train_cleaned.csv')
test_csv_path = os.path.join(INPUT_DIR, 'test_cleaned.csv')

if USE_TEST_DATA:
    # Fast path: reuse the cleaned data written by a previous run.
    df_train = pd.read_csv(train_csv_path)
    df_test = pd.read_csv(test_csv_path)
    X_train, y_train = df_train['review'], df_train['polarity']
    X_test, y_test = df_test['review'], df_test['polarity']
else:
    # Full path: load the raw data and clean it step by step.
    df_train, df_test = processor.parallel_load_data()

    # Step 1 - stop-word removal
    df_train_step1 = processor.remove_stopwords(df_train.copy())
    df_test_step1 = processor.remove_stopwords(df_test.copy())

    # Step 2 - sentence-length filter (threshold argument: 50)
    print('----------TRAIN DATA----------')
    df_train_step2 = processor.filter_by_length_of_sentence(df_train_step1.copy(),50)
    print('----------TEST DATA----------')
    df_test_step2 = processor.filter_by_length_of_sentence(df_test_step1.copy(),50)

    # Step 3 - sampling, then polarity mapping
    df_train_step3 = processor.sampling_data(df_train_step2, NUM_SAMPLE)
    df_test_step3 = processor.sampling_data(df_test_step2, int(NUM_SAMPLE*TEST_RATIO))
    df_train_step3 = processor.map_polarity(df_train_step3.copy())
    df_test_step3 = processor.map_polarity(df_test_step3.copy())

    # Separate features from labels
    X_train, y_train = processor.split_data(df_train_step3)
    X_test, y_test = processor.split_data(df_test_step3)

    # Persist the cleaned frames so the fast path above can be used next time
    os.makedirs(INPUT_DIR, exist_ok=True)
    df_train_step3.to_csv(train_csv_path, index=False)
    df_test_step3.to_csv(test_csv_path, index=False)

# Two feature representations of the same text: TF-IDF for the classical
# models, padded token sequences for the neural networks.
X_train_tfidf, X_test_tfidf = processor.vectorize_text(X_train, X_test)
X_train_pad, X_test_pad = processor.tokenization_and_padding(X_train, X_test)

# Re-read the sequence length from the processor after padding.
MAX_LENGTH = processor.max_length

01. Logistic Regression

>Original Accuracy: 85.42
<br>
>RandomizedSearchCV Accuracy: 85.42
<br>
>ElasticNet Accuracy: 85.45
<br>
>Best Model: ElasticNet with Accuracy: 85.45
<br>
Best Parameters: {'tol': 0.01, 'solver': 'saga', 'penalty': 'elasticnet', 'max_iter': 100, 'l1_ratio': 0.1, 'class_weight': None, 'C': 1.291549665014884}

In [7]:
# Step 1 - baseline fit on the TF-IDF features
logistic_regression = Logistic_Regression(verbose=1)
logistic_regression.train_model(X_train_tfidf, y_train)

# Step 2 - randomized search, plain and elastic-net, with the same search budget
lr_search_kwargs = dict(n_iter=1500, cv=10, random_state=42, n_jobs=-1)
logistic_regression.random_search(X_train_tfidf, y_train, **lr_search_kwargs)
logistic_regression.random_search_elasticnet(X_train_tfidf, y_train, **lr_search_kwargs)

# Compare the baseline with both searches and keep the winner's parameters
lr_candidates = {
    'Original': logistic_regression.model,
    'RandomizedSearchCV': logistic_regression.random_search_cv,
    'ElasticNet': logistic_regression.random_search_cv_elasticnet,
}
_, best_params = compare_models_accuracy_and_get_best_params(lr_candidates, X_test_tfidf, y_test)

# Step 3 - grid search seeded with the winning parameters
logistic_regression.grid_search(X_train_tfidf, y_train, cv=10, n_jobs=-1, best_params=best_params)

# Step 4 - final fit with the grid search's best parameters
logistic_regression.train_best_model(
    X_train_tfidf,
    y_train,
    logistic_regression.grid_search_cv.best_params_,
)

# Step 5 - record the metrics, then persist the models and parameters
evaluate_model_class(logistic_regression, X_test_tfidf, y_test)

logistic_regression.save_model_and_params(
    *(
        os.path.join(OUTPUT_MODELS_DIR, f'logistic_{artifact}.pkl')
        for artifact in ('model', 'best_model', 'best_params')
    )
)

Fitting 10 folds for each of 1500 candidates, totalling 15000 fits




02. XGBoost

In [None]:
# 1. Train Model
xgboost = XGBoost(verbose=1)
xgboost.train_model(X_train_tfidf, y_train)

# float32 copy used for the searches and the final fit. It gets its own name
# so that X_train_tfidf stays untouched: the cells further down then receive
# the same matrix whether or not this cell has been run.
X_train_tfidf_f32 = X_train_tfidf.astype(np.float32)

# 2. Random SearchCV
xgboost.random_search(X_train_tfidf_f32, y_train, n_iter=2000, cv=10, random_state=42, n_jobs=-1)

_, best_params = compare_models_accuracy_and_get_best_params({'Original': xgboost.model,
                                                              'RandomizedSearchCV': xgboost.random_search_cv}, X_test_tfidf.toarray(), y_test)

# 3. Grid SearchCV
xgboost.grid_search(X_train_tfidf_f32, y_train, cv=10,  n_jobs=-1, best_params=best_params)

# 4. Train Best Model
xgboost.train_best_model(X_train_tfidf_f32, y_train, xgboost.grid_search_cv.best_params_)

# 5. Evaluate and Save Models
# NOTE(review): a dedicated evaluate_xgboost_model_class helper exists but the
# generic evaluator is used here, with the sparse test matrix rather than the
# dense array used in the comparison above - confirm which is intended.
evaluate_model_class(xgboost, X_test_tfidf, y_test)

xgboost.save_model_and_params(
    os.path.join(OUTPUT_MODELS_DIR, 'xgboost_model.pkl'),
    os.path.join(OUTPUT_MODELS_DIR, 'xgboost_best_model.pkl'),
    os.path.join(OUTPUT_MODELS_DIR, 'xgboost_best_params.pkl')
    )

03. Naive Bayes

In [None]:
# Step 1 - baseline fit on the TF-IDF features
naive_bayes = Naive_Bayes(verbose=1)
naive_bayes.train_model(X_train_tfidf, y_train)

# Step 2 - randomized hyper-parameter search
naive_bayes.random_search(
    X_train_tfidf,
    y_train,
    n_iter=5000,
    cv=10,
    random_state=42,
    n_jobs=-1,
)

# Compare the baseline with the search result and keep the winner's parameters
nb_candidates = {
    'Original': naive_bayes.model,
    'RandomizedSearchCV': naive_bayes.random_search_cv,
}
_, best_params = compare_models_accuracy_and_get_best_params(nb_candidates, X_test_tfidf, y_test)

# Step 3 - grid search seeded with the winning parameters
naive_bayes.grid_search(X_train_tfidf, y_train, best_params=best_params, cv=10, n_jobs=-1)

# Step 4 - final fit with the grid search's best parameters
naive_bayes.train_best_model(
    X_train_tfidf,
    y_train,
    naive_bayes.grid_search_cv.best_params_,
)

# Step 5 - record the metrics, then persist the models and parameters
evaluate_model_class(naive_bayes, X_test_tfidf, y_test)

naive_bayes.save_model_and_params(
    *(
        os.path.join(OUTPUT_MODELS_DIR, f'naivebayes_{artifact}.pkl')
        for artifact in ('model', 'best_model', 'best_params')
    )
)

04. Recurrent Neural Network

In [None]:
# 1. Train Model
# Vocabulary size and sequence length are read from the preprocessor that
# produced X_train_pad / X_test_pad (as the CNN cell does) rather than being
# hard-coded, so the network's input settings follow the padded data.
# epochs and batch_size are kept at this cell's own values.
rnn = RNN(
    max_feature=processor.max_features,
    max_length=processor.max_length,
    epochs=10,
    batch_size=64,
    output_dim=128,
    optimizer='adam',
    embedding_dim=32,
    rnn_unit=64,
    verbose=1,
)
rnn.train_model(X_train_pad, y_train, validation_data=(X_test_pad, y_test))

# 2. Random SearchCV
rnn.random_search(X_train_pad, y_train, (X_test_pad, y_test), n_iter=3000, cv=10,  random_state=42, n_jobs=1)

_, best_params = compare_models_accuracy_and_get_best_params({'Original': rnn.model,
                                                              'RandomizedSearchCV': rnn.random_search_cv}, X_test_pad, y_test)

# 3. Grid SearchCV
rnn.grid_search(X_train_pad, y_train, (X_test_pad, y_test), cv=10, n_jobs=1, best_params=best_params)

# 4. Train Best Model
rnn.train_best_model(X_train_pad,y_train, validation_data=(X_test_pad, y_test), best_params= rnn.grid_search_cv.best_params_)

# 5. Evaluate and Save Models
evaluate_model_class(rnn,X_test_pad, y_test)

rnn.save_model_and_params(
    os.path.join(OUTPUT_MODELS_DIR, 'rnn_model.pkl'),
    os.path.join(OUTPUT_MODELS_DIR, 'rnn_best_model.pkl'),
    os.path.join(OUTPUT_MODELS_DIR, 'rnn_best_params.pkl')
    )

In [None]:
# Training curves for the baseline RNN first, then for the tuned RNN
for rnn_attr, rnn_plot_title in (
    ("model", "Initial Model Training History"),
    ("best_model", "Best Model Training History"),
):
    plot_training_history_from_dict(getattr(rnn, rnn_attr).history_, title=rnn_plot_title)

05. Convolutional Neural Network

In [None]:
# Step 1 - baseline fit; sizes come from the preprocessor, schedule from the config cell
cnn = CNN(
    max_feature=processor.max_features,
    max_length=processor.max_length,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    output_dim=128,
    optimizer='adam',
    embedding_dim=32,
    verbose=1,
)
# Training data plus the (features, labels) pair used as validation data
cnn_fit_args = (X_train_pad, y_train, (X_test_pad, y_test))
cnn.train_model(*cnn_fit_args)

# Step 2 - randomized hyper-parameter search
cnn.random_search(*cnn_fit_args, n_iter=3000, cv=10, random_state=42, n_jobs=1, patience=3)

# Compare the baseline with the search result and keep the winner's parameters
cnn_candidates = {
    'Original': cnn.model,
    'RandomizedSearchCV': cnn.random_search_cv,
}
_, best_params = compare_models_accuracy_and_get_best_params(cnn_candidates, X_test_pad, y_test)

# Step 3 - grid search seeded with the winning parameters
cnn.grid_search(*cnn_fit_args, best_params=best_params, cv=10, n_jobs=1, patience=3)

# Step 4 - final fit with the grid search's best parameters
cnn.train_best_model(*cnn_fit_args, best_params=cnn.grid_search_cv.best_params_)

# Step 5 - record the metrics, then persist the models and parameters
evaluate_model_class(cnn, X_test_pad, y_test)

cnn.save_model_and_params(
    *(
        os.path.join(OUTPUT_MODELS_DIR, f'cnn_{artifact}.pkl')
        for artifact in ('model', 'best_model', 'best_params')
    )
)

In [None]:
# Training curves for the baseline CNN first, then for the tuned CNN
for cnn_attr, cnn_plot_title in (
    ("model", "Initial Model Training History"),
    ("best_model", "Best Model Training History"),
):
    plot_training_history_from_dict(getattr(cnn, cnn_attr).history_, title=cnn_plot_title)

06. Bidirectional Encoder Representations from Transformers (BERT)

In [None]:
# bert = BERT(max_length=processor.max_features, epochs=1, batch_size=BATCH_SIZE, verbose=1)
# bert.train_model(X_train, y_train, X_test, y_test)

# bert.random_search(X_train, y_train, X_test, y_test, max_trials=2, executions_per_trial=1, n_jobs=1)

# # Predict on test data
# y_pred_prob = bert_model.predict({'input_ids': bert.X_test_tokens['input_ids'], 'attention_mask': bert.X_test_tokens['attention_mask']}).logits
# y_pred_bert = np.argmax(y_pred_prob, axis=-1)

07. Bidirectional Long Short-Term Memory (BiLSTM)

In [None]:
# Step 1 - baseline fit; the wrapper receives the fitted tokenizer and the
# training schedule from the config cell
bilstm = BiLSTM(processor.tokenizer, EPOCHS, BATCH_SIZE, verbose=1)
# Positional data arguments shared by every call below
bilstm_data = (X_train_pad, y_train, X_test_pad, y_test)
bilstm.train_model(*bilstm_data)

# Step 2 - randomized hyper-parameter search
bilstm.random_search(*bilstm_data, n_iter=3000, cv=10, random_state=42, n_jobs=1)

# Compare the baseline with the search result and keep the winner's parameters
bilstm_candidates = {
    'Original': bilstm.model,
    'RandomizedSearchCV': bilstm.random_search_cv,
}
_, best_params = compare_models_accuracy_and_get_best_params(bilstm_candidates, X_test_pad, y_test)

# Step 3 - grid search seeded with the winning parameters
bilstm.grid_search(*bilstm_data, best_params=best_params, cv=10, n_jobs=1, patience=3)

# Step 4 - final fit with the grid search's best parameters
bilstm.train_best_model(
    *bilstm_data,
    best_params=bilstm.grid_search_cv.best_params_,
    patience=3,
)

# Step 5 - record the metrics, then persist the models and parameters
evaluate_model_class(bilstm, X_test_pad, y_test)

bilstm.save_model_and_params(
    *(
        os.path.join(OUTPUT_MODELS_DIR, f'bilstm_{artifact}.pkl')
        for artifact in ('model', 'best_model', 'best_params')
    )
)

In [None]:
# Training curves for the baseline BiLSTM first, then for the tuned BiLSTM
for bilstm_attr, bilstm_plot_title in (
    ("model", "Initial Model Training History"),
    ("best_model", "Best Model Training History"),
):
    plot_training_history_from_dict(getattr(bilstm, bilstm_attr).history_, title=bilstm_plot_title)

In [None]:
df_results = pd.DataFrame(results)