In [1]:
# importing libraries
import time
import joblib
import os
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf

from prototype.logstic_regression import Logistic_Regression
from prototype.XGBoost import XGBoost
from prototype.naive_bayes import Naive_Bayes
from prototype.rnn import RNN
from prototype.cnn import CNN
from prototype.bert import BERT
from prototype.bilstm import BiLSTM

# Load the TextPreprocessor class (assumed to be defined already)
from textpreprocessor import TextPreprocessor

import warnings
warnings.filterwarnings("ignore")

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

2024-09-23 16:27:36.153144: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-23 16:27:36.161595: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-23 16:27:36.164122: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-23 16:27:36.170646: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm
I00

In [2]:
NUM_SAMPLE = 10000
TEST_RATIO=0.2
BATCH_SIZE=32
EPOCHS = 5
MAX_WORD_COUNT = 5000
MAX_LENGTH = 100
OUTPUT_DIR = "Output/proto_models"

In [3]:
# Initialize the Text Preprocessor
processor = TextPreprocessor(MAX_WORD_COUNT, MAX_LENGTH)

# Load data
df_train, df_test = processor.load_data(num_sample=NUM_SAMPLE, test_ratio=TEST_RATIO)

# Preprocess data
df_train = processor.preprocess(df_train)
df_test = processor.preprocess(df_test)

# Split data
X_train, y_train = processor.split_data(df_train)
X_test, y_test = processor.split_data(df_test)

X_train_tfidf, X_test_tfidf = processor.vectorize_text(X_train, X_test)
X_train_pad, X_test_pad = processor.tokenization_and_padding(X_train, X_test)

Sampled Data Shape:
(10000, 3)
Sampled Data Shape:
(2000, 3)


01. Logistic Regression

In [4]:
logistic_regression = Logistic_Regression()
logistic_model = logistic_regression.train_model(X_train_tfidf, y_train)
y_pred_logistic = logistic_model.predict(X_test_tfidf)

02. XGBoost

In [5]:
xgboost = XGBoost()
xgb_model = xgboost.train_model(X_train_tfidf, y_train)

# Make predictions
y_pred_prob_xgb = xgb_model.predict(xgboost.convert_to_dmatrix(X_test_tfidf, y_test))
y_pred_xgb = [1 if prob > 0.5 else 0 for prob in y_pred_prob_xgb]

03. Naive Bayes

In [6]:
naive_bayes = Naive_Bayes()
nb_model = naive_bayes.train_model(X_train_tfidf, y_train)

# Making predictions
y_pred_nb = nb_model.predict(X_test_tfidf)

04. Recurrent Neural Network

In [7]:
rnn = RNN(processor.max_features, processor.max_length, EPOCHS, BATCH_SIZE)
rnn_model = rnn.train_model(X_train_pad, y_train, X_test_pad, y_test)

# Evaluate the model
y_pred_prob = rnn_model.predict(X_test_pad)
y_pred_rnn = (y_pred_prob > 0.5).astype("int32")

Epoch 1/5


I0000 00:00:1727101668.249242  274736 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1727101668.249338  274736 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1727101668.249359  274736 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1727101668.257179  274736 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1727101668.257281  274736 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-09-23

[1m  7/313[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 21ms/step - accuracy: 0.5324 - loss: 0.6915

I0000 00:00:1727101669.308641  275004 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.5833 - loss: 0.6512 - val_accuracy: 0.7875 - val_loss: 0.4625
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - accuracy: 0.8693 - loss: 0.3161 - val_accuracy: 0.7625 - val_loss: 0.5223
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 0.9598 - loss: 0.1249 - val_accuracy: 0.7790 - val_loss: 0.5855
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 0.9878 - loss: 0.0452 - val_accuracy: 0.7205 - val_loss: 0.8109
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 28ms/step - accuracy: 0.9901 - loss: 0.0299 - val_accuracy: 0.7310 - val_loss: 0.9411
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step


05. Convolutional Neural Network

In [8]:
cnn = CNN(processor.max_features, processor.max_length, EPOCHS, BATCH_SIZE)
cnn_model = cnn.train_model(X_train_pad, y_train, X_test_pad, y_test)

# Evaluate the model
y_pred_prob = rnn_model.predict(X_test_pad)
y_pred_cnn = (y_pred_prob > 0.5).astype("int32")

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6564 - loss: 0.6014 - val_accuracy: 0.8415 - val_loss: 0.3569
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9142 - loss: 0.2205 - val_accuracy: 0.8420 - val_loss: 0.3617
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9886 - loss: 0.0561 - val_accuracy: 0.8505 - val_loss: 0.4702
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9999 - loss: 0.0082 - val_accuracy: 0.8510 - val_loss: 0.5384
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0013 - val_accuracy: 0.8500 - val_loss: 0.5776
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step


06. Bidirectional Encoder Representations from Transformers(BERT)

In [9]:
bert = BERT(processor.max_features, processor.max_length, EPOCHS, BATCH_SIZE)
bert_model = bert.train_model(X_train, y_train, X_test, y_test)

# Predict on test data
y_pred_prob = bert_model.predict({'input_ids': bert.X_test_tokens['input_ids'], 'attention_mask': bert.X_test_tokens['attention_mask']}).logits
y_pred_bert = np.argmax(y_pred_prob, axis=-1)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


07. Bidirectional Long Short-Term Memory (BiLSTM)

In [10]:
bilstm = BiLSTM(processor.tokenizer, EPOCHS, BATCH_SIZE)
bilstm_model = bilstm.train_model(X_train_pad, y_train, X_test_pad,y_test)

# Evaluate the model
y_pred_prob = bilstm_model.predict(X_test_pad)
y_pred_bilstm = (y_pred_prob > 0.5).astype("int32")

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 19ms/step - accuracy: 0.6264 - loss: 0.6304 - val_accuracy: 0.8070 - val_loss: 0.4262
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.8621 - loss: 0.3271 - val_accuracy: 0.8330 - val_loss: 0.3909
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.9140 - loss: 0.2187 - val_accuracy: 0.8295 - val_loss: 0.4609
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.9451 - loss: 0.1458 - val_accuracy: 0.8180 - val_loss: 0.5280
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.9718 - loss: 0.0886 - val_accuracy: 0.8045 - val_loss: 0.6203
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [14]:
# Dictionary to store the results
results = {
    'Model': [],
    'Training-Time':[],
    'Accuracy': [],
    'Precision (Class 0)': [],
    'Precision (Class 1)': [],
    'Recall (Class 0)': [],
    'Recall (Class 1)': [],
    'F1-Score (Class 0)': [],
    'F1-Score (Class 1)': []
}

# Function to calculate accuracy and classification report
def evaluate_model(model_class, y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred) * 100
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Store the results
    results['Model'].append(model_class.model_name)
    results['Training-Time'].append(model_class.training_time)
    results['Accuracy'].append(accuracy)
    results['Precision (Class 0)'].append(report['0']['precision'])
    results['Precision (Class 1)'].append(report['1']['precision'])
    results['Recall (Class 0)'].append(report['0']['recall'])
    results['Recall (Class 1)'].append(report['1']['recall'])
    results['F1-Score (Class 0)'].append(report['0']['f1-score'])
    results['F1-Score (Class 1)'].append(report['1']['f1-score'])

# Call the function with your actual predictions (replace placeholders with your data)
evaluate_model(logistic_regression, y_test, y_pred_logistic)
evaluate_model(xgboost, y_test, y_pred_xgb)
evaluate_model(naive_bayes, y_test, y_pred_nb)
evaluate_model(rnn, y_test, y_pred_rnn)
evaluate_model(cnn, y_test, y_pred_cnn)
evaluate_model(bert, y_test, y_pred_bert)
evaluate_model(bilstm, y_test, y_pred_bilstm)

# Create a DataFrame from the results
df_results = pd.DataFrame(results)

# Create the directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save result to an Excel file
df_results.to_excel(os.path.join(OUTPUT_DIR, f'result(epoch{EPOCHS}_batch{BATCH_SIZE}_sample{NUM_SAMPLE}_ratio{TEST_RATIO}).xlsx'), index=False)

In [16]:
df_results

Unnamed: 0,Model,Training-Time,Accuracy,Precision (Class 0),Precision (Class 1),Recall (Class 0),Recall (Class 1),F1-Score (Class 0),F1-Score (Class 1)
0,Logistic_Regression,0.025306,84.35,0.837233,0.849558,0.843238,0.84375,0.840225,0.846644
1,XGBoost,1.120255,82.4,0.823651,0.824324,0.813525,0.833984,0.818557,0.829126
2,Naive_Bayes,0.00281,81.95,0.808425,0.830508,0.82582,0.813477,0.81703,0.821904
3,RNN,44.071815,73.1,0.75,0.716192,0.673156,0.786133,0.709503,0.749534
4,CNN,4.505993,73.1,0.75,0.716192,0.673156,0.786133,0.709503,0.749534
5,BERT,402.121258,92.05,0.904055,0.93731,0.936475,0.905273,0.91998,0.921013
6,BiLSTM,30.718415,80.45,0.803738,0.805207,0.793033,0.81543,0.79835,0.810286


In [15]:

# Save Models
joblib.dump(logistic_model, os.path.join(OUTPUT_DIR, 'logistic_regression_model.pkl'))
joblib.dump(xgb_model, os.path.join(OUTPUT_DIR, 'xgboost_model.pkl'))
joblib.dump(nb_model, os.path.join(OUTPUT_DIR, 'naive_bayes_model.pkl'))
rnn_model.save(os.path.join(OUTPUT_DIR, 'rnn_model.h5'))  # or .tf
cnn_model.save(os.path.join(OUTPUT_DIR, 'cnn_model.h5'))
bert_model.save(os.path.join(OUTPUT_DIR, 'bert_model'), save_format='tf')
bilstm_model.save(os.path.join(OUTPUT_DIR, 'bilstm_model.h5'))





INFO:tensorflow:Assets written to: Output/proto_models/bert_model/assets


INFO:tensorflow:Assets written to: Output/proto_models/bert_model/assets


In [17]:
joblib.dump(processor.tokenizer, os.path.join(OUTPUT_DIR, 'processor_tokenizer.pkl'))
joblib.dump(bert.X_train_tokens, os.path.join(OUTPUT_DIR, 'bert_X_train_tokens.pkl'))
joblib.dump(bert.X_test_tokens, os.path.join(OUTPUT_DIR, 'bert_X_test_tokens.pkl'))

['Output/proto_models/bert_X_test_tokens.pkl']