## Import libraries

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import custom_object_scope
from transformers import AutoTokenizer
from transformers import TFBertModel

## Load the dataset

In [5]:
# Location of the dataset CSV; later cells read its `cleaned_text` and
# `type` columns.
data_path = "cleaned_mbti_data.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

## Tokenization

In [6]:
# Settings for the word-level Keras tokenizer and for sequence padding.
# NOTE(review): these presumably have to match the values used when the
# LSTM models were trained -- confirm against the training notebook.
vocab_size, maxlen = 10000, 1500
trunc_type = pad_type = "post"
oov_tok = "<OOV>"

# Build the tokenizer and learn its word index from every cleaned post.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(data["cleaned_text"].values)

## Convert text to padded sequences

In [7]:
# Map each post to a list of integer word ids, then pad / truncate every
# list to exactly `maxlen` entries so the result is one rectangular array.
sequences = tokenizer.texts_to_sequences(data["cleaned_text"].values)
padded_sequences = pad_sequences(
    sequences,
    maxlen=maxlen,
    padding=pad_type,
    truncating=trunc_type,
)


## Convert labels to one-hot vectors

In [8]:
# Sorted array of the distinct labels found in the `type` column
# (np.unique returns its result sorted).
types = np.unique(data["type"].values)

# Label -> position lookup, built once so that encoding a row is a dict hit
# instead of rebuilding a list and scanning it for every row.
_type_to_index = {label: position for position, label in enumerate(types)}


def get_type_index(string):
    """Return the position of label `string` within the sorted `types` array.

    Parameters
    ----------
    string : str
        A label taken from the `type` column.

    Returns
    -------
    int
        Zero-based index of the label in `types`.

    Raises
    ------
    ValueError
        If `string` is not one of the labels in `types`.
    """
    try:
        return _type_to_index[string]
    except KeyError:
        raise ValueError(f"{string!r} is not in list") from None


data["type_index"] = data["type"].apply(get_type_index)

# The one-hot width is fixed at 16 (presumably the 16 MBTI types) rather than
# derived from the data, so it does not shrink if a type is absent from the CSV.
labels = tf.keras.utils.to_categorical(data.type_index.values, num_classes=16)

In [9]:
# Hold out 20% of the rows for evaluation. Only the test portions are kept;
# the training portions are discarded into `_`. The fixed random_state makes
# the split reproducible across runs.
_, test_X, _, test_y = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

## Load trained models

In [10]:
print("Loading trained models...")

# The two recurrent models load without any custom objects.
lstm_model = load_model("models/lstm_model.h5")
bilstm_model = load_model("models/bilstm_model.h5")

# Register TFBertModel under its class name while loading, so Keras can
# resolve that name if it appears in the saved model's config.
# NOTE(review): presumably the checkpoint wraps a Hugging Face TFBertModel
# layer -- confirm against the training notebook.
with custom_object_scope({'TFBertModel': TFBertModel}):
    bert_model = load_model("models/bert_model.h5")

Loading trained models...


2025-02-21 11:30:01.654268: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-21 11:30:01.768777: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-21 11:30:01.768964: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

## Evaluate the models

In [11]:
print("Evaluating models...\n")

# Score both recurrent models on the same held-out split, LSTM first.
# Each evaluate() call is unpacked as a (loss, accuracy) pair.
(lstm_loss, lstm_acc), (bilstm_loss, bilstm_acc) = (
    candidate.evaluate(test_X, test_y, verbose=1)
    for candidate in (lstm_model, bilstm_model)
)

Evaluating models...



2025-02-21 11:30:42.548099: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902




In [12]:
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode every post as a fixed-length row of BERT token ids.
# `padding="max_length"` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes explicit the truncation the tokenizer was already
# applying implicitly (with a warning) whenever `max_length` was given.
# NOTE(review): maxlen (1500) is larger than the 512 positions that
# bert-base-uncased supports -- confirm the saved BERT model was really built
# for inputs of this length.
all_input_ids = np.array([
    bert_tokenizer.encode(
        str(text),
        max_length=maxlen,
        padding="max_length",
        truncation=True,
    )
    for text in data.cleaned_text.values
])

# Same test_size and random_state as the split that produced test_X, so the
# same rows are held out; test_y is therefore re-derived with identical content.
_, test_input_ids, _, test_y = train_test_split(
    all_input_ids,
    labels,
    test_size=0.2,
    random_state=42,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
# Score the BERT model on its own token-id encoding of the held-out rows;
# the result is unpacked as a (loss, accuracy) pair.
bert_loss, bert_acc = bert_model.evaluate(
    x=test_input_ids,
    y=test_y,
    verbose=1,
)



## Model comparison

In [14]:
# Gather the loss / accuracy pair of each evaluated model into one table,
# one row per model. (pandas is already imported in the first cell, so it is
# not re-imported here.)
results_df = pd.DataFrame({
    "Model": ["LSTM", "Bi-Directional LSTM", "BERT"],
    "Accuracy": [lstm_acc, bilstm_acc, bert_acc],
    "Loss": [lstm_loss, bilstm_loss, bert_loss]
})

print("Model Performance Comparison:")
print(results_df)

# Persist the table without the positional index column.
results_df.to_csv("model_comparison_results.csv", index=False)

print("Evaluation complete! Results saved in 'model_comparison_results.csv'.")

Model Performance Comparison:
                 Model  Accuracy      Loss
0                 LSTM  0.254179  2.198353
1  Bi-Directional LSTM  0.530259  1.713998
2                 BERT  0.858213  0.650264
Evaluation complete! Results saved in 'model_comparison_results.csv'.
