In [37]:
# Imports 
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping
import re

In [38]:
# Load the dataset
data = pd.read_csv('mbti_1.csv', encoding='latin1', on_bad_lines='skip')

# Display the first few rows of the dataset
print(data.head())

   type                                              posts
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1  ENTP  'I'm finding the lack of me in these posts ver...
2  INTP  'Good one  _____   https://www.youtube.com/wat...
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...
4  ENTJ  'You're fired.|||That's another silly misconce...


In [39]:
# Split posts into lists
data['posts'] = data['posts'].apply(lambda x: x.split('|||'))

# Display the first few rows of the dataset
print(data.head())


   type                                              posts
0  INFJ  ['http://www.youtube.com/watch?v=qsXHcwe3krw, ...
1  ENTP  ['I'm finding the lack of me in these posts ve...
2  INTP  ['Good one  _____   https://www.youtube.com/wa...
3  INTJ  ['Dear INTP,   I enjoyed our conversation the ...
4  ENTJ  ['You're fired., That's another silly misconce...


tokenization + padding

In [40]:
# Now that the data has been split into individual posts, we can start cleaning that and then tokenizing it
# we can use the tokeniser on each post from each row

# As there are also some links and special characters in the posts, we can remove them using regex

# Clean and preprocess text
def clean_text(text):
    # Remove links
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and digits (including spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove single characters
    text = ' '.join([w for w in text.split() if len(w) > 1])
    return text

data['posts'] = data['posts'].apply(lambda x: [clean_text(post) for post in x])

# Display cleaned posts
print(data.head())

   type                                              posts
0  INFJ  [, , enfp and intj moments sportscenter not to...
1  ENTP  [Im finding the lack of me in these posts very...
2  INTP  [Good one, Of course to which say know thats m...
3  INTJ  [Dear INTP enjoyed our conversation the other ...
4  ENTJ  [Youre fired, Thats another silly misconceptio...


In [None]:
# USE NLTK TO REMOVE STOPWORDS - Manya

In [46]:
# Split data into training and testing sets
trainData, testData = train_test_split(data, test_size=0.2, random_state=42)

# Now we can tokenize the data

# We should tokenise each post in each row of the data and then pad the sequences
# This will allow us to feed the data into the model

# Given that we want to create train and test variables to store the tokenised data, we should store them as such

train = trainData['posts']
test = testData['posts']

# We can now fit the tokenizer on the training and test data
# Tokenization and sequence padding
tokenizer_train = Tokenizer()
tokenizer_train.fit_on_texts(trainData['posts'])
tokenizer_test = Tokenizer()
tokenizer_test.fit_on_texts(testData['posts'])

trainSequences = tokenizer_train.texts_to_sequences(trainData['posts'])
testSequences = tokenizer_test.texts_to_sequences(testData['posts'])

maxlen = 100
trainSeq = pad_sequences(trainSequences, maxlen=maxlen, padding='post', truncating='post')
testSeq = pad_sequences(testSequences, maxlen=maxlen, padding='post', truncating='post')

# Displaying the data to ensure that it has been tokenised and padded correctly
print("tokenised and padded data for train:")
# Print the first 5 rows of the data
print(trainSeq[:5])

print("The shape is ", trainSeq.shape)


tokenised and padded data for train:
[[2091 2092  667 2093 2094  668 2095 2096 2097 2098 2099 2100 2101 2102
  2103  432 2104 2105    1 2106 2107 2108 2109 2110 2111 2112 2113 2114
  2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128
  2129 2130  669 2131 2132 2133 2134 2135    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [2136 2137 2138  670 2139 2140 2141 2142 2143 2144  143 2145 2146 2147
  2148 2149 2150 2151 2152 2153 2154 2155  671 2156 2157 2158 2159 2160
  2161 2162 2163 2164 2165 2166 2167 2168  300 2169 2170 2171 2172 2173
  2174 2175 2176 2177 2178 2179 2180 2181    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0   

encoding

In [43]:
# Encode MBTI type labels
labelEnc = LabelEncoder()
labelEnc.fit(data['type'])

trainLabels = labelEnc.transform(trainData['type'])
testLabels = labelEnc.transform(testData['type'])

# Display encoded labels and their counts in the training data
display(data['type'].value_counts())
display(trainLabels)

type
INFP    1823
INFJ    1460
INTP    1297
INTJ    1083
ENTP     684
ENFP     672
ISTP     335
ISFP     269
ENTJ     228
ISTJ     202
ENFJ     190
ISFJ     165
ESTP      88
ESFP      48
ESFJ      42
ESTJ      39
Name: count, dtype: int64

array([ 9,  0,  9, ..., 10, 10,  1])

model definition

In [44]:
# Model Architecture
embedding_dim = 100
lstm_units = 128
num_classes = len(labelEnc.classes_)

model = Sequential()
model.add(Embedding(len(tokenizer_train.word_index) + 1, embedding_dim))
model.add(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

In [45]:
# Compile the model
optimizer = RMSprop(learning_rate=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
num_epochs = 10
batch_size = 32
validation_split = 0.2
patience = 3

model.fit(trainSeq, trainLabels, epochs=num_epochs, batch_size=batch_size,
          validation_split=validation_split, callbacks=[EarlyStopping(patience=patience)])


Epoch 1/10


[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 84ms/step - accuracy: 0.1889 - loss: 2.3515 - val_accuracy: 0.1826 - val_loss: 2.2572
Epoch 2/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 84ms/step - accuracy: 0.1892 - loss: 2.3122 - val_accuracy: 0.1826 - val_loss: 2.2504
Epoch 3/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 81ms/step - accuracy: 0.1884 - loss: 2.2987 - val_accuracy: 0.2232 - val_loss: 2.2510
Epoch 4/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 83ms/step - accuracy: 0.1953 - loss: 2.3009 - val_accuracy: 0.2232 - val_loss: 2.2482
Epoch 5/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 83ms/step - accuracy: 0.1939 - loss: 2.3059 - val_accuracy: 0.2232 - val_loss: 2.2560
Epoch 6/10
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 84ms/step - accuracy: 0.1979 - loss: 2.2941 - val_accuracy: 0.2232 - val_loss: 2.2493
Epoch 7/10
[1m173/173[0m 

<keras.src.callbacks.history.History at 0x35666d940>

Evaluating model

In [47]:
testLoss, testAcc = model.evaluate(testSeq, testLabels)
print(f"Test Loss: {testLoss}")
print(f"Test Accuracy: {testAcc}")

[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.2260 - loss: 2.3150
Test Loss: 2.2814767360687256
Test Accuracy: 0.2278260886669159


In [48]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [49]:
# Predict probabilities for each class
y_pred_probabilities = model.predict(testSeq)

# Convert probabilities to class labels (argmax to get the index of the highest probability)
y_pred = np.argmax(y_pred_probabilities, axis=1)

# Calculate accuracy
accuracy = accuracy_score(testLabels, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Calculate precision, recall, and F1-score
precision = precision_score(testLabels, y_pred, average='weighted')
recall = recall_score(testLabels, y_pred, average='weighted')
f1 = f1_score(testLabels, y_pred, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Calculate confusion matrix
conf_matrix = confusion_matrix(testLabels, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
Accuracy: 0.2278
Precision: 0.0519
Recall: 0.2278
F1-score: 0.0845
Confusion Matrix:
[[  0   0   0   0   0   0   0   0   0  33   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 114   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  36   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 126   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  10   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   6   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  21   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 262   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 393   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 258   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 244   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Five fold cross validation

In [50]:
from sklearn.model_selection import KFold, StratifiedKFold

In [51]:
# Define number of folds
n_splits = 5

# Initialize StratifiedKFold for classification
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [52]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define number of folds
n_splits = 5

# Initialize StratifiedKFold for classification
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics across folds
accuracies = []
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []

for train_idx, val_idx in skf.split(data['posts'], data['type']):
    # Split data into train and validation sets based on fold indices
    X_train, X_val = data['posts'][train_idx], data['posts'][val_idx]
    y_train, y_val = data['type'][train_idx], data['type'][val_idx]

    # Encode MBTI type labels to numeric labels
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_val_encoded = label_encoder.transform(y_val)

    # Tokenization and Padding
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    max_len = 100  # Adjust max length as needed

    X_train_sequences = tokenizer.texts_to_sequences(X_train)
    X_val_sequences = tokenizer.texts_to_sequences(X_val)

    X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len, padding='post', truncating='post')
    X_val_padded = pad_sequences(X_val_sequences, maxlen=max_len, padding='post', truncating='post')

    # Define and compile LSTM model
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index) + 1, 100, input_length=max_len))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Output layer with number of classes

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train LSTM model
    model.fit(X_train_padded, y_train_encoded, epochs=10, batch_size=32, verbose=0)

    # Evaluate LSTM model on validation data
    y_pred_encoded = np.argmax(model.predict(X_val_padded), axis=1)

    # Decode predicted labels back to MBTI types
    y_pred_decoded = label_encoder.inverse_transform(y_pred_encoded)

    # Calculate evaluation metrics for this fold
    accuracy = accuracy_score(y_val, y_pred_decoded)
    precision = precision_score(y_val, y_pred_decoded, average='weighted')
    recall = recall_score(y_val, y_pred_decoded, average='weighted')
    f1 = f1_score(y_val, y_pred_decoded, average='weighted')
    conf_matrix = confusion_matrix(y_val, y_pred_decoded)

    # Append metrics to lists
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    confusion_matrices.append(conf_matrix)

    # Print or display metrics for this fold
    print(f"Fold Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)

# Calculate average metrics across all folds
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1_scores)

# Print or display average metrics across all folds
print("\nAverage Metrics Across Folds:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-score: {avg_f1:.4f}")

# Display average confusion matrix
avg_conf_matrix = np.mean(confusion_matrices, axis=0)
print("Average Confusion Matrix:")
print(avg_conf_matrix)

