In [1]:
# Imports 
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [20]:
# Load the dataset
data = pd.read_csv('mbti_1.csv', encoding='latin1', on_bad_lines='skip')

# Display the first few rows of the dataset
print(data.head())

   type                                              posts
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1  ENTP  'I'm finding the lack of me in these posts ver...
2  INTP  'Good one  _____   https://www.youtube.com/wat...
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...
4  ENTJ  'You're fired.|||That's another silly misconce...


In [21]:
# Split posts into lists
data['posts'] = data['posts'].apply(lambda x: x.split('|||'))

# Display the first few rows of the dataset
print(data.head())


   type                                              posts
0  INFJ  ['http://www.youtube.com/watch?v=qsXHcwe3krw, ...
1  ENTP  ['I'm finding the lack of me in these posts ve...
2  INTP  ['Good one  _____   https://www.youtube.com/wa...
3  INTJ  ['Dear INTP,   I enjoyed our conversation the ...
4  ENTJ  ['You're fired., That's another silly misconce...


tokenization + padding

In [17]:
# Now that the data has been split into individual posts, we can start cleaning that and then tokenizing it
# we can use the tokeniser on each post from each row

# As there are also some links and special characters in the posts, we can remove them using regex

# Clean and preprocess text
def clean_text(text):
    # Remove links
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and digits (including spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove single characters
    text = ' '.join([w for w in text.split() if len(w) > 1])
    return text

data['posts'] = data['posts'].apply(lambda x: [clean_text(post) for post in x])

# Display cleaned posts
print(data.head())

   type                                              posts
0  INFJ  [, , enfp and intj moments sportscenter not to...
1  ENTP  [Im finding the lack of me in these posts very...
2  INTP  [Good one, Of course to which say know thats m...
3  INTJ  [Dear INTP enjoyed our conversation the other ...
4  ENTJ  [Youre fired, Thats another silly misconceptio...


In [22]:
# Now that the data has been split into individual posts, we can start cleaning that and then tokenizing it
# we can use the tokeniser on each post from each row

# As there are also some links and special characters in the posts, we can remove them using regex

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Clean and preprocess text
def clean_post(post):
    # Remove links
    post = re.sub(r'http\S+', '', post)
    # Remove special characters and digits (including spaces)
    post = re.sub(r'[^a-zA-Z\s]', '', post)
    # Convert to lowercase and split into words
    words = post.lower().split()
    # Remove stopwords
    words = [w for w in words if w not in stop_words]
    # Remove single characters
    words = [w for w in words if len(w) > 1]
    # Join words back into a cleaned post
    cleaned_post = ' '.join(words)
    return cleaned_post

# Apply the clean_post function to each post in the 'posts' column
data['cleaned_posts'] = data['posts'].apply(lambda posts: [clean_post(post) for post in posts])

# Display cleaned posts
print(data.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   type                                              posts  \
0  INFJ  ['http://www.youtube.com/watch?v=qsXHcwe3krw, ...   
1  ENTP  ['I'm finding the lack of me in these posts ve...   
2  INTP  ['Good one  _____   https://www.youtube.com/wa...   
3  INTJ  ['Dear INTP,   I enjoyed our conversation the ...   
4  ENTJ  ['You're fired., That's another silly misconce...   

                                       cleaned_posts  
0  [, , enfp intj moments sportscenter top ten pl...  
1  [im finding lack posts alarming, sex boring po...  
2  [good one, course say know thats blessing curs...  
3  [dear intp enjoyed conversation day esoteric g...  
4  [youre fired, thats another silly misconceptio...  


In [24]:
# Split data into training and testing sets
trainData, testData = train_test_split(data, test_size=0.2, random_state=42)

# Now we can tokenize the data

# We should tokenise each post in each row of the data and then pad the sequences
# This will allow us to feed the data into the model

# Given that we want to create train and test variables to store the tokenised data, we should store them as such

train = trainData['posts']
test = testData['posts']

# We can now fit the tokenizer on the training and test data
# Tokenization and sequence padding
tokenizer_train = Tokenizer()
tokenizer_train.fit_on_texts(trainData['posts'])
tokenizer_test = Tokenizer()
tokenizer_test.fit_on_texts(testData['posts'])

trainSequences = tokenizer_train.texts_to_sequences(trainData['posts'])
testSequences = tokenizer_test.texts_to_sequences(testData['posts'])

maxlen = 100
trainSeq = pad_sequences(trainSequences, maxlen=maxlen, padding='post', truncating='post')
testSeq = pad_sequences(testSequences, maxlen=maxlen, padding='post', truncating='post')

# Displaying the data to ensure that it has been tokenised and padded correctly
print("tokenised and padded data for train:")
# Print the first 5 rows of the data
print(trainSeq[:5])

print("The shape is ", trainSeq.shape)


tokenised and padded data for train:
[[1534 1535  443 1536 1537  444 1538 1539 1540 1541 1542 1543 1544 1545
  1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559
  1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573
  1574 1575 1576 1577 1578 1579 1580 1581    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [1582 1583 1584 1585 1586 1587 1588 1589 1590 1591  445 1592 1593 1594
  1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608
  1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622
  1623 1624 1625 1626 1627 1628 1629 1630    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0   

encoding

In [25]:
# Encode MBTI type labels
labelEnc = LabelEncoder()
labelEnc.fit(data['type'])

trainLabels = labelEnc.transform(trainData['type'])
testLabels = labelEnc.transform(testData['type'])

# Display encoded labels and their counts in the training data
display(data['type'].value_counts())
display(trainLabels)

INFP    1823
INFJ    1460
INTP    1297
INTJ    1083
ENTP     684
ENFP     672
ISTP     335
ISFP     269
ENTJ     228
ISTJ     202
ENFJ     190
ISFJ     165
ESTP      88
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

array([ 9,  0,  9, ..., 10, 10,  1])

model definition

In [26]:
# Model Architecture
embedding_dim = 100
lstm_units = 128
num_classes = len(labelEnc.classes_)

model = Sequential()
model.add(Embedding(len(tokenizer_train.word_index) + 1, embedding_dim))
model.add(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

In [None]:
# Compile the model
optimizer = RMSprop(learning_rate=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model
num_epochs = 10
batch_size = 32
validation_split = 0.2
patience = 3

model.fit(trainSeq, trainLabels, epochs=num_epochs, batch_size=batch_size,
          validation_split=validation_split, callbacks=[EarlyStopping(patience=patience)])


model definition updated

In [36]:
embedding_dim = 100
lstm_units = 128
num_classes = len(labelEnc.classes_)

model = Sequential()
model.add(Embedding(len(tokenizer_train.word_index) + 1, embedding_dim, input_length=maxlen))
model.add(SpatialDropout1D(0.2))  # Spatial dropout to prevent overfitting

model.add(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))

model.add(Dense(num_classes, activation='softmax'))

In [37]:
optimizer = Adam(learning_rate=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [38]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          33152300  
                                                                 
 spatial_dropout1d (SpatialD  (None, 100, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm_1 (LSTM)               (None, 128)               117248    
                                                                 
 dense_1 (Dense)             (None, 16)                2064      
                                                                 
Total params: 33,271,612
Trainable params: 33,271,612
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Train the model
num_epochs = 20
batch_size = 32
validation_split = 0.2
patience = 3

model.fit(trainSeq, trainLabels, epochs=num_epochs, batch_size=batch_size,
          validation_split=validation_split, callbacks=[EarlyStopping(patience=patience)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


<keras.callbacks.History at 0x25a8c8dc088>

Evaluating model

In [42]:
testLoss, testAcc = model.evaluate(testSeq, testLabels)
print(f"Test Loss: {testLoss}")
print(f"Test Accuracy: {testAcc}")

Test Loss: 2.28373384475708
Test Accuracy: 0.2278260886669159


In [29]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [30]:
# Predict probabilities for each class
y_pred_probabilities = model.predict(testSeq)

# Convert probabilities to class labels - argmax to get the index of the highest probability
y_pred = np.argmax(y_pred_probabilities, axis=1)

accuracy = accuracy_score(testLabels, y_pred)
precision = precision_score(testLabels, y_pred, average='weighted')
recall = recall_score(testLabels, y_pred, average='weighted')
f1 = f1_score(testLabels, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

conf_matrix = confusion_matrix(testLabels, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.2278
Precision: 0.0519
Recall: 0.2278
F1-score: 0.0845
Confusion Matrix:
[[  0   0   0   0   0   0   0   0   0  33   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 114   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  36   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 126   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   8   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  10   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   6   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  21   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 262   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 393   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 258   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 244   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  39   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0  61   

  _warn_prf(average, modifier, msg_start, len(result))


Five fold cross validation

In [31]:
from sklearn.model_selection import KFold, StratifiedKFold

In [32]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [52]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

accuracies = []
precisions = []
recalls = []
f1_scores = []
confusion_matrices = []

for train_idx, val_idx in skf.split(data['posts'], data['type']):
    X_train, X_val = data['posts'][train_idx], data['posts'][val_idx]
    y_train, y_val = data['type'][train_idx], data['type'][val_idx]

    # Label Encoding
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_val_encoded = label_encoder.transform(y_val)

    # Tokenization + Padding
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    max_len = 100 

    X_train_sequences = tokenizer.texts_to_sequences(X_train)
    X_val_sequences = tokenizer.texts_to_sequences(X_val)

    X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len, padding='post', truncating='post')
    X_val_padded = pad_sequences(X_val_sequences, maxlen=max_len, padding='post', truncating='post')

    # Redefine and compile LSTM model
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index) + 1, 100, input_length=max_len))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Output layer with number of classes

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X_train_padded, y_train_encoded, epochs=10, batch_size=32, verbose=0)

    # Evaluate model on val data
    y_pred_encoded = np.argmax(model.predict(X_val_padded), axis=1)

    # Decode predicted labels back to MBTI types
    y_pred_decoded = label_encoder.inverse_transform(y_pred_encoded)

    # Calculating evaluation metrics for this fold
    accuracy = accuracy_score(y_val, y_pred_decoded)
    precision = precision_score(y_val, y_pred_decoded, average='weighted')
    recall = recall_score(y_val, y_pred_decoded, average='weighted')
    f1 = f1_score(y_val, y_pred_decoded, average='weighted')
    conf_matrix = confusion_matrix(y_val, y_pred_decoded)

    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    confusion_matrices.append(conf_matrix)

    print(f"Fold Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    print("Confusion Matrix:")
    print(conf_matrix)

# Average metrics across all folds
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1_scores)

print("\nAverage Metrics Across Folds:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1-score: {avg_f1:.4f}")

# Average confusion matrix
avg_conf_matrix = np.mean(confusion_matrices, axis=0)
print("Average Confusion Matrix:")
print(avg_conf_matrix)



Accept use input for real-time testing

In [None]:
def preprocess_single_post(text):
    # Preprocessing
    cleaned_text = clean_text(text)
    
    # Tokenizing + Padding
    sequence = tokenizer_train.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen, padding='post', truncating='post')
    
    return padded_sequence

# Function to predict personality type from a single post
def predict_personality_type(model, text):
    # Preprocessing
    preprocessed_text = preprocess_single_post(text)
    
    predictions = model.predict(preprocessed_text)
    predicted_class = np.argmax(predictions, axis=1)[0] 
    
    # Mapping the predicted class index to the personality type (MBTI) label
    predicted_personality_type = labelEnc.inverse_transform([predicted_class])[0]
    
    return predicted_personality_type

user_input = input("Enter a text post: ")
predicted_type = predict_personality_type(model, user_input)

print("Predicted Personality Type:", predicted_type)