In [22]:
import pandas as pd
import tensorflow as tf

In [3]:
data = pd.read_csv("./mbti_1.csv")

In [4]:
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [5]:
def take_row(row):
    type = row.iloc[0]
    
    if type[0] == 'I': I = 1
    elif type[0] == 'E': I =  0
    else: print('I-E value not found')

    if type[1] == 'N': N = 1
    elif type[1] == 'S': N =  0
    else: print('N-S value not found')

    if type[2] == 'T': T = 1
    elif type[2] == 'F': T =  0
    else: print('T-F value not found')

    if type[3] == 'J': J = 1
    elif type[3] == 'P': J =  0
    else: print('J-P value not found')

    return pd.Series({"I/E" : I, "N/S": N, "T/F": T, "J/P": J})

In [6]:
data = data.join(data.apply(lambda row: take_row(row), axis = 1))
data.head()

Unnamed: 0,type,posts,I/E,N/S,T/F,J/P
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,1,1,0,1
1,ENTP,'I'm finding the lack of me in these posts ver...,0,1,1,0
2,INTP,'Good one _____ https://www.youtube.com/wat...,1,1,1,0
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",1,1,1,1
4,ENTJ,'You're fired.|||That's another silly misconce...,0,1,1,1


In [7]:
data["I/E"].value_counts()
data["N/S"].value_counts()
data["T/F"].value_counts()
# data["J/P"].value_counts()

T/F
0    4694
1    3981
Name: count, dtype: int64

In [8]:
data.groupby("N/S")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1691e7890>

In [9]:
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopwords = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
       'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']

def preprocessing(post):
    text = post

    # remove the pipe character |
    text = text.replace("|","")

    # remove the links
    text = re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", text)

    # remove all the punctuations and keep the words
    text = re.sub("[^a-zA-Z]", " ", text).lower()

    # remove the unwanted spaces > 1
    text = re.sub(' +', ' ', text)

    # tokenize the text
    tokens = word_tokenize(text)

    # remove the stopwords, mbit mentions, and lammetization
    temp = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
    text = " ".join([word for word in temp if word not in [x.lower() for x in unique_type_list]])
    
    return text.strip()

In [14]:
data["processed_text"] = data["posts"].apply(preprocessing)

In [15]:
data.head()

Unnamed: 0,type,posts,I/E,N/S,T/F,J/P,processed_text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,1,1,0,1,moment sportscenter top ten play prankswhat li...
1,ENTP,'I'm finding the lack of me in these posts ver...,0,1,1,0,finding lack post alarming sex boring position...
2,INTP,'Good one _____ https://www.youtube.com/wat...,1,1,1,0,good one course say know blessing curse absolu...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",1,1,1,1,dear enjoyed conversation day esoteric gabbing...
4,ENTJ,'You're fired.|||That's another silly misconce...,0,1,1,1,fired another silly misconception approaching ...


In [16]:
data = data.drop(["type","posts"], axis = 1)

In [17]:
data.head()

Unnamed: 0,I/E,N/S,T/F,J/P,processed_text
0,1,1,0,1,moment sportscenter top ten play prankswhat li...
1,0,1,1,0,finding lack post alarming sex boring position...
2,1,1,1,0,good one course say know blessing curse absolu...
3,1,1,1,1,dear enjoyed conversation day esoteric gabbing...
4,0,1,1,1,fired another silly misconception approaching ...


In [18]:
data.to_csv("processed_mbti.csv", index = False)

In [19]:
print(data.processed_text.head())
print(data.processed_text.isna().sum())  # Count of NaN values
print(data.processed_text.apply(type).value_counts())  # Types present

0    moment sportscenter top ten play prankswhat li...
1    finding lack post alarming sex boring position...
2    good one course say know blessing curse absolu...
3    dear enjoyed conversation day esoteric gabbing...
4    fired another silly misconception approaching ...
Name: processed_text, dtype: object
0
processed_text
<class 'str'>    8675
Name: count, dtype: int64


In [20]:
from sklearn.model_selection import train_test_split

text = data.processed_text.astype("string")
labels = data[["I/E", "N/S", "T/F", "J/P"]]

X_train_full, X_test, y_train_full, y_train = train_test_split(text, labels, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, random_state = 42)

In [23]:
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

embedding_dim = 128
max_tokens = 10_000

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens = 10_000,
    output_mode = 'int',
    output_sequence_length = 250
)

vectorizer.adapt(data.processed_text)

# Input layer
input_layer = Input(shape=(1,), dtype="string", name='input_text')

# Vectorizer layer
x = vectorizer(input_layer)

# Shared layers
x = Embedding(input_dim = max_tokens, output_dim = embedding_dim)(x)
x = Bidirectional(LSTM(200, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(x)
x = Dropout(0.2)(x)
x = Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2))(x)
x = Dropout(0.2)(x)
x = Dense(20, activation="relu", kernel_initializer='he_normal')(x)

# Separate outputs
output_IE = Dense(1, activation='sigmoid', name='IE_output')(x)
output_NS = Dense(1, activation='sigmoid', name='NS_output')(x)
output_TF = Dense(1, activation='sigmoid', name='TF_output')(x)
output_JP = Dense(1, activation='sigmoid', name='JP_output')(x)


model = tf.keras.Model(inputs=input_layer, outputs=[output_IE, output_NS, output_TF, output_JP])

In [24]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics={'IE_output': 'accuracy',
             'NS_output': 'accuracy',
             'TF_output': 'accuracy',
             'JP_output': 'accuracy'}
)

In [25]:
# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Training labels for multiple outputs
train_labels = {'IE_output': y_train.iloc[:, 0],
                'NS_output': y_train.iloc[:, 1],
                'TF_output': y_train.iloc[:, 2],
                'JP_output': y_train.iloc[:, 3]}

val_labels = {'IE_output': y_val.iloc[:, 0],
               'NS_output': y_val.iloc[:, 1],
               'TF_output': y_val.iloc[:, 2],
               'JP_output': y_val.iloc[:, 3]}


# Train the model
history = model.fit(
    X_train, train_labels,
    validation_data=(X_val, val_labels),
    batch_size=32,
    epochs=20,
    callbacks=[early_stopping]
)

Epoch 1/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 573ms/step - IE_output_accuracy: 0.7511 - IE_output_loss: 0.5705 - JP_output_accuracy: 0.6032 - JP_output_loss: 0.6786 - NS_output_accuracy: 0.8483 - NS_output_loss: 0.4483 - TF_output_accuracy: 0.5190 - TF_output_loss: 0.7021 - loss: 2.3995 - val_IE_output_accuracy: 0.7615 - val_IE_output_loss: 0.5491 - val_JP_output_accuracy: 0.5864 - val_JP_output_loss: 0.6791 - val_NS_output_accuracy: 0.8556 - val_NS_output_loss: 0.4155 - val_TF_output_accuracy: 0.5378 - val_TF_output_loss: 0.6921 - val_loss: 2.3353
Epoch 2/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 592ms/step - IE_output_accuracy: 0.7693 - IE_output_loss: 0.5352 - JP_output_accuracy: 0.5998 - JP_output_loss: 0.6743 - NS_output_accuracy: 0.8654 - NS_output_loss: 0.3731 - TF_output_accuracy: 0.5750 - TF_output_loss: 0.6737 - loss: 2.2563 - val_IE_output_accuracy: 0.7615 - val_IE_output_loss: 0.5496 - val_JP_output_accuracy: 0.586