In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [4]:
data = pd.read_csv("data/processed_mbti.csv")

In [5]:
data.head()

Unnamed: 0,I/E,N/S,T/F,J/P,processed_text
0,1,1,0,1,moment sportscenter top ten play prankswhat li...
1,0,1,1,0,finding lack post alarming sex boring position...
2,1,1,1,0,good one course say know blessing curse absolu...
3,1,1,1,1,dear enjoyed conversation day esoteric gabbing...
4,0,1,1,1,fired another silly misconception approaching ...


In [6]:
print(data.processed_text.isna().sum())  # Count of NaN values
print(data.processed_text.apply(type).value_counts())  # Types present

data['processed_text'] = data['processed_text'].fillna("")  # Replace NaN with empty strings
data['processed_text'] = data['processed_text'].astype(str)  # Convert all to strings

print(data.processed_text.isna().sum())  # Count of NaN values
print(data.processed_text.apply(type).value_counts())  # Types present

1
processed_text
<class 'str'>      8674
<class 'float'>       1
Name: count, dtype: int64
0
processed_text
<class 'str'>    8675
Name: count, dtype: int64


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   I/E             8675 non-null   int64 
 1   N/S             8675 non-null   int64 
 2   T/F             8675 non-null   int64 
 3   J/P             8675 non-null   int64 
 4   processed_text  8675 non-null   object
dtypes: int64(4), object(1)
memory usage: 339.0+ KB


In [8]:
text = data.processed_text.astype("string")
labels = data[["I/E", "N/S", "T/F", "J/P"]]

In [9]:
from sklearn.model_selection import train_test_split

X_train_full, X_test, y_train_full, y_test = train_test_split(text, labels, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, random_state = 42)

In [10]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((4879,), (4879, 4), (1627,), (1627, 4))

In [11]:
X_train.info(), y_train.info()

<class 'pandas.core.series.Series'>
Index: 4879 entries, 7978 to 8342
Series name: processed_text
Non-Null Count  Dtype 
--------------  ----- 
4879 non-null   string
dtypes: string(1)
memory usage: 76.2 KB
<class 'pandas.core.frame.DataFrame'>
Index: 4879 entries, 7978 to 8342
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   I/E     4879 non-null   int64
 1   N/S     4879 non-null   int64
 2   T/F     4879 non-null   int64
 3   J/P     4879 non-null   int64
dtypes: int64(4)
memory usage: 190.6 KB


(None, None)

In [12]:
print(X_train.dtypes)

string


In [13]:
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

embedding_dim = 128
max_tokens = 10_000

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens = 10_000,
    output_mode = 'int',
    output_sequence_length = 250
)

vectorizer.adapt(data.processed_text)

# Input layer
input_layer = Input(shape=(1,), dtype="string", name='input_text')

# Vectorizer layer
x = vectorizer(input_layer)

# Shared layers
x = Embedding(input_dim = max_tokens, output_dim = embedding_dim)(x)
x = Bidirectional(LSTM(200, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(x)
x = Dropout(0.2)(x)
x = Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2))(x)
x = Dropout(0.2)(x)
x = Dense(20, activation="relu", kernel_initializer='he_normal')(x)

# Separate outputs
output_IE = Dense(1, activation='sigmoid', name='IE_output')(x)
output_NS = Dense(1, activation='sigmoid', name='NS_output')(x)
output_TF = Dense(1, activation='sigmoid', name='TF_output')(x)
output_JP = Dense(1, activation='sigmoid', name='JP_output')(x)


model = tf.keras.Model(inputs=input_layer, outputs=[output_IE, output_NS, output_TF, output_JP])

In [14]:
model.summary()

In [15]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics={'IE_output': 'accuracy',
             'NS_output': 'accuracy',
             'TF_output': 'accuracy',
             'JP_output': 'accuracy'}
)

In [16]:
y_train.iloc[:,0]

7978    1
2566    1
1360    1
2086    1
1964    1
       ..
163     0
4895    0
989     0
950     1
8342    1
Name: I/E, Length: 4879, dtype: int64

In [17]:
# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Training labels for multiple outputs
train_labels = {'IE_output': y_train.iloc[:, 0],
                'NS_output': y_train.iloc[:, 1],
                'TF_output': y_train.iloc[:, 2],
                'JP_output': y_train.iloc[:, 3]}

val_labels = {'IE_output': y_val.iloc[:, 0],
               'NS_output': y_val.iloc[:, 1],
               'TF_output': y_val.iloc[:, 2],
               'JP_output': y_val.iloc[:, 3]}


# Train the model
history = model.fit(
    X_train, train_labels,
    validation_data=(X_val, val_labels),
    batch_size=32,
    epochs=20,
    callbacks=[early_stopping]
)

Epoch 1/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 526ms/step - IE_output_accuracy: 0.7494 - IE_output_loss: 0.5796 - JP_output_accuracy: 0.6063 - JP_output_loss: 0.6796 - NS_output_accuracy: 0.8580 - NS_output_loss: 0.4518 - TF_output_accuracy: 0.5093 - TF_output_loss: 0.6958 - loss: 2.4068 - val_IE_output_accuracy: 0.7615 - val_IE_output_loss: 0.5496 - val_JP_output_accuracy: 0.5864 - val_JP_output_loss: 0.6781 - val_NS_output_accuracy: 0.8556 - val_NS_output_loss: 0.4207 - val_TF_output_accuracy: 0.5433 - val_TF_output_loss: 0.6858 - val_loss: 2.3336
Epoch 2/20
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 558ms/step - IE_output_accuracy: 0.7514 - IE_output_loss: 0.5413 - JP_output_accuracy: 0.6131 - JP_output_loss: 0.6632 - NS_output_accuracy: 0.8631 - NS_output_loss: 0.3716 - TF_output_accuracy: 0.6243 - TF_output_loss: 0.6482 - loss: 2.2243 - val_IE_output_accuracy: 0.7609 - val_IE_output_loss: 0.5523 - val_JP_output_accuracy: 0.590

In [18]:
test_labels = {'IE_output': y_test.iloc[:, 0],
               'NS_output': y_test.iloc[:, 1],
               'TF_output': y_test.iloc[:, 2],
               'JP_output': y_test.iloc[:, 3]}

model.evaluate(X_test,test_labels)

[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 166ms/step - IE_output_accuracy: 0.7840 - IE_output_loss: 0.5270 - JP_output_accuracy: 0.6313 - JP_output_loss: 0.6636 - NS_output_accuracy: 0.8576 - NS_output_loss: 0.4170 - TF_output_accuracy: 0.5743 - TF_output_loss: 0.6786 - loss: 2.2861


[2.28859281539917,
 0.5259314179420471,
 0.4135143756866455,
 0.680133581161499,
 0.6687912940979004,
 0.7833102941513062,
 0.6118026971817017,
 0.8584601283073425,
 0.5758414268493652]

In [19]:
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)

y_train shape: (4879, 4)
y_val shape: (1627, 4)


In [21]:
model.save("saved_models/LSTM_based_model.keras")

In [22]:
vocab_path = "saved_models/vectorizer_vocab.txt"
vocabulary = vectorizer.get_vocabulary()  # Get the vocabulary list
with open(vocab_path, "w") as f:
    for word in vocabulary:
        f.write(f"{word}\n")

## Model Using Bert

In [41]:
import transformers

In [42]:
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-large-uncased')

In [44]:
train_encodings = tokenizer(
    list(X_train),
    max_length= 250,
    padding='max_length',
    truncation=True,
    return_tensors="tf"
)

val_encodings = tokenizer(
    list(X_val),
    max_length=250,
    padding='max_length',
    truncation=True,
    return_tensors="tf"
)

# Extract inputs
train_input_ids = train_encodings["input_ids"]
train_attention_mask = train_encodings["attention_mask"]
train_token_type_ids = train_encodings["token_type_ids"]

In [51]:
import tensorflow as tf
from tensorflow.keras import layers
from transformers import TFBertModel

# Define a custom layer to wrap TFBertModel
class BertLayer(layers.Layer):
    def __init__(self, pretrained_model_name, **kwargs):
        super(BertLayer, self).__init__(**kwargs)
        self.bert = TFBertModel.from_pretrained(pretrained_model_name)

    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return outputs.last_hidden_state  # Return the last hidden state

# Constants
max_length = 128

# Define input layers
input_word_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_word_ids")
attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="token_type_ids")

# Add BertLayer
bert_outputs = BertLayer(pretrained_model_name='bert-large-uncased')(
    [input_word_ids, attention_mask, token_type_ids]
)

# Use the CLS token for classification tasks
cls_token_output = bert_outputs[:, 0, :]

# Add additional layers
dropout = tf.keras.layers.Dropout(0.2)(cls_token_output)
output_IE = tf.keras.layers.Dense(1, activation='sigmoid', name='IE_output')(dropout)
output_NS = tf.keras.layers.Dense(1, activation='sigmoid', name='NS_output')(dropout)
output_TF = tf.keras.layers.Dense(1, activation='sigmoid', name='TF_output')(dropout)
output_JP = tf.keras.layers.Dense(1, activation='sigmoid', name='JP_output')(dropout)

# Build and compile the model
model = tf.keras.Model(
    inputs=[input_word_ids, attention_mask, token_type_ids],
    outputs=[output_IE, output_NS, output_TF, output_JP],
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss='binary_crossentropy',
    metrics={'IE_output': 'accuracy',
             'NS_output': 'accuracy',
             'TF_output': 'accuracy',
             'JP_output': 'accuracy'},
)

model.summary()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [52]:
def tokenize_texts(texts, tokenizer, max_length=128):
    tokenized_data = tokenizer(
        list(texts),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='np'  # Use NumPy for easier manipulation
    )
    return tokenized_data['input_ids'], tokenized_data['attention_mask'], tokenized_data['token_type_ids']

# Tokenize the datasets
max_length = 128
X_train_input_ids, X_train_attention_mask, X_train_token_type_ids = tokenize_texts(X_train, tokenizer, max_length)
X_val_input_ids, X_val_attention_mask, X_val_token_type_ids = tokenize_texts(X_val, tokenizer, max_length)
X_test_input_ids, X_test_attention_mask, X_test_token_type_ids = tokenize_texts(X_test, tokenizer, max_length)

In [53]:
# Training inputs
X_train_inputs = {
    "input_word_ids": X_train_input_ids,
    "attention_mask": X_train_attention_mask,
    "token_type_ids": X_train_token_type_ids,
}

# Validation inputs
X_val_inputs = {
    "input_word_ids": X_val_input_ids,
    "attention_mask": X_val_attention_mask,
    "token_type_ids": X_val_token_type_ids,
}

# Testing inputs
X_test_inputs = {
    "input_word_ids": X_test_input_ids,
    "attention_mask": X_test_attention_mask,
    "token_type_ids": X_test_token_type_ids,
}

train_labels = {'IE_output': y_train.iloc[:, 0],
                'NS_output': y_train.iloc[:, 1],
                'TF_output': y_train.iloc[:, 2],
                'JP_output': y_train.iloc[:, 3]}

val_labels = {'IE_output': y_val.iloc[:, 0],
               'NS_output': y_val.iloc[:, 1],
               'TF_output': y_val.iloc[:, 2],
               'JP_output': y_val.iloc[:, 3]}

In [54]:
history = model.fit(
    X_train_inputs,
    train_labels,
    validation_data=(
        X_val_inputs,
       val_labels,
    ),
    batch_size=32,
    epochs=5,
)

Epoch 1/5
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1978s[0m 13s/step - IE_output_accuracy: 0.6250 - IE_output_loss: 0.6409 - JP_output_accuracy: 0.5960 - JP_output_loss: 0.7135 - NS_output_accuracy: 0.8656 - NS_output_loss: 0.4235 - TF_output_accuracy: 0.4930 - TF_output_loss: 0.7455 - loss: 2.5235 - val_IE_output_accuracy: 0.7615 - val_IE_output_loss: 0.5512 - val_JP_output_accuracy: 0.5833 - val_JP_output_loss: 0.6885 - val_NS_output_accuracy: 0.8556 - val_NS_output_loss: 0.4193 - val_TF_output_accuracy: 0.5372 - val_TF_output_loss: 0.6910 - val_loss: 2.3493
Epoch 2/5
[1m153/153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2111s[0m 14s/step - IE_output_accuracy: 0.7585 - IE_output_loss: 0.5623 - JP_output_accuracy: 0.5684 - JP_output_loss: 0.6992 - NS_output_accuracy: 0.8654 - NS_output_loss: 0.4107 - TF_output_accuracy: 0.5172 - TF_output_loss: 0.7144 - loss: 2.3866 - val_IE_output_accuracy: 0.7615 - val_IE_output_loss: 0.5483 - val_JP_output_accuracy: 0.5833 

In [58]:
test_labels = {'IE_output': y_test.iloc[:, 0],
               'NS_output': y_test.iloc[:, 1],
               'TF_output': y_test.iloc[:, 2],
               'JP_output': y_test.iloc[:, 3]}


model.evaluate(X_test_inputs, test_labels)

[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m652s[0m 10s/step - IE_output_accuracy: 0.7828 - IE_output_loss: 0.5216 - JP_output_accuracy: 0.6201 - JP_output_loss: 0.6641 - NS_output_accuracy: 0.8576 - NS_output_loss: 0.4133 - TF_output_accuracy: 0.5426 - TF_output_loss: 0.6868 - loss: 2.2858


[2.288877010345459,
 0.5227404236793518,
 0.4101163446903229,
 0.6862896680831909,
 0.6693678498268127,
 0.7819271683692932,
 0.6058091521263123,
 0.8584601283073425,
 0.5541724562644958]

In [59]:
model.save("bert_model.keras")