## GPU Configuration

In [2]:
import tensorflow as tf

# Enable memory growth for the specified GPU devices
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


## Import the libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from keras import Sequential
from keras.layers import Embedding, SpatialDropout1D, Dense, Dropout, LSTM,Bidirectional, Flatten,Conv1D, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping
import transformers


## Data Read and pre - process

In [4]:
data = pd.read_csv("/notebooks/cleaned_mbti_data.csv")
data.head()

Unnamed: 0,type,posts,cleaned_text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,enfp and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...,im finding the lack of me in these posts very ...
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one of course to which i say i know that...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear intp i enjoyed our conversation the other...
4,ENTJ,'You're fired.|||That's another silly misconce...,youre fired thats another silly misconception ...


In [5]:
types = np.unique(data.type.values)

def get_type_index(string):
    return list(types).index(string)

data['type_index'] = data['type'].apply(get_type_index)
data.head()

Unnamed: 0,type,posts,cleaned_text,type_index
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,enfp and intj moments sportscenter not top ten...,8
1,ENTP,'I'm finding the lack of me in these posts ver...,im finding the lack of me in these posts very ...,3
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one of course to which i say i know that...,11
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear intp i enjoyed our conversation the other...,10
4,ENTJ,'You're fired.|||That's another silly misconce...,youre fired thats another silly misconception ...,2


## Split data into train test and val

In [6]:
train, test = train_test_split(data)
train, val = train_test_split(train)

## Tokenization

In [7]:
vocab_size = 10000
trunc_type = "post"
pad_type = "post"
oov_tok = "<OOV>"
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(data.cleaned_text.values)

In [8]:
maxlen = 1500
train_sequences = tokenizer.texts_to_sequences(train.cleaned_text.values)
train_padded = pad_sequences(train_sequences, maxlen = maxlen, truncating = trunc_type, padding = pad_type)

val_sequences = tokenizer.texts_to_sequences(val.cleaned_text.values)
val_padded = pad_sequences(val_sequences, maxlen = maxlen, truncating = trunc_type, padding = pad_type)

In [9]:
train_padded

array([[  19, 3207,    6, ...,    0,    0,    0],
       [   1,   10,   40, ...,   50,   16, 7146],
       [ 202,    2,   75, ...,    0,    0,    0],
       ...,
       [  19, 1426,    9, ...,    0,    0,    0],
       [ 117,  228,   52, ...,   62,   76,  302],
       [   1,   23,   26, ...,    1,    2,  633]], dtype=int32)

## Convert labels to categorical values

In [10]:
one_hot_labels = tf.keras.utils.to_categorical(train.type_index.values, num_classes=16)
val_labels= tf.keras.utils.to_categorical(val.type_index.values, num_classes=16)

## Define the models

LSTM 

In [11]:
def create_lstm_model():
    model = Sequential([
        Embedding(vocab_size, 256, input_length=maxlen),
        SpatialDropout1D(0.2),
        LSTM(100, dropout=0.2, recurrent_dropout=0.2),
        Dense(16, activation="softmax")
    ])
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

Bi - Directional LSTM

In [12]:
def create_bilstm_model():
    model = Sequential([
        Embedding(vocab_size, 256, input_length=maxlen),
        Bidirectional(LSTM(100, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(LSTM(50)),
        Dense(16, activation="softmax")
    ])
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

BERT

In [13]:
def create_bert_model():
    tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')
    
    train_input_ids = [tokenizer.encode(str(i), max_length = maxlen , pad_to_max_length = True) for i in train.cleaned_text.values]
    val_input_ids = [tokenizer.encode(str(i), max_length = maxlen , pad_to_max_length = True) for i in val.cleaned_text.values]

    input_word_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32,
                                           name="input_word_ids")
    bert_layer = transformers.TFBertModel.from_pretrained('bert-base-uncased')
    bert_outputs = bert_layer(input_word_ids)[0]
    pred = tf.keras.layers.Dense(16, activation='softmax')(bert_outputs[:,0,:])
    
    model = tf.keras.models.Model(inputs=input_word_ids, outputs=pred)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(
    learning_rate=0.00001), metrics=['accuracy'])
    
    return model, train_input_ids, val_input_ids

## Model Training

LSTM

In [15]:
lstm_model = create_lstm_model()
lstm_model.fit(train_padded, one_hot_labels, epochs=10, batch_size=64,
                validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.src.callbacks.History at 0x7ff224eae190>

Bi - Directional LSTM

In [16]:
bilstm_model = create_bilstm_model()
bilstm_model.fit(train_padded, one_hot_labels, epochs =20, verbose = 1, 
          validation_data = (val_padded, val_labels),  callbacks = [tf.keras.callbacks.EarlyStopping(patience = 3)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


<keras.src.callbacks.History at 0x7ff1f8245750>

BERT

In [18]:
bert_model, train_input_ids, val_input_ids =  create_bert_model()
bert_model.fit(np.array(train_input_ids), one_hot_labels,validation_data = (np.array(val_input_ids), val_labels),
          verbose = 1, epochs = 20, batch_size = 8,  callbacks = [tf.keras.callbacks.EarlyStopping(patience = 5)])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBer

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


<keras.src.callbacks.History at 0x7fef858cb510>

Save the models

In [19]:
lstm_model.save("models/lstm_model.h5")
bilstm_model.save("models/bilstm_model.h5")
bert_model.save("models/bert_model.h5")

print("All models trained & saved successfully!")

  saving_api.save_model(


All models trained & saved successfully!
