In this additional notebook, we explore the accuracy we can achieve with BERT.

The platform used is Anaconda's Jupyter Notebook, and the environment is self-configured. The detailed configuration information can be found in the final report.

# **Data Pipeline**

In [None]:
!pip install tensorflow
!pip install transformers

In [None]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import regex as re
import transformers
from keras import backend as K
import plotly.express as px


# Load the MBTI dataset; each row holds a personality `type` and the
# associated `posts` text. The trailing expression previews the first rows.
data=pd.read_csv('DataBase_MBTI.csv')
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [None]:
# Choose a distribution strategy: run on a TPU when one can be resolved,
# otherwise fall back to mirrored execution on the local device(s).
use_tpu = False
try:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    tpu = None

if not tpu:
    strategy = tf.distribute.MirroredStrategy()
else:
    # Connect to and initialize the TPU system before building a strategy on it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
REPLICAS:  1


In [None]:
# Show how many rows each personality type has (the classes are heavily imbalanced).
data['type'].value_counts()

INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

In [None]:
def clean_text(data):
    """Return a cleaned copy of every post in ``data.posts``.

    Each post is lower-cased, links are replaced by a space, and every
    character that is not a digit or a lower-case ASCII letter is replaced
    by a space.

    Args:
        data: object exposing a ``posts`` iterable of strings (here the
            MBTI DataFrame).

    Returns:
        A list with one cleaned string per post, in the original order.
    """
    # Raw strings keep the regex escapes (\s, \.) from being interpreted as
    # (invalid) Python string escapes; compiling once hoists the pattern
    # lookup out of the loop.
    link_pattern = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
    symbol_pattern = re.compile(r'[^0-9a-z]')

    cleaned_text = []
    for sentence in tqdm(data.posts):
        sentence = sentence.lower()

        # Links go first, while their punctuation is still intact.
        sentence = link_pattern.sub(' ', sentence)

        # Then everything that is not a digit or a lower-case letter.
        sentence = symbol_pattern.sub(' ', sentence)

        cleaned_text.append(sentence)
    return cleaned_text

In [None]:
# Replace the raw posts with their cleaned versions in place, then display the frame.
data.posts = clean_text(data)
data

  0%|          | 0/8675 [00:00<?, ?it/s]

Unnamed: 0,type,posts
0,INFJ,and intj moments sportscenter not top t...
1,ENTP,i m finding the lack of me in these posts ver...
2,INTP,good one course to which i say i ...
3,INTJ,dear intp i enjoyed our conversation the o...
4,ENTJ,you re fired that s another silly misconce...
...,...,...
8670,ISFP,just because i always think of cats as fi d...
8671,ENFP,so if this thread already exists someplace ...
8672,INTP,so many questions when i do these things i ...
8673,INFP,i am very conflicted right now when it comes ...


**Initialize BERT Tokenizer and attention masks**

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible.
from sklearn.model_selection import train_test_split

posts = data['posts'].values
labels = data['type'].values
train_data, test_data = train_test_split(data, test_size=0.2, random_state=0)

# Row counts of both partitions, displayed as the cell result.
train_size, test_size = len(train_data), len(test_data)
train_size, test_size

(6940, 1735)

In [None]:
#Initialize Bert tokenizer and masks
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Checkpoint whose vocabulary the tokenizer is loaded from.
bert_model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
# Every post is truncated/padded to this many token ids before training.
# NOTE(review): standard BERT checkpoints are configured with 512 position
# embeddings; confirm that 1800-token sequences are actually supported by the
# model used below (positions past the limit may be silently mishandled).
MAX_LEN = 1800

def tokenize_sentences(sentences, tokenizer, max_seq_len=1800):
    """Encode each sentence into a list of token ids.

    Args:
        sentences: iterable of strings to encode.
        tokenizer: object exposing ``encode`` (here a ``BertTokenizer``).
        max_seq_len: maximum length handed to the tokenizer as ``max_length``.

    Returns:
        A list with one list of token ids per sentence, in input order.
        Sequences are truncated to ``max_seq_len`` but not padded.
    """
    return [
        tokenizer.encode(
            sentence,
            add_special_tokens=True,
            max_length=max_seq_len,
            # Request truncation explicitly: passing only `max_length` makes
            # the tokenizer warn and fall back to this same strategy.
            truncation=True,
        )
        for sentence in tqdm(sentences)
    ]

def create_attention_masks(tokenized_and_padded_sentences):
    """Build one 0/1 mask per sentence from its padded token ids.

    A position is marked 1 when its token id is greater than 0 and 0
    otherwise. The masks are returned stacked in a NumPy array, one row
    per input sentence.
    """
    return np.asarray([
        [int(tok > 0) for tok in padded_ids]
        for padded_ids in tokenized_and_padded_sentences
    ])

# Encode the training posts, pad/truncate them at the end to MAX_LEN with id 0,
# and derive the matching 0/1 masks.
# NOTE(review): the attention masks are computed here but are not passed to the
# model in the visible fit/evaluate/predict calls - confirm whether that is intended.
train_input_ids = tokenize_sentences(train_data['posts'], tokenizer, MAX_LEN)
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
train_attention_masks = create_attention_masks(train_input_ids)

# Same preprocessing for the held-out test posts.
test_input_ids = tokenize_sentences(test_data['posts'], tokenizer, MAX_LEN)
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
test_attention_masks = create_attention_masks(test_input_ids)

  0%|          | 0/6940 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/1735 [00:00<?, ?it/s]

In [None]:
# Number of samples per gradient update passed to model.fit.
BATCH_SIZE=32 
# Upper bound on the number of training epochs passed to model.fit.
NR_EPOCHS=20

# BERT Model

In [None]:
#Define f1 functions for evaluation
def f1_m(y_true, y_pred):
    """Return the F1 score: the harmonic mean of precision_m and recall_m.

    K.epsilon() is added to the denominator so that a batch with zero
    precision and zero recall does not divide by zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

def recall_m(y_true, y_pred):
    """Return recall: rounded true positives over rounded actual positives.

    Values are clipped to [0, 1] and rounded before counting; K.epsilon()
    guards the division when there are no positives.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Return precision: rounded true positives over rounded predicted positives.

    Values are clipped to [0, 1] and rounded before counting; K.epsilon()
    guards the division when nothing is predicted positive.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

In [None]:
def create_model():
    """Build and compile the BERT-based 16-class classifier.

    The network feeds ``MAX_LEN`` token ids through the pretrained
    ``bert-large-uncased`` encoder, takes the encoder output at sequence
    position 0, and maps it to 16 softmax probabilities.

    Returns:
        A compiled ``tf.keras`` model that expects one-hot labels
        (categorical cross-entropy) and reports accuracy, F1, precision
        and recall.
    """
    input_word_ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32,
                                           name="input_word_ids")
    bert_layer = transformers.TFBertModel.from_pretrained('bert-large-uncased')
    # NOTE(review): only token ids are passed to BERT, no attention mask, so
    # padding positions are not masked out - confirm this is intended.
    bert_outputs = bert_layer(input_word_ids)[0]
    # Classify from the encoder output at the first sequence position.
    pred = tf.keras.layers.Dense(16, activation='softmax')(bert_outputs[:, 0, :])

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=pred)
    # The head already applies softmax and the labels are one-hot, so plain
    # categorical cross-entropy is the matching loss. The unused
    # SparseCategoricalCrossentropy(from_logits=True) object that used to be
    # created here contradicted both and was never passed to compile().
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00002),
        metrics=['accuracy', f1_m, precision_m, recall_m],
    )
    return model

In [None]:
# Build the model inside the distribution strategy chosen during device
# detection (TPUStrategy when a TPU was resolved, MirroredStrategy otherwise).
# Reusing `strategy` keeps the TPU path from being dead code behind a
# hard-coded flag and avoids resolving/initializing the TPU a second time.
use_tpu = tpu is not None
with strategy.scope():
    model = create_model()

model.summary()

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'str' and 'Literal'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: '<' not supported between instances of 'str' and 'Literal'
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 1800)]            0         
_________________________________________________________________
tf_bert_model (TFBertModel)  TFBaseModelOutputWithPool 335141888 
_________________________________________________________________

In [None]:
# Distinct personality types as returned by np.unique; a type's position in
# this array is used as its integer class id.
types = np.unique(data.type.values)

def get_type_index(string):
    """Return the position of ``string`` within ``types``.

    Raises ValueError when ``string`` is not one of the known types.
    """
    known_types = list(types)
    return known_types.index(string)

In [None]:
# Derive the class ids from train_data's own `type` column. Mapping every row
# of `data` and relying on index alignment to pick the training rows gives the
# same values but does needless work and hides the intent.
train_data['type_index'] = train_data['type'].apply(get_type_index)
train_data

Unnamed: 0,type,posts,type_index
984,INTP,phrases i never want to hear again a k a if yo...,11
6706,INTP,yeah you say you primarily value people who ...,11
143,ENFP,63915 i got my hair cut d some people say t...,1
4844,INFP,as far as i live in this world i ve never bee...,9
4388,ISFP,meh it s overplayed ya but still its good n...,13
...,...,...,...
4373,INFP,hey it seems like you have a great foundatio...,9
7891,INFJ,dear istj mother when i started my very fi...,8
4859,INTP,oh entjs how can you be scary and exciting a...,11
3264,ENFJ,hi entp and welcome to the forum wink f...,0


In [None]:
# One-hot encode the training class ids into 16 columns, one per personality type.
one_hot_labels = tf.keras.utils.to_categorical(train_data.type_index.values, num_classes=16)

In [None]:

# EarlyStopping monitors `val_loss` by default, which only exists when
# validation data is supplied. Without it the callback can never trigger, so
# the last 10% of the training rows are held out as a validation set.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
model.fit(
    np.array(train_input_ids),
    one_hot_labels,
    verbose=1,
    epochs=NR_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fc437d47150>

**Run test and evaluate accuracy**

In [None]:
# Derive the class ids from test_data's own `type` column rather than mapping
# every row of `data` and relying on index alignment to select the test rows.
test_data['type_index'] = test_data['type'].apply(get_type_index)

In [None]:
# One-hot encode the test class ids into 16 columns, matching the training labels.
test_labels = tf.keras.utils.to_categorical(test_data.type_index.values, num_classes=16)

In [None]:
# Evaluate on the held-out posts. The returned list follows the compile() order:
# [loss, accuracy, f1_m, precision_m, recall_m].
model.evaluate(np.array(test_input_ids), test_labels)



[1.8708125352859497,
 0.6945245265960693,
 0.6945076584815979,
 0.7115530371665955,
 0.6818181872367859]

In [None]:
# The model's output columns follow the class ids used for training, i.e. the
# sorted order produced by np.unique. `data['type'].unique()` returns types in
# first-appearance order instead, which would attach each probability to the
# wrong personality type, so the column names are built in sorted order.
cols = [str(mbti_type) for mbti_type in np.unique(data['type'].values)]

# First column holds the input sentence, followed by one column per type.
colnames = ['sentence'] + cols


In [None]:

# Empty result frame with a `sentence` column plus one column per personality type.
df_predict = pd.DataFrame(columns = colnames)
sentence = "Time to debate on it. Strike at the weakest point and make others cry with facts"

# Store the example sentence as row 0; its probabilities are filled in later.
df_predict.loc[0, 'sentence'] = sentence

In [None]:
# Apply the same tokenization and padding that was used for the training data.
sentence_inputs = tokenize_sentences(df_predict['sentence'], tokenizer, MAX_LEN)
sentence_inputs = pad_sequences(sentence_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
# One row of 16 class probabilities for the single input sentence.
prediction = model.predict(np.array(sentence_inputs))
# NOTE(review): the probabilities come back in the class-id order used for
# training; `cols` must be in that same order for the columns to be labelled correctly.
df_predict.loc[0, cols] = prediction

df_predict

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,sentence,INFJ,ENTP,INTP,INTJ,ENTJ,ENFJ,INFP,ENFP,ISFP,ISTP,ISFJ,ISTJ,ESTP,ESFP,ESTJ,ESFJ
0,Time to debate on it. Strike at the weakest po...,0.014544,0.019483,0.000535,0.02171,0.000376,0.000605,0.00058,0.000167,0.044402,0.041866,0.010097,0.816447,0.00018,0.002427,0.001249,0.025331


BERT performs well, but it has relatively high hardware requirements. We may use this algorithm when building our website; however, its performance is good rather than excellent, so it is possible that we will not use it.