# Briggs GO
MBTI Test using AI/ML


In [1]:
"""
Using Modin might be upto 10x faster than pandas, but it is not stable yet.
"""
use_modin = False
if use_modin:
    from distributed import Client
    client = Client()
    import modin.pandas as pd
else:
    import pandas as pd

# Classifiers
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB


import numpy as np
import matplotlib.pyplot as plt

## Loading Data

In [2]:
df = pd.read_csv("dataset/mbti_1.csv")

In [3]:
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


## Preprocessing Data

### Data Before Preprocessing

In [4]:
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


### Removing URL

In [5]:
import re

In [6]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

In [7]:
df['posts'] = df['posts'].apply(lambda x: remove_url(x))

In [8]:
df.head()

Unnamed: 0,type,posts
0,INFJ,' and intj moments sportscenter not top ten...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,"'Good one _____ course, to which I say I k..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


## Data Normalization

In [9]:
def remove_punctuation(text):
    table = re.sub(r'[^\w\s]', ' ', text)
    return table

def remove_underscore(text):
    table = re.sub(r'_', ' ', text)
    return table

def remove_extra_space(text):
    # remove extra space
    return re.sub(r'\s+', ' ', text)

In [10]:
df['posts'] = df['posts'].apply(lambda x: remove_punctuation(x))
df['posts'] = df['posts'].apply(lambda x: remove_underscore(x))
df['posts'] = df['posts'].apply(lambda x: remove_extra_space(x))
df.head()

Unnamed: 0,type,posts
0,INFJ,and intj moments sportscenter not top ten pla...
1,ENTP,I m finding the lack of me in these posts ver...
2,INTP,Good one course to which I say I know that s ...
3,INTJ,Dear INTP I enjoyed our conversation the othe...
4,ENTJ,You re fired That s another silly misconcepti...


### Tokenization

In [11]:
# from nltk.tokenize import word_tokenize

In [12]:
# def tokenization(text):
#     return word_tokenize(text)

In [13]:
# df['posts'] = df['posts'].apply(lambda x: tokenization(x))

In [14]:
df

Unnamed: 0,type,posts
0,INFJ,and intj moments sportscenter not top ten pla...
1,ENTP,I m finding the lack of me in these posts ver...
2,INTP,Good one course to which I say I know that s ...
3,INTJ,Dear INTP I enjoyed our conversation the othe...
4,ENTJ,You re fired That s another silly misconcepti...
...,...,...
8670,ISFP,just because I always think of cats as Fi dom...
8671,ENFP,So if this thread already exists someplace el...
8672,INTP,So many questions when i do these things I wo...
8673,INFP,I am very conflicted right now when it comes ...


## Split data into train and test

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X = df['posts']
y = df['type']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Vectorization

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

In [20]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [21]:
x_train = X_train.toarray()
x_test = X_test.toarray()

In [22]:
x_train

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.02961654, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

### Label Encoding

In [23]:
from sklearn.preprocessing import LabelEncoder

In [24]:
le = LabelEncoder()

In [25]:
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [26]:
y_test

array([11, 10, 11, ...,  8, 14, 11])

## Model Building

In [27]:
model_accuracy = {}

### Logistic Regression

In [29]:
lr = LogisticRegression()

In [30]:
lr.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
lr_pred = lr.predict(x_test)

In [32]:
lr_acc = lr.score(x_test, y_test)

In [33]:
model_accuracy['Logistic Regression'] = lr_acc

In [34]:
lr_acc

0.6530259365994237

In [35]:
from sklearn.metrics import classification_report

In [36]:
cr = classification_report(y_test, lr_pred, target_names=le.classes_, zero_division=0)

In [37]:
print(f"Classification Report For Logistic Regression:\n{cr}")

Classification Report For Logistic Regression:
              precision    recall  f1-score   support

        ENFJ       0.45      0.12      0.19        41
        ENFP       0.72      0.58      0.65       125
        ENTJ       0.70      0.36      0.48        44
        ENTP       0.72      0.59      0.65       135
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00         8
        ESTJ       0.00      0.00      0.00         7
        ESTP       1.00      0.13      0.24        15
        INFJ       0.64      0.69      0.67       288
        INFP       0.61      0.85      0.71       370
        INTJ       0.60      0.72      0.66       193
        INTP       0.68      0.81      0.74       293
        ISFJ       1.00      0.29      0.45        45
        ISFP       0.74      0.26      0.39        53
        ISTJ       0.69      0.25      0.37        44
        ISTP       0.74      0.46      0.57        67

    accuracy                     

### Random Forest

In [39]:
rf = RandomForestClassifier()

In [40]:
rf.fit(x_train, y_train)

In [41]:
rf_pred = rf.predict(x_test)

In [42]:
rf_acc = rf.score(x_test, y_test)

In [43]:
model_accuracy['Random Forest'] = rf_acc

In [44]:
rf_acc

0.5510086455331412

In [45]:
cr = classification_report(y_test, rf_pred, target_names=le.classes_, zero_division=0)

In [46]:
print(f"Classification Report For Random Forest:\n{cr}")

Classification Report For Random Forest:
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        41
        ENFP       0.68      0.36      0.47       125
        ENTJ       0.50      0.02      0.04        44
        ENTP       0.71      0.36      0.48       135
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00         8
        ESTJ       0.00      0.00      0.00         7
        ESTP       0.00      0.00      0.00        15
        INFJ       0.53      0.68      0.60       288
        INFP       0.47      0.86      0.60       370
        INTJ       0.57      0.56      0.57       193
        INTP       0.65      0.71      0.68       293
        ISFJ       0.00      0.00      0.00        45
        ISFP       0.67      0.04      0.07        53
        ISTJ       0.67      0.09      0.16        44
        ISTP       0.88      0.34      0.49        67

    accuracy                           

### XGBoost

In [48]:
xgb = XGBClassifier()

In [49]:
xgb.fit(X_train,y_train)

In [50]:
print(classification_report(y_test, xgb.predict(X_test), zero_division=0))

              precision    recall  f1-score   support

           0       0.50      0.20      0.28        41
           1       0.65      0.58      0.61       125
           2       0.54      0.43      0.48        44
           3       0.60      0.61      0.61       135
           4       0.33      0.14      0.20         7
           5       0.00      0.00      0.00         8
           6       0.60      0.43      0.50         7
           7       0.71      0.33      0.45        15
           8       0.64      0.64      0.64       288
           9       0.66      0.81      0.73       370
          10       0.62      0.68      0.65       193
          11       0.72      0.76      0.74       293
          12       0.78      0.47      0.58        45
          13       0.69      0.51      0.59        53
          14       0.72      0.52      0.61        44
          15       0.69      0.57      0.62        67

    accuracy                           0.66      1735
   macro avg       0.59   

In [51]:
xgb_acc = xgb.score(X_test, y_test)
model_accuracy['XGBoost'] = xgb_acc

### Naive Bayes

In [53]:
nb = MultinomialNB()

In [54]:
nb.fit(x_train, y_train)

In [55]:
nb_pred = nb.predict(x_test)

In [56]:
nb_acc = nb.score(x_test, y_test)

In [57]:
model_accuracy['Naive Bayes'] = nb_acc

### SVM

In [58]:
from sklearn.svm import SVC

In [59]:
svm = SVC()

In [60]:
svm.fit(x_train, y_train)

In [61]:
svm_pred = svm.predict(x_test)

In [62]:
svm_acc = svm.score(x_test, y_test)

In [63]:
model_accuracy['SVM'] = svm_acc

In [64]:
svm_acc

0.652449567723343

### CatBoost

In [71]:
cb = CatBoostClassifier(task_type='GPU')

In [72]:
cb.fit(x_train, y_train)

Learning rate set to 0.096365
0:	learn: 2.4250695	total: 273ms	remaining: 4m 32s
1:	learn: 2.2620977	total: 498ms	remaining: 4m 8s
2:	learn: 2.1428622	total: 731ms	remaining: 4m 3s
3:	learn: 2.0556491	total: 960ms	remaining: 3m 58s
4:	learn: 1.9809427	total: 1.19s	remaining: 3m 56s
5:	learn: 1.9215079	total: 1.42s	remaining: 3m 54s
6:	learn: 1.8562860	total: 1.65s	remaining: 3m 53s
7:	learn: 1.8130775	total: 1.87s	remaining: 3m 52s
8:	learn: 1.7685299	total: 2.12s	remaining: 3m 53s
9:	learn: 1.7221564	total: 2.36s	remaining: 3m 54s
10:	learn: 1.6833019	total: 2.6s	remaining: 3m 54s
11:	learn: 1.6453921	total: 2.84s	remaining: 3m 53s
12:	learn: 1.6148105	total: 3.06s	remaining: 3m 52s
13:	learn: 1.5862074	total: 3.28s	remaining: 3m 50s
14:	learn: 1.5535519	total: 3.48s	remaining: 3m 48s
15:	learn: 1.5305333	total: 3.68s	remaining: 3m 46s
16:	learn: 1.5115507	total: 3.88s	remaining: 3m 44s
17:	learn: 1.4848782	total: 4.13s	remaining: 3m 45s
18:	learn: 1.4623911	total: 4.34s	remaining: 3m

<catboost.core.CatBoostClassifier at 0x13a22037490>

In [73]:
cb_pred = cb.predict(x_test)

In [74]:
cb_acc = cb.score(x_test, y_test)

In [75]:
model_accuracy['CatBoost'] = cb_acc

In [76]:
cb_acc

0.6783861671469741

## Model Comparison

In [77]:
model_accuracy

{'Logistic Regression': 0.6530259365994237,
 'Random Forest': 0.5510086455331412,
 'XGBoost': 0.6570605187319885,
 'Naive Bayes': 0.37694524495677234,
 'SVM': 0.652449567723343,
 'CatBoost': 0.6783861671469741}

In [82]:
model_accuracy = pd.DataFrame(model_accuracy.items(), columns=['Model', 'Accuracy'])

In [85]:
model_accuracy.sort_values(by='Accuracy',ascending=False,ignore_index=True).style.background_gradient(cmap='Blues')

Unnamed: 0,Model,Accuracy
0,CatBoost,0.678386
1,XGBoost,0.657061
2,Logistic Regression,0.653026
3,SVM,0.65245
4,Random Forest,0.551009
5,Naive Bayes,0.376945
