# Personality predictor

[Source](https://www.kaggle.com/datasnaek/mbti-type)

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
sns.set({'figure.figsize':(10,10,)})

### Try to know our data

In [15]:
df = pd.read_csv('../../../Datasets/mbti-myers-briggs-personality-type-dataset/mbti_1.csv')
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


The ```type``` feature contains our classes and the ```posts``` feature is our independent variable.

In [16]:
print(df.values.shape)
print()
print(df['type'].unique())
print()
print(df['posts'].values.shape)

(8675, 2)

['INFJ' 'ENTP' 'INTP' 'INTJ' 'ENTJ' 'ENFJ' 'INFP' 'ENFP' 'ISFP' 'ISTP'
 'ISFJ' 'ISTJ' 'ESTP' 'ESFP' 'ESTJ' 'ESFJ']

(8675,)


### Preprocessing

Now, let's preprocess our text based data.

In [17]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
y = encoder.fit_transform(df['type'].values[:1000].reshape(-1,1)).todense()
y[:5]

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['posts'].values[:1000]).todense()

Free up some memory

### Building the model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

model = MLPClassifier(learning_rate_init=.01, hidden_layer_sizes=(100,), max_iter=25, verbose=True)
model.fit(X_train, y_train)

Iteration 1, loss = 17.51905717
Iteration 2, loss = 11.71368446
Iteration 3, loss = 5.72646146
Iteration 4, loss = 5.27152033
Iteration 5, loss = 2.15220956
Iteration 6, loss = 1.15913637
Iteration 7, loss = 0.46728273
Iteration 8, loss = 0.25269141
Iteration 9, loss = 0.14872565
Iteration 10, loss = 0.06892947
Iteration 11, loss = 0.04326724
Iteration 12, loss = 0.03420252
Iteration 13, loss = 0.02486416
Iteration 14, loss = 0.01911958
Iteration 15, loss = 0.01665940
Iteration 16, loss = 0.01506680
Iteration 17, loss = 0.01361335
Iteration 18, loss = 0.01228256
Iteration 19, loss = 0.01142639
Iteration 20, loss = 0.01066347
Iteration 21, loss = 0.01014931
Iteration 22, loss = 0.00961894
Iteration 23, loss = 0.00929720
Iteration 24, loss = 0.00893305
Iteration 25, loss = 0.00864921




MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=25, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)

In [19]:
from sklearn.metrics import mean_squared_error as mse

print('Train error:', mse(y_train, model.predict(X_train)))
print('Test error:', mse(y_test, model.predict(X_test)))

Train error: 0.0
Test error: 0.0625


In [38]:
sample = vectorizer.transform(df['posts'].iloc[:1])
y_hat = model.predict(sample)

print('The prediction is:',
      encoder.inverse_transform(y_hat)[0][0],
      'and the real class is',
      df['type'].iloc[:1].values[0]
)
print('\n')
print(df['posts'].values[0].split('|||')[-3])

The prediction is: INFJ and the real class is INFJ


I failed a public speaking class a few years ago and I've sort of learned what I could do better were I to be in that position again. A big part of my failure was just overloading myself with too...


In [39]:
joblib.dump(model, '../trained_models/nogit_personality_predictor.pkl')

['../trained_models/nogit_personality_predictor.pkl']