## Myers–Briggs personality type predictor

Using the dataset from: [https://www.kaggle.com/datasnaek/mbti-type](https://www.kaggle.com/datasnaek/mbti-type)

### Importing the necessary modules

In [1]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.externals import joblib

### Creating utilities

In [2]:
def calc_distance(sentence1, sentence2):
    """
    Calculating the euclidean distance between
    two preprocessed sentences.
    """
    s1_normalized = sentence1 / np.linalg.norm(sentence1)
    s2_normalized = sentence2 / np.linalg.norm(sentence2)
    return np.linalg.norm(s1_normalized - s2_normalized)


class StemmedTfidfVectorizer(TfidfVectorizer):
    
    def build_analyzer(self, stemmer=None):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        
        if stemmer is None:
            stemmer = SnowballStemmer('english')
        
        return lambda text: (stemmer.stem(w) for w in analyzer(text))

### Loading the dataset

In [3]:
N = 1000  # number of elements to use from the dataset, because of high ram usage  
df = shuffle( pd.read_csv('../data/mbti-myers-briggs-personality-types.csv') )[:N]

### Preprocessing

In [4]:
df.head()

Unnamed: 0,type,posts
5536,INTJ,'Young enough to still find my laughter lines ...
6656,INFP,'https://www.youtube.com/watch?v=LMdrk-vCACI||...
4375,INTJ,"'It was too easy when I was younger, so I stop..."
4272,INFP,'I would rather have teeth come out of my eyes...
7774,INFP,'something in the way- nirvana https://www.y...


In [5]:
type_encoder = OneHotEncoder()
y_encoded = type_encoder.fit_transform( np.array([df['type'].values]).T )
y = y_encoded.toarray()
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
from_nth_column = 7  # get values only before this column
new_X = df['posts'].str.split(pat='\|\|\|', expand=True, n=from_nth_column).values
new_X[:1][0][0]

"'I know what you mean. The way I'm acting at the moment is the result of doing what you said and it serving no benefit whatsoever. Well, I tried my hardest anyway.   I have no problem showing..."

In [10]:
vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')
X_encoded = vectorizer.fit_transform(df['posts'].values)

In [13]:
X = X_encoded.toarray()
print(X[0], X.shape)

[0. 0. 0. ... 0. 0. 0.] (1000, 30109)


In [17]:
print(y.shape, X.shape)
print(y[0], X[0])
print(type(y), type(X))

(1000, 16) (1000, 30109)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.] [0. 0. 0. ... 0. 0. 0.]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [22]:
from sklearn.neural_network import MLPClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

nn = MLPClassifier(learning_rate_init=.01, hidden_layer_sizes=(500,), max_iter=100)
nn.fit(X_train, y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(500,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=100, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [23]:
print('Error on test', mean_squared_error(y_test, nn.predict(X_test)))
print('Error on train', mean_squared_error(y_train, nn.predict(X_train)))

Error on test 0.061875
Error on train 0.0


In [26]:
joblib.dump(nn, '../trained/nn.pkl')

['../trained/nn.pkl']