## Myers–Briggs personality type predictor

Using the dataset from: [https://www.kaggle.com/datasnaek/mbti-type](https://www.kaggle.com/datasnaek/mbti-type)

### Importing the necessary modules

In [1]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.externals import joblib

### Creating utilities

In [2]:
def calc_distance(sentence1, sentence2):
    """
    Calculating the euclidean distance between
    two preprocessed sentences.

    Each input is scaled to unit length before the comparison, so the
    result depends on the direction of the vectors only, not on their
    magnitude.

    Parameters
    ----------
    sentence1, sentence2 : array-like of numbers
        Dense numeric vectors of the same shape.

    Returns
    -------
    float
        Norm of the difference between the two unit-length vectors.
        A vector whose norm is zero is left unscaled (all zeros)
        instead of being divided by zero, so the result is never
        ``nan`` merely because one input is empty.
    """
    def _to_unit_length(vector):
        # Scale `vector` to norm 1; a zero vector is returned unchanged.
        vector = np.asarray(vector)
        length = np.linalg.norm(vector)
        if length == 0:
            # Dividing by a zero norm would yield nan/inf entries.
            return vector
        return vector / length

    difference = _to_unit_length(sentence1) - _to_unit_length(sentence2)
    return np.linalg.norm(difference)


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it emits."""

    def build_analyzer(self, stemmer=None):
        """
        Build an analyzer that stems the tokens of the parent analyzer.

        Parameters
        ----------
        stemmer : object with a ``stem(word)`` method, optional
            Stemmer applied to each token. When omitted, an English
            ``SnowballStemmer`` is created.

        Returns
        -------
        callable
            Function taking a text and returning a generator of the
            stemmed tokens produced by the parent class's analyzer.
        """
        if stemmer is None:
            stemmer = SnowballStemmer('english')

        tokenize = super(StemmedTfidfVectorizer, self).build_analyzer()

        def stemmed_tokens(text):
            # The parent analyzer runs as soon as this is called; only the
            # stemming of individual tokens is deferred to iteration.
            return (stemmer.stem(token) for token in tokenize(text))

        return stemmed_tokens

### Loading the dataset

In [3]:
# Number of rows kept from the dataset; capped because of high RAM usage.
N = 1000

# Read the CSV, shuffle its rows, then keep only the first N of them.
df = shuffle(
    pd.read_csv('../data/mbti-myers-briggs-personality-types.csv')
).iloc[:N]

### Preprocessing

In [4]:
# Preview the first rows of the sampled dataset.
df.head()

Unnamed: 0,type,posts
8518,INTJ,'I know what you mean. The way I'm acting at t...
6211,ISTP,"'Ask questions, don't make statements. The big..."
5596,INFP,"'I dream most nights, or at least that's how I..."
1459,INTJ,'http://forums.oce.leagueoflegends.com/board/a...
5736,ISTP,"'I second this idea, you could also add sleepi..."


In [5]:
# One-hot encode the personality type labels.
type_encoder = OneHotEncoder()

# The encoder is fed a 2-D array, so the label column is reshaped
# into a single-column matrix with one row per sample.
y_encoded = type_encoder.fit_transform(df['type'].values.reshape(-1, 1))

# Dense copy of the encoded labels; the bare name displays it in the notebook.
y = y_encoded.toarray()
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [6]:
# TF-IDF vectorizer that stems tokens and drops English stop words.
vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')

# Maximum number of splits performed on each row's posts. With expand=True
# this yields up to from_nth_column + 1 columns; the last column holds the
# unsplit remainder of the text.
from_nth_column = 7  # get values only before this column

# Posts are separated by a literal '|||'. The pattern is a raw string so the
# backslashes reach the regex engine without triggering Python's
# invalid-escape-sequence warning.
new_X = df['posts'].str.split(pat=r'\|\|\|', expand=True, n=from_nth_column).values

# Show the first post of the first row.
new_X[0][0]

"'I know what you mean. The way I'm acting at the moment is the result of doing what you said and it serving no benefit whatsoever. Well, I tried my hardest anyway.   I have no problem showing..."