In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report
import pickle
import os.path
import plotly.graph_objs as go


DATASET

Content :

~106K records of preprocessed posts and their authors' personality types.

Posts are equal-sized: 500 words per sample 

About the Dataset :

Posts are preprocessed texts:

- Punctuation, stopwords, and URLs have been removed

- Words have been lemmatized

- Samples have been reconstructed into equal-sized chunks (500 words per sample)

The personality type label takes 16 unique values

In [39]:
# Reading the dataset.
# The CSV location can be overridden through the MBTI_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset it falls back to the original local path.
DATA_PATH = os.environ.get('MBTI_DATA_PATH', 'C:\\Users\\namit\\Downloads\\MBTI 500.csv')
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ


In [40]:
# Show the full text of the first post
df['posts'].iloc[0]

'know intj tool use interaction people excuse antisocial truly enlighten mastermind know would count pet peeze something time matter people either whether group people mall never see best friend sit outside conversation jsut listen want interject sit formulate say wait inject argument thought find fascinate sit watch people talk people fascinate sit class watch different people find intrigue dad intj u stand look like line safeway watch people home talk people like think military job people voluntarily go job important show deference endanger live glorify way civilian think pretty ignorant general think military necessary defense mechanism political tactic feel like u specifically invest much money could put money education whatnot though personally sound budget aernative really comment one way base two politician eye year ago come name somewhat important kinda role model nowadays pick keep score individual level mean little vary accord number condition day may score high others low sw

In [41]:
# Distinct personality-type labels, in order of first appearance
pd.unique(df['type'])

array(['INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP', 'ENFJ', 'ENFP',
       'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP', 'INFJ', 'INFP'],
      dtype=object)

In [42]:
# Number of missing values per column
df.isna().sum()

posts    0
type     0
dtype: int64

In [43]:
# Counting the number of times each type appears in the dataset.
# groupby(...).count() gives the non-null count per column for every type;
# the 'posts' column is used as the bar height.
df_bar_chart = df.groupby('type').count()

trace1 = go.Bar(x=df_bar_chart.index, y=df_bar_chart['posts'])

data = [trace1]
# Axis titles are set so the figure can be read on its own.
layout = go.Layout(
    title='MBTI # Classified Posts per Type',
    xaxis=dict(title='Personality type'),
    yaxis=dict(title='Number of posts'),
)

fig = go.Figure(data=data, layout=layout)

fig.show()

In [44]:
# Default: reuse a previously saved model instead of retraining
recreate_model = False

In [45]:
# File the trained pipeline is pickled to and restored from
filename = "mbti_svm_v2.sav"

In [46]:
# Force a retrain when no saved model file exists at `filename`
recreate_model = recreate_model or not os.path.isfile(filename)

In [47]:
# 80/20 train/test split; the fixed random_state keeps the partition reproducible
X, y = df['posts'], df['type']  # features, labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [48]:
# Train a fresh TF-IDF + LinearSVC pipeline, or restore the one saved at `filename`.
if recreate_model:
    # Pipelining the vectorizer and the classifier
    text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
    text_clf.fit(X_train, y_train)

    # Saving the model to disk; the context manager closes the file handle
    # even if pickling raises.
    with open(filename, 'wb') as model_file:
        pickle.dump(text_clf, model_file)
else:
    # Loading the model from disk.
    # NOTE(review): pickle.load can execute arbitrary code -- only load a file
    # that this notebook wrote itself.
    # NOTE(review): assumes the saved model was fitted on this same training
    # split; otherwise the test metrics below are not a fair evaluation -- confirm.
    with open(filename, 'rb') as model_file:
        text_clf = pickle.load(model_file)

In [49]:
# Predicting labels for the held-out 20% test split (X_test) with the fitted or loaded pipeline
predictions = text_clf.predict(X_test)

In [50]:
# Per-class precision / recall / F1 for the test-split predictions
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

        ENFJ       0.84      0.58      0.69       319
        ENFP       0.82      0.78      0.80      1249
        ENTJ       0.90      0.80      0.84       577
        ENTP       0.86      0.83      0.84      2324
        ESFJ       0.83      0.45      0.59        33
        ESFP       0.88      0.48      0.62        75
        ESTJ       0.90      0.84      0.87       105
        ESTP       0.95      0.90      0.92       398
        INFJ       0.81      0.84      0.83      2954
        INFP       0.80      0.82      0.81      2391
        INTJ       0.83      0.87      0.85      4531
        INTP       0.84      0.87      0.86      5033
        ISFJ       0.80      0.61      0.69       132
        ISFP       0.81      0.60      0.69       161
        ISTJ       0.86      0.68      0.76       253
        ISTP       0.89      0.79      0.84       679

    accuracy                           0.84     21214
   macro avg       0.85   

In [51]:
# Fraction of test posts whose predicted type matches the true type, rounded to 2 decimals
overall_accuracy = round(metrics.accuracy_score(y_test, predictions), 2)
print(f"Overall accuracy of the model: {overall_accuracy}")

Overall accuracy of the model: 0.84


In [52]:
# Smoke test: classify a single hand-written sentence
sample_posts = ['I am a big fan of the MBTI']
prediction = text_clf.predict(sample_posts)
print(prediction)

['ESTP']
