In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

import nltk
from nltk.stem import PorterStemmer
ps = nltk.PorterStemmer()
from nltk.stem import WordNetLemmatizer 
lm = WordNetLemmatizer()
stopwords =nltk.corpus.stopwords.words('english')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import re
import string

import joblib


In [2]:
# import csv
full_data = pd.read_csv("mbti_1.csv")

# selecting random percentage of rows because of memory issues
data = full_data.sample(frac = 0.1)
data.columns = ['type', 'posts']

In [3]:
# import string
# #calculating the average post length
# data['avg_post_len'] = data['posts'].apply(lambda x: (len(x) - x.count(" "))/50)
# data

In [4]:
# #calculating the total post length
# data['tot_post_len'] = data['posts'].apply(lambda x: len(x) - x.count(" "))
# data

In [5]:
# # calculating the punctuation percentage
# def punct_count(post):
#     count = sum([1 for char in post if char in string.punctuation])
#     return round(count/(len(post) - post.count(" ")), 3)*100

# data['punct_%'] = data['posts'].apply(lambda x: punct_count(x))
# data

In [6]:
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,type,posts
0,INFP,'I'm itching so hard for that 300 What's your ...
1,ISFJ,Thank you! I am now quite sure that i am not x...
2,INFP,"'To be honest, I just stopped looking for rela..."
3,INFJ,'(Animated Music Video) Share your favorite or...
4,INTJ,'Yes! I luckily managed to get home from uni b...
...,...,...
863,INTJ,"'The naked women chase was the best, I think, ..."
864,INFJ,'It is definitely interesting to see the diffe...
865,INFJ,'Yea I should have adressed I just meant in pu...
866,ENFP,'a space whale - The only reason I worded it l...


In [7]:
def clean_posts(post):
    post = "".join([word.lower()for word in post if word not in string.punctuation])
    tokens = re.split('\W+', post)
    post = [lm.lemmatize(word) for word in tokens if word not in stopwords]
    return post

#data['cleaned_posts'] = data['posts'].apply(lambda x: clean_posts(x))

In [8]:
# removing piping
data['posts']= data['posts'].str.replace('|',' ')

# removing '
data['posts']= data['posts'].str.replace("'",'')

# removing url's from posts
data['posts'] = data['posts'].str.replace('http\S+|www.\S+', '', case=False)

# change case to lower
data['posts'] = data['posts'].str.lower()


#remove punctuation from posts

def remove_punctuation(text):
    no_punctuation = "".join([char for char in text if char not in string.punctuation])
    return no_punctuation

data['body_text_clean'] = data['posts'].apply(lambda x: remove_punctuation(x))

# pulling types from type column
mbti_types = data['type'].unique()

# types to list instead of array
mbti_list = mbti_types.tolist()

# lowercasing types
mbti_new = [x.lower() for x in mbti_list]

# remove references to mbti type in body_text_clean column

for item in mbti_new:
    data['body_text_clean'] = data['body_text_clean'].str.replace(item , "")
    
# # apply word_tokenize to all records
# from nltk.tokenize import word_tokenize

# data['tokenized'] = data['body_text_clean'].apply(word_tokenize)


# # remove stopwords
# def stopword_removal(text):
#     stop_words = [item for item in text if item not in stopwords]
#     return stop_words

# data['stopwords'] = data['tokenized'].apply(lambda x: stopword_removal(x))

def lemma_words(lemma):
    lemmatize = [lm.lemmatize(word) for word in lemma]
    return lemmatize

data['lemmatized'] = data['body_text_clean'].apply(lambda x: lemma_words(x))



In [9]:
data.head()

Unnamed: 0,type,posts,body_text_clean,lemmatized
0,INFP,im itching so hard for that 300 whats your pro...,im itching so hard for that 300 whats your pro...,"[i, m, , i, t, c, h, i, n, g, , s, o, , h, ..."
1,ISFJ,thank you! i am now quite sure that i am not x...,thank you i am now quite sure that i am not xn...,"[t, h, a, n, k, , y, o, u, , i, , a, m, , ..."
2,INFP,"to be honest, i just stopped looking for relat...",to be honest i just stopped looking for relati...,"[t, o, , b, e, , h, o, n, e, s, t, , i, , ..."
3,INFJ,(animated music video) share your favorite or ...,animated music video share your favorite or ju...,"[a, n, i, m, a, t, e, d, , m, u, s, i, c, , ..."
4,INTJ,yes! i luckily managed to get home from uni be...,yes i luckily managed to get home from uni bef...,"[y, e, s, , i, , l, u, c, k, i, l, y, , m, ..."


In [None]:
data.head()

In [None]:
from sklearn.model_selection import train_test_split
X = data['posts'].values
y = data['type'].values

X_train, X_test, y_train, y_test = train_test_split(X, y) 


In [None]:
X_train

In [None]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
        ('vect',CountVectorizer(analyzer = 'word')),
        ('clf', RandomForestClassifier())
        
])

pipe_parms = [{
    'clf__n_estimators' : [600,800],
    'clf__max_depth' : [None,200]
}]

pipe.fit(X_train, y_train)

In [None]:
# Predict training data
y_train_pred = pipe.predict(X_train)
print(f"Predictions on training data: {y_train_pred}")


In [None]:
# Predict test data
y_test_pred = pipe.predict(X_test)
print(f"Predictions on test data: {y_test_pred}")

In [None]:
pipe.score(X_test, y_test)

In [None]:
gs = GridSearchCV(pipe, param_grid= pipe_parms, cv=5)

In [None]:
gs.fit(X_train, y_train)

In [None]:
print(gs.best_params_)

In [None]:
print(gs.best_score_)

In [None]:
import pickle


In [None]:
pickle.dumps(clean_posts)

In [None]:
# yo uname the file here
with open('mbti_model.pickle', 'wb') as f:
     pickle.dump(pipe, f)

In [None]:
# # yo uname the file here
# with open('mbti_model.pickle', 'wb') as f:
#     pickle.dump(pipe, f)

In [None]:
# with open('picklefile.pickle', 'rb') as f:
#     loaded_vars = pickle.load(f)

In [None]:
# loaded_vars

In [None]:
# gs.predict(X_count_feature)

In [None]:
# pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()