## Resources
[spaCy](https://www.analyticsvidhya.com/blog/2017/04/natural-language-processing-made-easy-using-spacy-%E2%80%8Bin-python/)

In [None]:
import os
import re
import string
import time

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
# ENGLISH_STOP_WORDS is exposed by `sklearn.feature_extraction.text`; the old
# `sklearn.feature_extraction.stop_words` module was deprecated and later removed,
# so importing from `text` works on both old and current scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from spacy.lang.en import English

# Punctuation characters as one plain string. It is deliberately left as a string:
# spacy_tokenizer tests tokens against it with `in`, i.e. a substring check.
punctuations = string.punctuation

In [None]:
# Tunable settings for one experiment run.
# Upper bound on vocabulary size, passed to CountVectorizer(max_features=...).
vectorizer_max_features = 1500
# Estimator *class* (not an instance); it is instantiated later with default arguments.
chosen_classifier = RandomForestClassifier
# Passed to train_test_split(train_size=...); a float here is a fraction of the rows.
train_size = 0.01

In [None]:
# `load_data_set` comes from the project-local `functions` module; its return value is
# not visible here. It is later wrapped in pd.DataFrame(..., columns=['type', 'posts']).
from functions import load_data_set
myers_briggs = load_data_set()

In [None]:
# Ask the inline backend for SVG figures and enable inline plotting.
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

# Default figure size and seaborn theme applied to the plots in this notebook.
plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

# spaCy English language object; spacy_tokenizer calls it on raw text.
parser = English()

# Custom pipeline step that normalises raw text with clean_text
class predictors(TransformerMixin):
    """Stateless transformer that applies ``clean_text`` to every input text.

    It learns nothing from the data, so ``fit`` is a no-op and ``get_params``
    reports no parameters.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each text in ``X``."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Return an empty dict: this step has no tunable parameters."""
        return dict()

# Basic text normaliser used by the pipeline's cleaning step
def clean_text(text):
    """Return ``text`` with surrounding whitespace removed and letters lower-cased."""
    trimmed = text.strip()
    return trimmed.lower()

## For loading posts by personality type from database.

In [None]:
# get_posts_by_type = "SELECT posts FROM raw_data WHERE type = 'ENTJ';"

# posts_by_type = pg_fetch_all(connection, get_posts_by_type)

In [None]:
# One row per loaded record, with the personality label and the raw posts string.
mb_df = pd.DataFrame(myers_briggs, columns=['type', 'posts'])
types = sorted(mb_df['type'].unique())

# Split each `posts` string on runs of three or more pipe characters.
# The pattern is a raw string so that `\|` reaches the regex engine untouched
# instead of being treated as an invalid string escape by Python.
post_list = [re.split(r'\|\|\|+', post) for post in mb_df['posts']]

# Wide frame: one column per split piece, with the label inserted as the first column.
post_df = pd.DataFrame(post_list)
post_df.insert(loc=0, column='type', value=mb_df['type'])

# Map each personality type to the subset of mb_df rows carrying that label.
posts_by_type = {typ: mb_df[mb_df['type'] == typ] for typ in types}

In [None]:
# start_time = time.time()

# # Convert post_df to a two-column data set.
# def compress_posts(df):
#     alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
#     punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
#     result = []
#     df_length = range(len(df))

#     for i in df_length:
#         for j in range(1, 59):
#             if df.iloc[i][j] != None:
#                 cell = punc_lower(alphanumeric(df.iloc[i][j]))
#                 if cell != None:
#                     result.append([df['type'][i], cell])
    
#     return pd.DataFrame(result)

# vertical_post_df = compress_posts(post_df)
# vertical_post_df.columns = ['type', 'posts']

# elapsed_time = time.time() - start_time
# os.system('say "your program took %s seconds"' % int(elapsed_time))
# print("--- %s seconds ---" % elapsed_time)

In [None]:
# Load the posts table from disk; the first CSV column becomes the index.
# Later cells read its 'posts' and 'type' columns.
# NOTE(review): presumably the saved output of the commented-out compress_posts cell — confirm.
vertical_post_df = pd.read_csv('vertical_posts.csv', index_col=0)

In [None]:
from collections import Counter

# Tally how many rows of mb_df carry each personality type.
personality_count = Counter(mb_df['type'])

# Sorted type labels, with their counts aligned index-for-index.
personality_types = sorted(personality_count)
post_count = [personality_count[label] for label in personality_types]

In [None]:
# Bar chart of the per-type counts, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.bar(personality_types, post_count)
# Rotate the type labels so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Personality Type')
ax.set_ylabel('Number of Posts')

In [None]:
# Create a spaCy tokenizer that parses a sentence and generates tokens;
# these can also be replaced by word vectors
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with spaCy and return normalised, filtered tokens.

    Each token is reduced to its lower-cased, stripped lemma. Two cases fall
    back to the lower-cased surface form instead: the spaCy 2.x pronoun
    placeholder ``"-PRON-"``, and an empty lemma (what a pipeline without a
    lemmatizer component produces in spaCy 3.x). Stop words and punctuation
    are then removed.

    Args:
        sentence: Text to tokenize.

    Returns:
        list of str: The remaining tokens, in document order.
    """
    normalised = []
    for tok in parser(sentence):
        lemma = tok.lemma_
        if lemma == "-PRON-":
            # Pronoun placeholder: keep the word as written, lower-cased.
            normalised.append(tok.lower_)
        else:
            # No lemma available -> use the surface form rather than emitting "".
            normalised.append((lemma or tok.lower_).lower().strip())

    # `punctuations` is a plain string, so `in` is a substring test: it drops
    # punctuation tokens and also the empty strings left by whitespace tokens.
    return [tok for tok in normalised if tok not in stopwords and tok not in punctuations]

# Unigram count vectorizer that tokenizes with the custom spaCy tokenizer
vectorizer = CountVectorizer(
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 1),
    max_features=vectorizer_max_features,
)
# Default-configured instance of the selected estimator class
classifier = chosen_classifier()

In [None]:
# reformat data into [(text, type)]

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(mb_df['posts'], mb_df['type'])
# Fixed seed so the same rows land in train/test on every run, which keeps
# scores comparable between experiments.
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    vertical_post_df['posts'],
    vertical_post_df['type'],
    train_size=train_size,
    random_state=split_seed,
)

# Pair each text with its label: [(text, type), ...].
# zip walks both Series in row order, avoiding a slow per-row .iloc lookup.
train = list(zip(X_train, y_train))
test = list(zip(X_test, y_test))

In [None]:
start_time = time.time()

# Three-stage pipeline: clean the text, vectorize it, then classify
pipeline_steps = [
    ("cleaner", predictors()),
    ('vectorizer', vectorizer),
    ('classifier', classifier),
]
pipe = Pipeline(pipeline_steps)

# Fit on the training pairs, then predict labels for the held-out texts
train_texts = [sample[0] for sample in train]
train_labels = [sample[1] for sample in train]
pipe.fit(train_texts, train_labels)

test_texts = [sample[0] for sample in test]
pred_data = pipe.predict(test_texts)
# for (sample, pred) in zip(test, pred_data):
#     print(sample, pred)

# Wall-clock time for building, fitting and predicting
elapsed_time = time.time() - start_time
# Spoken notification through the shell's `say` command, plus a printed copy
os.system('say "your program took %s seconds"' % int(elapsed_time))
print("--- %s seconds ---" % elapsed_time)

test_labels = [sample[1] for sample in test]
print("Accuracy:", accuracy_score(test_labels, pred_data))

In [None]:
# Vocabulary terms learned by the fitted vectorizer, as a plain list of strings.
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()`; prefer the new method and fall back on old releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

In [None]:
# Echo the run configuration together with the measured wall-clock time
print(chosen_classifier, vectorizer_max_features, train_size, elapsed_time)

# True labels of the held-out samples, extracted once and reused below
y_true = [label for _, label in test]

# First line: micro-averaged precision; second line: one precision value per class
print("Precision:", precision_score(y_true, pred_data, average='micro'))
print("Precision:", precision_score(y_true, pred_data, average=None))

# Text report, shown as a list of lines via the cell's final expression
cr = classification_report(y_true, pred_data)
cr.split('\n')

In [None]:
# Display the full list of vocabulary terms kept by the vectorizer
# (at most `vectorizer_max_features` entries).
feature_names