## Resources
[spaCy][https://www.analyticsvidhya.com/blog/2017/04/natural-language-processing-made-easy-using-spacy-%E2%80%8Bin-python/]

In [73]:
import re
import psycopg2 as pg
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import string
punctuations = string.punctuation

from spacy.lang.en import English

In [12]:
def pg_fetch_all(conn, script):
    cursor = conn.cursor()
    try:
        cursor.execute(script)
        conn.commit()
        contents = cursor.fetchall()
    except (Exception, pg.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        contents = 1
    cursor.close()

    return contents

In [17]:
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

parser = English()

#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic utility function to clean the text 
def clean_text(text):     
    return text.strip().lower()

## Connect to the database and load all of the posts.

In [18]:
# Postgres info to connect
connection_args = {
    'host': 'localhost',  # We are connecting to our _local_ version of psql
    'dbname': 'myersbriggs',    # DB that we are connecting to
    'port': 5432          # port we opened on AWS
}

connection = pg.connect(**connection_args)  # What is that "**" there??

In [19]:
postgreSQL_select_Query = "SELECT * FROM raw_data;"

myers_briggs = pg_fetch_all(connection, postgreSQL_select_Query)

## For loading posts by personality type from database.

In [None]:
# get_posts_by_type = "SELECT posts FROM raw_data WHERE type = 'ENTJ';"

# posts_by_type = pg_fetch_all(connection, get_posts_by_type)

In [20]:
mb_df = pd.DataFrame(myers_briggs)

post_list = [re.split('\|\|\|+', post) for post in mb_df[1]]
post_df = pd.DataFrame(post_list)
post_df.insert(loc=0, column='type', value=mb_df[0])

In [21]:
mb_df = pd.DataFrame(myers_briggs, columns=['type', 'posts'])
types = sorted(mb_df['type'].unique())

posts_by_type = {typ: mb_df[mb_df['type'] == typ] for typ in types}

In [22]:
connection.close()

In [74]:
#Create spacy tokenizer that parses a sentence and generates tokens
#these can also be replaced by word vectors 
def spacy_tokenizer(sentence):
    tokens = parser(sentence)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]

    return tokens

#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1,1))
classifier = LinearSVC()

In [None]:
# reformat data into [(text, type)]

In [75]:
X_train, X_test, y_train, y_test = train_test_split(mb_df['posts'], mb_df['type'])

train = []
test = []
for i in range(len(X_train)):
    train.append((X_train.iloc[i], y_train.iloc[i]))

for i in range(len(X_test)):
    test.append((X_test.iloc[i], y_test.iloc[i]))

In [78]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

# Create model and measure accuracy
pipe.fit([x[0] for x in train], [x[1] for x in train]) 
pred_data = pipe.predict([x[0] for x in test]) 
for (sample, pred) in zip(test, pred_data):
    print(sample, pred)
print("Accuracy:", accuracy_score([x[1] for x in test], pred_data))

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [63]:
print("Precision:", precision_score([x[1] for x in test], pred_data, average='micro'))

Precision: 0.5790686952512679


In [64]:
print("Precision:", precision_score([x[1] for x in test], pred_data, average=None))

Precision: [0.52631579 0.58490566 0.41304348 0.4875     0.66666667 0.
 1.         0.88888889 0.59102902 0.64259928 0.56630824 0.57065217
 0.66666667 0.46938776 0.44117647 0.53424658]


In [68]:
cr = classification_report([x[1] for x in test], pred_data)

In [71]:
cr.split('\n')

['              precision    recall  f1-score   support',
 '',
 '        ENFJ       0.53      0.23      0.32        43',
 '        ENFP       0.58      0.53      0.56       176',
 '        ENTJ       0.41      0.37      0.39        51',
 '        ENTP       0.49      0.47      0.48       165',
 '        ESFJ       0.67      0.33      0.44         6',
 '        ESFP       0.00      0.00      0.00         4',
 '        ESTJ       1.00      0.08      0.14        13',
 '        ESTP       0.89      0.26      0.40        31',
 '        INFJ       0.59      0.63      0.61       354',
 '        INFP       0.64      0.75      0.69       473',
 '        INTJ       0.57      0.59      0.58       266',
 '        INTP       0.57      0.63      0.60       333',
 '        ISFJ       0.67      0.40      0.50        50',
 '        ISFP       0.47      0.32      0.38        72',
 '        ISTJ       0.44      0.33      0.38        45',
 '        ISTP       0.53      0.45      0.49        87',
 '',
 '  