## In this file, I create a holdback set made up of single posts.
## Resources
[NLTK](https://stackabuse.com/text-classification-with-python-and-scikit-learn/)

In [None]:
import nltk
import pickle
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score, precision_score, classification_report, plot_confusion_matrix, roc_auc_score, roc_curve
from sklearn.naive_bayes import MultinomialNB

In [None]:
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

# Default figure size for every plot in this notebook.
plt.rcParams['figure.figsize'] = (9, 6)
# Seaborn theme: notebook context, white-grid style, fonts scaled up by 1.2.
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [None]:
# Fraction of rows kept on the training side; passed to both train_test_split calls below.
train_size = 0.8
# Seed passed to train_test_split so the splits are reproducible.
random_state = 56
# Vocabulary cap handed to CountVectorizer(max_features=...).
vectorizer_max_features = 1500
# Classifier class (not an instance); it is instantiated later with no arguments.
chosen_classifier = MultinomialNB

In [None]:
# Project-local helpers from functions.py: load_data_set provides the raw data,
# sanitize_posts is applied to post text before it is vectorized.
from functions import load_data_set, sanitize_posts
myers_briggs = load_data_set()

In [None]:
# Two-column frame: the personality type and that user's concatenated posts.
mb_df = pd.DataFrame(myers_briggs, columns=['type', 'posts'])
types = sorted(mb_df['type'].unique())

# Posts within a row are delimited by three or more pipes. The pattern is a
# raw string so the backslashes reach the regex engine untouched; the
# non-raw form relies on an invalid escape sequence that Python warns about.
post_list = [re.split(r'\|\|\|+', post) for post in mb_df['posts']]
# One row per user, one column per post; shorter rows are padded with missing values.
post_df = pd.DataFrame(post_list)
post_df.insert(loc=0, column='type', value=mb_df['type'])

# Rows of mb_df grouped by personality type.
posts_by_type = {typ: mb_df[mb_df['type'] == typ] for typ in types}

In [None]:
# Load vertical_posts.csv, using its first column as the index. This frame is
# only referenced by the commented-out alternative X, y assignment further down.
vertical_post_df = pd.read_csv('vertical_posts.csv', index_col=0)

## Split personalities into components.

In [None]:
# Keep only the fourth character of each type code as the 'JP' label.
mb_df['JP'] = mb_df['type'].map(lambda code: code[3])

# Narrow the frame to the label and the raw posts.
JP_df = mb_df[['JP', 'posts']]

In [None]:
# Class balance: how many rows carry each of the two labels.
J = int((JP_df['JP'] == 'J').sum())
P = int((JP_df['JP'] == 'P').sum())

plt.bar(['J', 'P'], [J, P])

In [None]:
# Features are the concatenated posts per user; the target is the J/P label.
X, y = JP_df['posts'], JP_df['JP']
# Alternative input, currently disabled: the frame loaded from vertical_posts.csv.
# X, y = vertical_post_df['posts'], vertical_post_df['type']

## Might want to remove URLs

In [None]:
# Carve off the holdback set first; everything after this line fits only on
# the train/validation portion.
X_train_val, X_holdback, y_train_val, y_holdback = train_test_split(
    X,
    y,
    train_size=train_size,
    random_state=random_state,
)

In [None]:
# Clean the train/validation posts with the project helper before splitting them again.
sanitized_train_test = sanitize_posts(X_train_val)

In [None]:
# Second split: divide the sanitized train/validation posts into the set used
# for fitting and the set used for validation, with the same ratio and seed.
X_train, X_val, y_train, y_val = train_test_split(
    sanitized_train_test,
    y_train_val,
    train_size=train_size,
    random_state=random_state,
)

In [None]:
# Bag-of-words counts: capped vocabulary, terms must appear in at least 5
# documents and in at most 70% of them, English stop words removed.
vectorizer = CountVectorizer(
    max_features=vectorizer_max_features,
    min_df=5,
    max_df=0.7,
    stop_words=stopwords.words('english'),
)
# Note: this rebinds X (previously the raw posts) to the dense count matrix
# of the training set.
X = vectorizer.fit_transform(X_train).toarray()

In [None]:
# Re-weight the training term counts with TF-IDF. The transformer fitted here
# is reused on the validation and holdback data below.
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X_tfidf = tfidfconverter.fit_transform(X).toarray()

In [None]:
# Instantiate the configured classifier with default settings and fit it on
# the TF-IDF features of the training set.
classifier = chosen_classifier()
classifier.fit(X_tfidf, y_train)

In [None]:
# Persist the fitted classifier. Only the classifier is written; the
# vectorizer and the TF-IDF transformer are not saved here.
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [None]:
# Read the classifier back from disk; `model` is used for the whole-user
# holdback predictions further down.
# NOTE: pickle.load can execute arbitrary code, so only load files this
# notebook wrote itself.
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [None]:
# Apply the already-fitted vectorizer and TF-IDF transformer (transform only,
# no re-fitting) to the validation posts, then predict their labels.
X_val_vectorized = vectorizer.transform(X_val)
X_val_tfidf = tfidfconverter.transform(X_val_vectorized)
y_pred = classifier.predict(X_val_tfidf)

In [None]:
# confusion_matrix is first imported here; the other two names repeat the
# imports at the top of the file.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Validation-set results: confusion matrix, per-class report, overall accuracy.
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))
print(accuracy_score(y_val, y_pred))

In [None]:
# Evaluate on the holdback set: sanitize, then vectorize and TF-IDF-weight
# with the objects fitted on the training data (transform only), and predict
# with the classifier that was loaded back from the pickle file.
sanitized_holdback = sanitize_posts(X_holdback)
X_holdback_vectorized = vectorizer.transform(sanitized_holdback)
X_holdback_tfidf = tfidfconverter.transform(X_holdback_vectorized)
y_pred2 = model.predict(X_holdback_tfidf)

# Holdback results: confusion matrix, per-class report, overall accuracy.
print(confusion_matrix(y_holdback, y_pred2))
print(classification_report(y_holdback, y_pred2))
print(accuracy_score(y_holdback, y_pred2)) 

In [None]:
# Run configuration followed by the whole-user holdback metrics.
print(chosen_classifier, vectorizer_max_features, train_size)

print("Accuracy:", accuracy_score(y_holdback, y_pred2))
# Two precision figures share the same label: micro-averaged first, then one
# value per class (average=None).
print("Precision:", precision_score(y_holdback, y_pred2, average='micro'))
print("Precision:", precision_score(y_holdback, y_pred2, average=None))
# Split the text report into lines; as the last expression of the cell, the
# resulting list is what the notebook displays.
cr = classification_report(y_holdback, y_pred2)
cr.split('\n')

In [None]:
# Score each holdback user once and reuse the scores for both the curve and
# the AUC. Column 1 of predict_proba is used as the score for label 'P'.
holdback_scores = classifier.predict_proba(X_holdback_tfidf)[:, 1]
fpr, tpr, thresholds = roc_curve(y_holdback, holdback_scores, pos_label='P')

# ROC curve plus the diagonal that a random classifier would follow.
plt.plot(fpr, tpr, lw=2)
plt.plot([0, 1], [0, 1], c='violet', ls='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for J-P');
print("ROC AUC score = ", roc_auc_score(y_holdback, holdback_scores))

## Run model on verticalized holdbacks

In [None]:
# Re-pair the holdback labels with their raw (unsanitized) posts. The 'type'
# column here holds the J/P label taken from y, not the four-letter type.
holdback_df = pd.DataFrame(zip(y_holdback, X_holdback), columns=('type', 'posts'))

In [None]:
# Break each holdback user's concatenated text into individual posts, which
# are delimited by three or more pipes. The pattern is a raw string so the
# backslashes reach the regex engine untouched; the non-raw form relies on an
# invalid escape sequence that Python warns about.
holdback_post_list = [re.split(r'\|\|\|+', post) for post in holdback_df['posts']]
# One row per user, one column per post; shorter rows are padded with missing values.
holdback_post_df = pd.DataFrame(holdback_post_list)
holdback_post_df.insert(loc=0, column='type', value=holdback_df['type'])

In [None]:
# Splits posts of holdback set into single posts.
def compress_posts(df):
    """Flatten a wide one-row-per-user frame into one row per post.

    Args:
        df: DataFrame with a 'type' column holding the label and every other
            column holding one post of that user. Cells for users with fewer
            posts are expected to be missing (None or NaN).

    Returns:
        DataFrame with columns 'type' and 'post', containing one row for each
        non-missing post, in row-major order (all posts of the first user,
        then the second, and so on). An empty input yields an empty frame
        with the same two columns.
    """
    post_columns = [col for col in df.columns if col != 'type']

    # Walk the rows as plain tuples instead of building a Series per cell
    # through df.iloc[i][j], and pair each row with its label positionally so
    # the result does not depend on the frame's index labels.
    labelled_rows = zip(
        df['type'],
        df[post_columns].itertuples(index=False, name=None),
    )

    # pd.notna skips both None and NaN padding, whichever the frame uses.
    result = [
        [label, post]
        for label, posts in labelled_rows
        for post in posts
        if pd.notna(post)
    ]

    return pd.DataFrame(result, columns=['type', 'post'])

# One row per individual holdback post, labelled with its author's J/P value.
vertical_holdback = compress_posts(holdback_post_df)

In [None]:
# Clean the individual posts. This rebinds sanitized_holdback, which earlier
# held the sanitized whole-user holdback posts.
sanitized_holdback = sanitize_posts(vertical_holdback['post'])

In [None]:
# Vectorize the single posts with the vocabulary learned on the training data.
vertical_X_holdback = vectorizer.transform(sanitized_holdback).toarray()
# Use transform, not fit_transform: the IDF weights must stay the ones fitted
# on the training data. Re-fitting on the holdback posts would feed the
# classifier features weighted differently from what it was trained on, and
# would also change tfidfconverter for every later cell that uses it.
vertical_X_holdback = tfidfconverter.transform(vertical_X_holdback).toarray()
pred_holdback = classifier.predict(vertical_X_holdback)

In [None]:
# Run configuration followed by the per-post holdback metrics.
print(chosen_classifier, vectorizer_max_features, train_size)

print("Accuracy:", accuracy_score(vertical_holdback['type'], pred_holdback))
# Two precision figures share the same label: micro-averaged first, then one
# value per class (average=None).
print("Precision:", precision_score(vertical_holdback['type'], pred_holdback, average='micro'))
print("Precision:", precision_score(vertical_holdback['type'], pred_holdback, average=None))
# Split the text report into lines; as the last expression of the cell, the
# resulting list is what the notebook displays.
cr = classification_report(vertical_holdback['type'], pred_holdback)
cr.split('\n')

In [None]:
# Confusion matrix for the per-post holdback predictions.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator; confirm
# against the installed version before upgrading.
plot_confusion_matrix(classifier, vertical_X_holdback, vertical_holdback['type'])
# Switch off the grid lines enabled by the seaborn whitegrid style.
plt.grid(False)
plt.show()

In [None]:
# Score each individual holdback post once and reuse the scores for both the
# curve and the AUC. Column 1 of predict_proba is used as the score for 'P'.
vertical_scores = classifier.predict_proba(vertical_X_holdback)[:, 1]
fpr, tpr, thresholds = roc_curve(vertical_holdback['type'], vertical_scores, pos_label='P')

# ROC curve plus the diagonal that a random classifier would follow.
plt.plot(fpr, tpr,lw=2)
plt.plot([0,1],[0,1],c='violet',ls='--')
plt.xlim([-0.05,1.05])
plt.ylim([-0.05,1.05])

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for J-P');
# The AUC must be computed on the same per-post labels and scores as the
# curve above; the earlier version reported the whole-user holdback AUC here.
print("ROC AUC score = ", roc_auc_score(vertical_holdback['type'], vertical_scores))

In [None]:
# Spot check on a single free-form bio written directly in the notebook.
albert = "Hi, everyone!  I’m a San Francisco native who attended Caltech in Pasadena and has spent time all over the country.  My favorite cities are San Francisco, Boston, Raleigh, and Denver.  I am a bootcamp veteran, having acquired a skill set in web development, and where I, amazingly, met Josh Shaman who now works for Metis.  I bike, play piano, and dance in my spare time."

# The text is vectorized as-is (sanitize_posts is not applied here),
# TF-IDF-weighted with the shared transformer, and classified.
trans_albert = vectorizer.transform([albert]).toarray()
trans_albert = tfidfconverter.transform(trans_albert).toarray()
classifier.predict(trans_albert)