## In this file, I create holdback set with single posts.
## Resources
[NLTK][https://stackabuse.com/text-classification-with-python-and-scikit-learn/]

In [1]:
import nltk
import pickle
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# nltk.download('stopwords')
# nltk.download('wordnet')

In [2]:
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [3]:
train_size = 0.8
vectorizer_max_features = 1500
chosen_classifier = RandomForestClassifier

In [4]:
from functions import load_data_set
myers_briggs = load_data_set()

In [5]:
mb_df = pd.DataFrame(myers_briggs, columns=['type', 'posts'])
types = sorted(mb_df['type'].unique())

post_list = [re.split('\|\|\|+', post) for post in mb_df['posts']]
post_df = pd.DataFrame(post_list)
post_df.insert(loc=0, column='type', value=mb_df['type'])

posts_by_type = {typ: mb_df[mb_df['type'] == typ] for typ in types}

In [6]:
vertical_post_df = pd.read_csv('vertical_posts.csv', index_col=0)

In [7]:
X, y = mb_df['posts'], mb_df['type']
# X, y = vertical_post_df['posts'], vertical_post_df['type']

## Might want to remove URLs

In [8]:
X_train_val, X_holdback, y_train_val, y_holdback = train_test_split(X, y)

In [9]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X_train_val)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X_train_val.iloc[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)

In [10]:
vectorizer = CountVectorizer(max_features=vectorizer_max_features, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(X_train_val).toarray()

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y_train_val, train_size=train_size, random_state=0)

In [13]:
# classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier = chosen_classifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[  0   2   0   0   0   0   0   0   9  12   2   1   0   0   0   1]
 [  0  39   0   1   0   0   0   0  17  36   8   5   0   0   0   2]
 [  0   0   7   2   0   0   0   0   6   5   3   3   0   0   0   0]
 [  0   4   0  44   0   0   0   0  12  17   6  14   0   0   1   0]
 [  0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0]
 [  0   0   1   0   0   0   0   0   3   3   1   1   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   1   0   1   0   0   0   0]
 [  0   1   1   2   0   0   0   0   1   2   3   3   0   0   0   1]
 [  0   3   0   4   0   0   0   0 148  50   7   4   0   0   0   0]
 [  0   1   0   0   0   0   0   0  24 264   3   7   0   1   0   0]
 [  0   2   0   3   0   0   0   0  21  22 100  22   0   0   1   0]
 [  0   1   0   3   0   0   0   0   7  27  10 151   0   0   0   0]
 [  0   1   0   1   0   0   0   0   4   8   1   2   4   1   0   0]
 [  0   0   0   1   0   0   0   0   8  20   2   1   0   6   0   0]
 [  0   2   0   0   0   0   0   0   4   9   5   6   0   0   3 

In [15]:
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [16]:
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [17]:
y_pred2 = model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2)) 

[[  0   2   0   0   0   0   0   0   9  12   2   1   0   0   0   1]
 [  0  39   0   1   0   0   0   0  17  36   8   5   0   0   0   2]
 [  0   0   7   2   0   0   0   0   6   5   3   3   0   0   0   0]
 [  0   4   0  44   0   0   0   0  12  17   6  14   0   0   1   0]
 [  0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0]
 [  0   0   1   0   0   0   0   0   3   3   1   1   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   1   0   1   0   0   0   0]
 [  0   1   1   2   0   0   0   0   1   2   3   3   0   0   0   1]
 [  0   3   0   4   0   0   0   0 148  50   7   4   0   0   0   0]
 [  0   1   0   0   0   0   0   0  24 264   3   7   0   1   0   0]
 [  0   2   0   3   0   0   0   0  21  22 100  22   0   0   1   0]
 [  0   1   0   3   0   0   0   0   7  27  10 151   0   0   0   0]
 [  0   1   0   1   0   0   0   0   4   8   1   2   4   1   0   0]
 [  0   0   0   1   0   0   0   0   8  20   2   1   0   6   0   0]
 [  0   2   0   0   0   0   0   0   4   9   5   6   0   0   3 

In [18]:
print(chosen_classifier, vectorizer_max_features, train_size)

print("Accuracy:", accuracy_score(y_test, y_pred2))
print("Precision:", precision_score(y_test, y_pred2, average='micro'))
print("Precision:", precision_score(y_test, y_pred2, average=None))
cr = classification_report(y_test, y_pred2)
cr.split('\n')

<class 'sklearn.ensemble._forest.RandomForestClassifier'> 1500 0.8
Accuracy: 0.6006144393241167
Precision: 0.6006144393241167
Precision: [0.         0.69642857 0.77777778 0.72131148 0.         0.
 0.         0.         0.54814815 0.54320988 0.64935065 0.65938865
 1.         0.75       0.6        0.8       ]


['              precision    recall  f1-score   support',
 '',
 '        ENFJ       0.00      0.00      0.00        27',
 '        ENFP       0.70      0.36      0.48       108',
 '        ENTJ       0.78      0.27      0.40        26',
 '        ENTP       0.72      0.45      0.55        98',
 '        ESFJ       0.00      0.00      0.00         1',
 '        ESFP       0.00      0.00      0.00         9',
 '        ESTJ       0.00      0.00      0.00         2',
 '        ESTP       0.00      0.00      0.00        14',
 '        INFJ       0.55      0.69      0.61       216',
 '        INFP       0.54      0.88      0.67       300',
 '        INTJ       0.65      0.58      0.62       171',
 '        INTP       0.66      0.76      0.71       199',
 '        ISFJ       1.00      0.18      0.31        22',
 '        ISFP       0.75      0.16      0.26        38',
 '        ISTJ       0.60      0.10      0.18        29',
 '        ISTP       0.80      0.38      0.52        42',
 '',
 '  

## Run model on verticalized holdbacks

In [19]:
holdback_df = pd.DataFrame(zip(y_holdback, X_holdback), columns=('type', 'posts'))

In [20]:
holdback_post_list = [re.split('\|\|\|+', post) for post in holdback_df['posts']]
holdback_post_df = pd.DataFrame(holdback_post_list)
holdback_post_df.insert(loc=0, column='type', value=holdback_df['type'])

In [22]:
# Splits posts of holdback set into single posts.
def compress_posts(df):
    result = []
    df_length = range(len(df))

    for i in df_length:
        for j in range(58):
            if df.iloc[i][j] != None:
                result.append([df['type'][i], df.iloc[i][j]])
    
    return pd.DataFrame(result, columns=('type', 'post'))

vertical_holdback = compress_posts(holdback_post_df)

In [23]:
# sanitize and vectorize
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(vertical_holdback)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(vertical_holdback['post'][sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)

In [24]:
vertical_X_holdback = vectorizer.transform(documents).toarray()
vertical_X_holdback = tfidfconverter.fit_transform(vertical_X_holdback).toarray()
pred_holdback = classifier.predict(vertical_X_holdback)

In [26]:
print(chosen_classifier, vectorizer_max_features, train_size)

print("Accuracy:", accuracy_score(vertical_holdback['type'], pred_holdback))
print("Precision:", precision_score(vertical_holdback['type'], pred_holdback, average='micro'))
print("Precision:", precision_score(vertical_holdback['type'], pred_holdback, average=None))
cr = classification_report(vertical_holdback['type'], pred_holdback)
cr.split('\n')

<class 'sklearn.ensemble._forest.RandomForestClassifier'> 1500 0.8
Accuracy: 0.21017306359588747
Precision: 0.21017306359588747
Precision: [0.30810811 0.10579173 0.23310345 0.42866324 0.         0.
 0.         0.1875     0.58827786 0.22969895 0.28028763 0.41295337
 0.24545455 0.34740883 0.27010622 0.07834788]


['              precision    recall  f1-score   support',
 '',
 '        ENFJ       0.31      0.07      0.11      2506',
 '        ENFP       0.11      0.37      0.16      8836',
 '        ENTJ       0.23      0.06      0.09      2915',
 '        ENTP       0.43      0.09      0.14      7753',
 '        ESFJ       0.00      0.00      0.00       525',
 '        ESFP       0.00      0.00      0.00       867',
 '        ESTJ       0.00      0.00      0.00       246',
 '        ESTP       0.19      0.03      0.05       964',
 '        INFJ       0.59      0.09      0.15     18727',
 '        INFP       0.23      0.58      0.33     20844',
 '        INTJ       0.28      0.13      0.18     13730',
 '        INTP       0.41      0.10      0.16     15965',
 '        ISFJ       0.25      0.07      0.11      1526',
 '        ISFP       0.35      0.05      0.09      3495',
 '        ISTJ       0.27      0.07      0.11      2667',
 '        ISTP       0.08      0.10      0.09      3771',
 '',
 '  