## In this file, I create a holdback set with single posts.
## Resources
[NLTK](https://stackabuse.com/text-classification-with-python-and-scikit-learn/)

In [1]:
import nltk
import pickle
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# nltk.download('stopwords')
# nltk.download('wordnet')

In [2]:
# Render inline figures as SVG (vector output) and show them directly below the cell.
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

# Default figure size and seaborn theme applied to every plot drawn after this cell.
plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [3]:
# Experiment knobs; they are printed again next to the scores in the final summary cell.
# Passed as `train_size` to the inner train_test_split: the fraction of train/validation
# rows used for fitting, the remaining rows become the test split.
train_size = 0.01
# Passed as `max_features` to CountVectorizer (upper bound on vocabulary size).
vectorizer_max_features = 1500
# Estimator class, not an instance; it is instantiated in the training cell.
chosen_classifier = RandomForestClassifier

In [4]:
# `load_data_set` lives in the project-local `functions` module. Its return value is wrapped
# into a DataFrame with 'type' and 'posts' columns by the following cell.
from functions import load_data_set
myers_briggs = load_data_set()

In [5]:
# Two-column frame: the 'type' label and the raw 'posts' text of each row.
mb_df = pd.DataFrame(myers_briggs, columns=['type', 'posts'])
types = sorted(mb_df['type'].unique())

# Split each 'posts' string on runs of three or more '|' characters. The pattern is a raw
# string so that `\|` reaches the regex engine verbatim; in a plain string it is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning from Python 3.12 on).
post_list = [re.split(r'\|\|\|+', post) for post in mb_df['posts']]

# One row per original row, one column per split piece, with the label as first column.
post_df = pd.DataFrame(post_list)
post_df.insert(loc=0, column='type', value=mb_df['type'])

# Rows of mb_df grouped by their label.
posts_by_type = {typ: mb_df[mb_df['type'] == typ] for typ in types}

In [6]:
# Load 'vertical_posts.csv', using its first column as the index. In the cells visible here
# this frame is referenced only by the commented-out alternative X/y selection.
vertical_post_df = pd.read_csv('vertical_posts.csv', index_col=0)

In [7]:
# Features are the raw 'posts' text, targets are the 'type' labels.
X, y = mb_df['posts'], mb_df['type']
# Alternative input: the same two columns taken from the frame read from vertical_posts.csv.
# X, y = vertical_post_df['posts'], vertical_post_df['type']

## Might want to remove URLs

In [8]:
from nltk.stem import WordNetLemmatizer

# NOTE: despite its name this object is a lemmatizer, not a stemmer; the name is kept
# unchanged in case other cells refer to it.
stemmer = WordNetLemmatizer()


def clean_document(text, lemmatizer):
    """Normalize one raw text into a lower-cased, lemmatized, space-separated string.

    Parameters
    ----------
    text : object
        Raw text; it is passed through ``str()`` before cleaning.
    lemmatizer : object
        Any object exposing ``lemmatize(word)`` that returns a string.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Replace every non-word character (punctuation, symbols) with a space.
    cleaned = re.sub(r'\W', ' ', str(text))

    # Drop single letters that stand alone between whitespace.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Drop a single letter at the very start of the text. The caret must be an unescaped
    # anchor: the previous `\^` looked for a literal '^', which the `\W` substitution above
    # has already removed, so the step never matched. This rule also covers a leading
    # stand-alone 'b', which makes the former dedicated `^b\s+` step unnecessary.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse runs of whitespace into one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Lower-case, then lemmatize token by token; split() also discards edge whitespace.
    tokens = cleaned.lower().split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Iterate over the values themselves rather than `X[0] .. X[len(X) - 1]`, which is a
# label-based lookup on a pandas Series and fails whenever the index is not 0..n-1.
documents = [clean_document(post, stemmer) for post in X]

In [10]:
# Carve off the holdback set before any fitting happens. The fixed seed keeps the holdback
# rows identical across kernel restarts; without it every run would hold back different
# rows, so previously "unseen" rows could end up in a later run's training data.
X_train_val, X_holdback, y_train_val, y_holdback = train_test_split(
    documents, y, random_state=0
)

In [11]:
# Bag-of-words counts: English stop words are removed, terms found in fewer than 5 documents
# or in more than 70% of them are ignored, and the vocabulary is capped at
# `vectorizer_max_features` terms.
english_stop_words = stopwords.words('english')
vectorizer = CountVectorizer(
    stop_words=english_stop_words,
    max_df=0.7,
    min_df=5,
    max_features=vectorizer_max_features,
)

# Learn the vocabulary from the train/validation documents and return their counts as a
# dense array.
X = vectorizer.fit_transform(X_train_val).toarray()

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidfconverter = TfidfTransformer()

# Re-derive the term counts from the already fitted vectorizer instead of reading `X` back
# in. The former `X = f(X)` form applied TF-IDF on top of TF-IDF whenever this cell was
# executed a second time; this version produces the same matrix however often it is re-run.
term_counts = vectorizer.transform(X_train_val)
X = tfidfconverter.fit_transform(term_counts).toarray()

In [13]:
# Inner split of the train/validation rows: `train_size` is the fraction used for fitting,
# every remaining row is scored as the test split. Seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_train_val,
    train_size=train_size,
    random_state=0,
)

In [14]:
# Build the chosen estimator with its defaults, then seed it if it exposes a `random_state`
# parameter so that a fresh kernel reproduces the same model. Estimators without that
# parameter are left exactly as constructed.
classifier = chosen_classifier()
if 'random_state' in classifier.get_params():
    classifier.set_params(random_state=0)

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the test split: confusion matrix, per-class report, then overall accuracy,
# printed in that order.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred))

[[  0   4   0   0   0   0   0   0  48  64   0  23   0   0   0   0]
 [  0 127   0   0   0   0   0   0  99 240   0  42   0   0   0   0]
 [  0  11   0   0   0   0   0   0  51  88   0  26   0   0   0   0]
 [  0  43   0   0   0   0   0   0 138 261   0  71   0   0   1   0]
 [  0   4   0   0   0   0   0   0   8  12   0   9   0   0   0   0]
 [  0   4   0   0   0   0   0   0   3  19   0   9   0   0   0   0]
 [  0   2   0   0   0   0   0   0   4  21   0   6   0   0   0   0]
 [  0   3   0   0   0   0   0   0  16  36   0   9   0   0   0   0]
 [  0  27   0   0   0   0   0   0 531 411   0 123   0   0   0   0]
 [  0  44   0   0   0   0   0   0 154 981   0 142   0   0   1   0]
 [  0  44   0   0   0   0   0   0 229 374   0 151   0   0   2   0]
 [  0  26   0   0   0   0   0   0 203 496   0 267   0   0   1   0]
 [  0   7   0   0   0   0   0   0  26  64   0  25   0   0   2   0]
 [  0  12   0   0   0   0   0   0  34 117   0  41   0   0   0   0]
 [  0   7   0   0   0   0   0   0  30  79   0  29   0   0   3 

In [16]:
# Persist the fitted classifier to the file 'text_classifier' in the working directory.
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [17]:
# Read the classifier back from the 'text_classifier' file.
# NOTE(review): pickle.load can execute arbitrary code from the file; only unpickle files
# that this notebook itself produced.
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [18]:
# Predict with the unpickled model and print the same three metrics as for the in-memory
# classifier, so the two printouts can be compared side by side.
y_pred2 = model.predict(X_test)

for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, y_pred2))

[[  0   4   0   0   0   0   0   0  48  64   0  23   0   0   0   0]
 [  0 127   0   0   0   0   0   0  99 240   0  42   0   0   0   0]
 [  0  11   0   0   0   0   0   0  51  88   0  26   0   0   0   0]
 [  0  43   0   0   0   0   0   0 138 261   0  71   0   0   1   0]
 [  0   4   0   0   0   0   0   0   8  12   0   9   0   0   0   0]
 [  0   4   0   0   0   0   0   0   3  19   0   9   0   0   0   0]
 [  0   2   0   0   0   0   0   0   4  21   0   6   0   0   0   0]
 [  0   3   0   0   0   0   0   0  16  36   0   9   0   0   0   0]
 [  0  27   0   0   0   0   0   0 531 411   0 123   0   0   0   0]
 [  0  44   0   0   0   0   0   0 154 981   0 142   0   0   1   0]
 [  0  44   0   0   0   0   0   0 229 374   0 151   0   0   2   0]
 [  0  26   0   0   0   0   0   0 203 496   0 267   0   0   1   0]
 [  0   7   0   0   0   0   0   0  26  64   0  25   0   0   2   0]
 [  0  12   0   0   0   0   0   0  34 117   0  41   0   0   0   0]
 [  0   7   0   0   0   0   0   0  30  79   0  29   0   0   3 

In [19]:
# Echo the experiment settings next to the scores they produced.
print(chosen_classifier, vectorizer_max_features, train_size)

print("Accuracy:", accuracy_score(y_test, y_pred2))
# The two precision figures are different statistics, so each gets its own label.
# zero_division=0 reports 0 for classes that were never predicted (the value already shown
# for them) without raising an UndefinedMetricWarning for each one.
print("Precision (micro):", precision_score(y_test, y_pred2, average='micro', zero_division=0))
print("Precision (per class):", precision_score(y_test, y_pred2, average=None, zero_division=0))
cr = classification_report(y_test, y_pred2, zero_division=0)
# Last expression of the cell: the report is displayed as a list of its lines.
cr.split('\n')

<class 'sklearn.ensemble._forest.RandomForestClassifier'> 1500 0.01
Accuracy: 0.2963825492935879
Precision: 0.2963825492935879
Precision: [0.         0.33687003 0.         0.         0.         0.
 0.         0.         0.3271719  0.28667446 0.         0.26461843
 0.         0.         0.3        0.        ]


['              precision    recall  f1-score   support',
 '',
 '        ENFJ       0.00      0.00      0.00       139',
 '        ENFP       0.34      0.25      0.29       508',
 '        ENTJ       0.00      0.00      0.00       176',
 '        ENTP       0.00      0.00      0.00       514',
 '        ESFJ       0.00      0.00      0.00        33',
 '        ESFP       0.00      0.00      0.00        35',
 '        ESTJ       0.00      0.00      0.00        33',
 '        ESTP       0.00      0.00      0.00        64',
 '        INFJ       0.33      0.49      0.39      1092',
 '        INFP       0.29      0.74      0.41      1322',
 '        INTJ       0.00      0.00      0.00       800',
 '        INTP       0.26      0.27      0.27       993',
 '        ISFJ       0.00      0.00      0.00       124',
 '        ISFP       0.00      0.00      0.00       204',
 '        ISTJ       0.30      0.02      0.04       148',
 '        ISTP       0.00      0.00      0.00       256',
 '',
 '  