## In this file, I create holdback set with single posts.
## Resources
[NLTK][https://stackabuse.com/text-classification-with-python-and-scikit-learn/]

In [1]:
import nltk
import pickle
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# nltk.download('stopwords')
# nltk.download('wordnet')

In [2]:
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [3]:
train_size = 0.8
vectorizer_max_features = 1500
chosen_classifier = RandomForestClassifier

In [4]:
from functions import load_data_set
myers_briggs = load_data_set()

In [5]:
mb_df = pd.DataFrame(myers_briggs, columns=['type', 'posts'])
types = sorted(mb_df['type'].unique())

post_list = [re.split('\|\|\|+', post) for post in mb_df['posts']]
post_df = pd.DataFrame(post_list)
post_df.insert(loc=0, column='type', value=mb_df['type'])

posts_by_type = {typ: mb_df[mb_df['type'] == typ] for typ in types}

In [6]:
vertical_post_df = pd.read_csv('vertical_posts.csv', index_col=0)

In [7]:
X, y = mb_df['posts'], mb_df['type']
# X, y = vertical_post_df['posts'], vertical_post_df['type']

## Might want to remove URLs

In [8]:
X_train_val, X_holdback, y_train_val, y_holdback = train_test_split(X, y)

In [9]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X_train_val)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X_train_val.iloc[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)

In [10]:
vectorizer = CountVectorizer(max_features=vectorizer_max_features, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(X_train_val).toarray()

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y_train_val, train_size=train_size, random_state=0)

In [13]:
# classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier = chosen_classifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [14]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[  0   0   0   1   0   0   0   0   9   8   0   2   0   0   0   0]
 [  0  48   0   4   0   0   0   0  11  24   6   5   0   0   0   1]
 [  1   1   3   1   0   0   0   0   4   6  10   8   0   0   0   0]
 [  0   3   0  38   0   0   0   0  15   8   5  19   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   1   0   2   0   0   0   0]
 [  0   0   0   0   0   0   0   0   3   2   0   3   0   0   0   0]
 [  0   1   0   1   0   0   0   0   1   2   2   0   0   0   0   0]
 [  0   1   0   2   0   0   0   0   4   7   3   2   0   0   0   0]
 [  0   5   0   3   0   0   0   0 165  29  10   8   0   0   0   0]
 [  0   0   0   3   0   0   0   0  22 257   2  12   0   0   0   1]
 [  0   4   0   1   0   0   0   0  16  24  90  20   0   0   0   0]
 [  0   1   0   6   0   0   0   0  19  26   8 153   0   0   0   0]
 [  0   0   0   1   0   0   0   0   3  11   1   3   7   0   1   0]
 [  0   3   0   0   0   0   0   0   3  16   1   2   0   2   0   0]
 [  0   0   0   0   0   0   0   0   3  14   6   6   0   0   1 

In [15]:
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [16]:
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [17]:
y_pred2 = model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2)) 

[[  0   0   0   1   0   0   0   0   9   8   0   2   0   0   0   0]
 [  0  48   0   4   0   0   0   0  11  24   6   5   0   0   0   1]
 [  1   1   3   1   0   0   0   0   4   6  10   8   0   0   0   0]
 [  0   3   0  38   0   0   0   0  15   8   5  19   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   1   0   2   0   0   0   0]
 [  0   0   0   0   0   0   0   0   3   2   0   3   0   0   0   0]
 [  0   1   0   1   0   0   0   0   1   2   2   0   0   0   0   0]
 [  0   1   0   2   0   0   0   0   4   7   3   2   0   0   0   0]
 [  0   5   0   3   0   0   0   0 165  29  10   8   0   0   0   0]
 [  0   0   0   3   0   0   0   0  22 257   2  12   0   0   0   1]
 [  0   4   0   1   0   0   0   0  16  24  90  20   0   0   0   0]
 [  0   1   0   6   0   0   0   0  19  26   8 153   0   0   0   0]
 [  0   0   0   1   0   0   0   0   3  11   1   3   7   0   1   0]
 [  0   3   0   0   0   0   0   0   3  16   1   2   0   2   0   0]
 [  0   0   0   0   0   0   0   0   3  14   6   6   0   0   1 

In [18]:
print(chosen_classifier, vectorizer_max_features, train_size)

print("Accuracy:", accuracy_score(y_test, y_pred2))
print("Precision:", precision_score(y_test, y_pred2, average='micro'))
print("Precision:", precision_score(y_test, y_pred2, average=None))
cr = classification_report(y_test, y_pred2)
cr.split('\n')

<class 'sklearn.ensemble._forest.RandomForestClassifier'> 1500 0.8
Accuracy: 0.5967741935483871
Precision: 0.5967741935483871
Precision: [0.         0.70588235 1.         0.59375    0.         0.
 0.         0.         0.57291667 0.57238307 0.61643836 0.59533074
 1.         1.         0.5        0.86666667]


['              precision    recall  f1-score   support',
 '',
 '        ENFJ       0.00      0.00      0.00        20',
 '        ENFP       0.71      0.48      0.57        99',
 '        ENTJ       1.00      0.09      0.16        34',
 '        ENTP       0.59      0.43      0.50        88',
 '        ESFJ       0.00      0.00      0.00         4',
 '        ESFP       0.00      0.00      0.00         8',
 '        ESTJ       0.00      0.00      0.00         7',
 '        ESTP       0.00      0.00      0.00        19',
 '        INFJ       0.57      0.75      0.65       220',
 '        INFP       0.57      0.87      0.69       297',
 '        INTJ       0.62      0.58      0.60       155',
 '        INTP       0.60      0.72      0.65       213',
 '        ISFJ       1.00      0.26      0.41        27',
 '        ISFP       1.00      0.07      0.14        27',
 '        ISTJ       0.50      0.03      0.06        30',
 '        ISTP       0.87      0.24      0.38        54',
 '',
 '  

In [42]:
holdback_df = pd.DataFrame(zip(y_holdback, X_holdback), columns=('type', 'posts'))

In [39]:
holdback_post_list = [re.split('\|\|\|+', post) for post in holdback_df['posts']]
holdback_post_df = pd.DataFrame(holdback_post_list)
holdback_post_df.insert(loc=0, column='type', value=holdback_df['type'])

# holdback_posts_by_type = {typ: holdback_df[holdback_df['type'] == typ] for typ in types}
# holdback_posts_by_type_df = pd.DataFrame([holdback_posts_by_type.keys(), holdback_posts_by_type.values()])

In [45]:
# Convert post_df to a two-column data set.
def compress_posts(df):
    result = []
    df_length = range(len(df))

    for i in df_length:
        for j in range(59):
            if df.iloc[i][j] != None:
                result.append([df['type'][i], df.iloc[i][j]])
    
    return pd.DataFrame(result, columns=('type', 'post'))

vertical_holdback = compress_posts(holdback_post_df)
