## In this file, I create holdback set with single posts.
## Resources
[NLTK][https://stackabuse.com/text-classification-with-python-and-scikit-learn/]

In [1]:
import nltk
import pickle
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# nltk.download('stopwords')
# nltk.download('wordnet')

In [2]:
%config InlineBackend.figure_formats = ['svg']
%matplotlib inline

plt.rcParams['figure.figsize'] = (9, 6)
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

In [3]:
train_size = 0.8
vectorizer_max_features = 1500
chosen_classifier = MultinomialNB

In [4]:
from functions import load_data_set
myers_briggs = load_data_set()

In [5]:
mb_df = pd.DataFrame(myers_briggs, columns=['type', 'posts'])
types = sorted(mb_df['type'].unique())

post_list = [re.split('\|\|\|+', post) for post in mb_df['posts']]
post_df = pd.DataFrame(post_list)
post_df.insert(loc=0, column='type', value=mb_df['type'])

posts_by_type = {typ: mb_df[mb_df['type'] == typ] for typ in types}

In [6]:
vertical_post_df = pd.read_csv('vertical_posts.csv', index_col=0)

## Split personalities into components.

In [7]:
mb_df['EI'] = mb_df['type'].apply(lambda x: x[0])
mb_df['NS'] = mb_df['type'].apply(lambda x: x[1])
mb_df['FT'] = mb_df['type'].apply(lambda x: x[2])
mb_df['JP'] = mb_df['type'].apply(lambda x: x[3])

EI_df = mb_df[['EI', 'posts']]
NS_df = mb_df[['NS', 'posts']]
FT_df = mb_df[['FT', 'posts']]
JP_df = mb_df[['JP', 'posts']]

In [9]:
X, y = NS_df['posts'], NS_df['NS']
# X, y = vertical_post_df['posts'], vertical_post_df['type']

## Might want to remove URLs

In [10]:
X_train_val, X_holdback, y_train_val, y_holdback = train_test_split(X, y)

In [11]:
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X_train_val)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X_train_val.iloc[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)

In [12]:
vectorizer = CountVectorizer(max_features=vectorizer_max_features, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(X_train_val).toarray()

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y_train_val, train_size=train_size, random_state=0)

In [15]:
# classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier = chosen_classifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[1137    0]
 [ 165    0]]
              precision    recall  f1-score   support

           N       0.87      1.00      0.93      1137
           S       0.00      0.00      0.00       165

    accuracy                           0.87      1302
   macro avg       0.44      0.50      0.47      1302
weighted avg       0.76      0.87      0.81      1302

0.8732718894009217


In [17]:
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [18]:
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [19]:
y_pred2 = model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2)) 

[[1137    0]
 [ 165    0]]
              precision    recall  f1-score   support

           N       0.87      1.00      0.93      1137
           S       0.00      0.00      0.00       165

    accuracy                           0.87      1302
   macro avg       0.44      0.50      0.47      1302
weighted avg       0.76      0.87      0.81      1302

0.8732718894009217


In [20]:
print(chosen_classifier, vectorizer_max_features, train_size)

print("Accuracy:", accuracy_score(y_test, y_pred2))
print("Precision:", precision_score(y_test, y_pred2, average='micro'))
print("Precision:", precision_score(y_test, y_pred2, average=None))
cr = classification_report(y_test, y_pred2)
cr.split('\n')

<class 'sklearn.naive_bayes.MultinomialNB'> 1500 0.8
Accuracy: 0.8732718894009217
Precision: 0.8732718894009217
Precision: [0.87327189 0.        ]


['              precision    recall  f1-score   support',
 '',
 '           N       0.87      1.00      0.93      1137',
 '           S       0.00      0.00      0.00       165',
 '',
 '    accuracy                           0.87      1302',
 '   macro avg       0.44      0.50      0.47      1302',
 'weighted avg       0.76      0.87      0.81      1302',
 '']

## Run model on verticalized holdbacks

In [21]:
holdback_df = pd.DataFrame(zip(y_holdback, X_holdback), columns=('type', 'posts'))

In [22]:
holdback_post_list = [re.split('\|\|\|+', post) for post in holdback_df['posts']]
holdback_post_df = pd.DataFrame(holdback_post_list)
holdback_post_df.insert(loc=0, column='type', value=holdback_df['type'])

In [23]:
# Splits posts of holdback set into single posts.
def compress_posts(df):
    result = []
    df_length = range(len(df))

    for i in df_length:
        for j in range(58):
            if df.iloc[i][j] != None:
                result.append([df['type'][i], df.iloc[i][j]])
    
    return pd.DataFrame(result, columns=('type', 'post'))

vertical_holdback = compress_posts(holdback_post_df)

In [24]:
# sanitize and vectorize
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(vertical_holdback)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(vertical_holdback['post'][sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)

In [25]:
vertical_X_holdback = vectorizer.transform(documents).toarray()
vertical_X_holdback = tfidfconverter.fit_transform(vertical_X_holdback).toarray()
pred_holdback = classifier.predict(vertical_X_holdback)

In [26]:
print(chosen_classifier, vectorizer_max_features, train_size)

print("Accuracy:", accuracy_score(vertical_holdback['type'], pred_holdback))
print("Precision:", precision_score(vertical_holdback['type'], pred_holdback, average='micro'))
print("Precision:", precision_score(vertical_holdback['type'], pred_holdback, average=None))
cr = classification_report(vertical_holdback['type'], pred_holdback)
cr.split('\n')

<class 'sklearn.naive_bayes.MultinomialNB'> 1500 0.8
Accuracy: 0.8673880412684539
Precision: 0.8673880412684539
Precision: [0.86746221 0.        ]


['              precision    recall  f1-score   support',
 '',
 '           N       0.87      1.00      0.93     91312',
 '           S       0.00      0.00      0.00     13950',
 '',
 '    accuracy                           0.87    105262',
 '   macro avg       0.43      0.50      0.46    105262',
 'weighted avg       0.75      0.87      0.81    105262',
 '']