## Cross Validation

In [142]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import ast
import dataframe_image as dfi

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

w = pd.read_csv("./Data/vectorizedWords.csv").to_numpy()
df = pd.read_csv("./Data/2.5k_reviews.csv")

### Put vectorization into df as it was saved seperately

In [143]:
df["vectorization"] = [i for i in w]

### Data is unbalanced

In [144]:
num0s = df.sentiment.value_counts()[0] # Get number of 0s (negative reviews)
subset_index = df[df.sentiment == 1].sample(int(df.shape[0] - num0s*1.8)).index
subset = df.drop(subset_index).reset_index(drop=True)
print("Number of negative reviews: "+str(len(subset)-subset.sentiment.sum())+"; Number of positive reviews: "+str(subset.sentiment.sum()))

Number of negative reviews: 701; Number of positive reviews: 561


### Train

In [145]:
y = subset.sentiment.values
X = subset.vectorization.values

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state=1)

In [147]:
X_train = np.stack(X_train, axis=0) # Expand the array for model
X_test = np.stack(X_test, axis=0)

In [148]:
# Create the random grid
lr_random_grid = {'max_iter' : range(100, 500),
                  'warm_start' : [True, False],
                  'solver' : ['lbfgs', 'newton-cg', 'liblinear'],
                  'C' : np.arange(0, 1, 0.01)}
rf_random_grid = {'n_estimators': [int(x) for x in np.linspace(start = 20, stop = 1000, num = 10)],
                  'max_features': ['auto', 'sqrt'],
                  'max_depth': [5,8,10,15],
                  'min_samples_split': [2, 5, 10],
                  'min_samples_leaf': [1, 2, 4],
                  'bootstrap': [True, False]}
knn_random_grid = {'n_neighbors': [3,5,7,9],
                   'metric':['euclidean', 'manhattan', 'minkowski'],
                   'weights': ['uniform', 'distance'],
                   'algorithm': ['auto' , 'ball_tree' , 'kd_tree' ,'brute']}

In [149]:
## Logistic Regression --
pipeline_lr = LogisticRegression()
pipeline_lr = RandomizedSearchCV(estimator = pipeline_lr, param_distributions = lr_random_grid, 
                                 n_iter = 100, cv = 3, verbose=False, random_state=42, n_jobs = -1)
pipeline_lr = pipeline_lr.fit(X_train, y_train)
y_pred_train_lr = pipeline_lr.predict(X_train)
y_pred_test_lr = pipeline_lr.predict(X_test)
print("Done LR")

## Random Forest ---
pipeline_rf = RandomForestClassifier(criterion="entropy")
pipeline_rf = RandomizedSearchCV(estimator = pipeline_rf, param_distributions = rf_random_grid, 
                                 n_iter = 100, cv = 3, verbose=False, random_state=42, n_jobs = -1)
pipeline_rf = pipeline_rf.fit(X_train, y_train)
y_pred_train_rf = pipeline_rf.predict(X_train)
y_pred_test_rf = pipeline_rf.predict(X_test)
print("Done RF")

## KNN -- 
pipeline_knn = KNeighborsClassifier(n_neighbors=3, metric="euclidean")
pipeline_knn = RandomizedSearchCV(estimator = pipeline_knn, param_distributions = knn_random_grid, 
                                 n_iter = 100, cv = 3, verbose=False, random_state=42, n_jobs = -1)
pipeline_knn = pipeline_knn.fit(X_train, y_train)
y_pred_train_knn = pipeline_knn.predict(X_train)
y_pred_test_knn = pipeline_knn.predict(X_test)
print("Done KNN")
# 5 mins

Done LR
Done RF
Done KNN


In [150]:
pipeline_rf.best_params_

{'n_estimators': 891,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 5,
 'bootstrap': True}

In [151]:
# Classification Report for Recall & Precision
lr_report_train = classification_report(y_train,y_pred_train_lr, output_dict=True)
lr_report_test = classification_report(y_test,y_pred_test_lr, output_dict=True)
rf_report_train = classification_report(y_train,y_pred_train_rf, output_dict=True)
rf_report_test = classification_report(y_test,y_pred_test_rf, output_dict=True)
knn_report_train = classification_report(y_train,y_pred_train_knn, output_dict=True)
knn_report_test = classification_report(y_test,y_pred_test_knn, output_dict=True)

lr_train_stuff = [lr_report_train["0"]["precision"],lr_report_train["0"]["recall"],
                  lr_report_train["1"]["precision"],lr_report_train["1"]["recall"]]
lr_test_stuff = [lr_report_test["0"]["precision"],lr_report_test["0"]["recall"],
                 lr_report_test["1"]["precision"],lr_report_test["1"]["recall"]]
rf_train_stuff = [rf_report_train["0"]["precision"],rf_report_train["0"]["recall"],
                  rf_report_train["1"]["precision"],rf_report_train["1"]["recall"]]
rf_test_stuff = [rf_report_test["0"]["precision"],rf_report_test["0"]["recall"],
                 rf_report_test["1"]["precision"],rf_report_test["1"]["recall"]]
knn_train_stuff = [knn_report_train["0"]["precision"],knn_report_train["0"]["recall"],
                   knn_report_train["1"]["precision"],knn_report_train["1"]["recall"]]
knn_test_stuff = [knn_report_test["0"]["precision"],knn_report_test["0"]["recall"],
                  knn_report_test["1"]["precision"],knn_report_test["1"]["recall"]]

index = ["LR_train","LR_test","RF_train","RF_test","KNN_train","KNN_test"]
columns =  ["Precision - 0","Recall - 0", "Precision - 1","Recall - 1"]
result = pd.DataFrame([lr_train_stuff,lr_test_stuff,rf_train_stuff,rf_test_stuff,knn_train_stuff,knn_test_stuff], index, columns)

In [152]:
result
# 1 for positive, 0 for negative, negative is 3 stars or less

Unnamed: 0,Precision - 0,Recall - 0,Precision - 1,Recall - 1
LR_train,0.734761,0.789381,0.70398,0.637387
LR_test,0.693333,0.764706,0.68932,0.606838
RF_train,0.83045,0.849558,0.802784,0.779279
RF_test,0.705882,0.705882,0.65812,0.65812
KNN_train,0.750422,0.787611,0.711538,0.666667
KNN_test,0.675862,0.720588,0.648148,0.598291


In [153]:
dfi.export(result.style,'./Results/crossValidationModels.png')