# This file runs and tests the Random Forest pkl model

In [79]:
import pandas as pd
import numpy as np
import joblib
import gensim
from nltk.tokenize import word_tokenize
import string
from sklearn.metrics import classification_report
import dataframe_image as dfi

from IPython.core.interactiveshell import InteractiveShell 
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
import warnings
# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Load the persisted artifacts used throughout this notebook.
# NOTE: joblib.load unpickles arbitrary Python objects; only load files you trust.
mod = joblib.load('../Data/best_model.pkl')  # final classifier (Random Forest, per the file header)
w2v_mod = joblib.load('../Data/w2v_model.pkl')  # embedding model, accessed through its `.wv` vectors
data = pd.read_csv('../Data/2.5k_reviews.csv')  # reviews; the `review` and `sentiment` columns are used below
# Stop words are only ever used for membership tests (`token not in stopwords`),
# so keep them in a set for O(1) lookups instead of scanning a list per token.
stopwords = set(pd.read_csv("../Data/stop_words.csv")["words"])

## 1.) Get initial impression with one sample

In [48]:
# Draw one random review (no random_state is passed, so the row differs between runs)
test_case = data.sample(1)

# Repeat the preprocessing from 02_featureEngineering.ipynb, because the vector-space features must be rebuilt for every new sample
def preprocess(text):
    """Normalise a raw review into a space-separated string of content words.

    The text is lowercased, every character listed in ``string.punctuation``
    is deleted, the remainder is tokenized with ``word_tokenize`` and tokens
    found in the module-level ``stopwords`` collection are dropped.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    # One-pass punctuation removal via a deletion table
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    kept_tokens = [token for token in word_tokenize(without_punct) if token not in stopwords]
    return ' '.join(kept_tokens)

# Clean the sampled review text with preprocess() before it is vectorized
X = test_case.review.apply(preprocess) # Apply for all reviews

def vectorize(sentence):
    """Embed a preprocessed sentence as the mean of its word vectors.

    Words missing from the vocabulary of the module-level ``w2v_mod`` are
    skipped. If no word of the sentence is known, a zero vector is returned.

    Args:
        sentence: Whitespace-separated, already preprocessed text.

    Returns:
        A 1-D NumPy array whose length equals the embedding dimensionality.
    """
    keyed_vectors = w2v_mod.wv
    words_vecs = [keyed_vectors[word] for word in sentence.split() if word in keyed_vectors]
    if not words_vecs:
        # Size the fallback from the embedding itself rather than a hard-coded
        # 100, so it always matches the shape of the averaged vectors.
        return np.zeros(keyed_vectors.vector_size)
    return np.array(words_vecs).mean(axis=0)  # Mean over words, per dimension
w = np.array([vectorize(sentence) for sentence in X]) # Vectorize each review

new_X = w
new_y = test_case.sentiment.values

y_new_pred_rf = mod.predict(new_X) # Ready to predict on sample test case

# Exactly one review was sampled, so compare the scalar label values.
# The previous `y_new_pred_rf is 1` tested object identity between an array and
# an int literal, which is always False, so the message always said "negative".
predicted_label = "positive" if y_new_pred_rf[0] == 1 else "negative"
if y_new_pred_rf[0] == new_y[0]:
    print("Classified "+predicted_label+" case correctly!\n")
else:
    print("Fail: classified "+predicted_label+" case wrong\n")
print(test_case.review.values[0])

Fail: classified negative case wrong

Sorry to say that I cannot give this place more stars, but I have dined there many times and typically have been quite pleased. This last time was definitely a disappointment, as my chicken with pesto sauce was almost tasteless and the chicken had apparently no seasoning whatsoever; I've had better packaged frozen food!Perhaps the chef(?) was preoccupied with the large(20+) party in the adjoining room.....a poor excuse at best.....


In [49]:
# Show the raw prediction next to the dataset label for the sampled review
print("Predicted: ", y_new_pred_rf, sep="") # 0 = negative
print("Actual: ", new_y, sep="") # 1 = positive (label looks questionable for the recorded sample)

Predicted: [0]
Actual: [1]


## 2.) Get average values for final RF model

In [75]:
# Testing model on random sample from dataset

def predictNonLinear(model, N=100):
    """Score ``model`` on ``N`` randomly sampled reviews from ``data``.

    The sampled reviews are cleaned and embedded with the module-level
    ``preprocess`` and ``vectorize`` helpers (the same steps used for the
    single-sample check), then classified by ``model``.

    Args:
        model: Fitted classifier exposing ``predict``.
        N: Number of reviews to sample from ``data``.

    Returns:
        Tuple ``(precision_0, recall_0, precision_1, recall_1)`` read from
        scikit-learn's classification report.

    Note:
        The report is indexed by the keys ``"0"`` and ``"1"``; presumably a
        ``KeyError`` is raised if a class appears in neither the sampled labels
        nor the predictions (only likely for very small ``N``) - TODO confirm.
    """
    test_case = data.sample(N)

    # Reuse the shared helpers instead of re-defining identical nested copies
    X = test_case.review.apply(preprocess)
    new_X = np.array([vectorize(sentence) for sentence in X])
    new_y = test_case.sentiment.values

    # Use the model passed by the caller; the argument used to be ignored in
    # favour of the global `mod`.
    y_new_pred_rf = model.predict(new_X)
    y_pred_report = classification_report(new_y, y_new_pred_rf, output_dict=True) # Output report as dictionary

    return (y_pred_report["0"]["precision"], y_pred_report["0"]["recall"],
            y_pred_report["1"]["precision"], y_pred_report["1"]["recall"])

In [76]:
# Get average values
# Repeat the sampled evaluation `nums` times and average the four scores
nums = 20
totals = [0, 0, 0, 0]  # running sums: precision 0, recall 0, precision 1, recall 1
for i in range(nums):
    run_scores = predictNonLinear(mod, 150)
    totals = [subtotal + score for subtotal, score in zip(totals, run_scores)]

avg_p0, avg_r0, avg_p1, avg_r1 = [subtotal / nums for subtotal in totals]
print("Average Precision (0): ", avg_p0, "; Average Recall (0): ", avg_r0, sep="")
print("Average Precision (1): ", avg_p1, "; Average Recall (1): ", avg_r1, sep="")

Average Precision (0): 0.600794665982024; Average Recall (0): 0.9828618861266122
Average Precision (1): 0.9905027534696815; Average Recall (1): 0.7423732223061903


In [80]:
# Save table of results
# One-row table of the averaged scores, transposed so each metric gets its own row in the image
index = ["RF - Final Model"]
columns = ["Precision - 0", "Recall - 0", "Precision - 1", "Recall - 1"]
result = pd.DataFrame(
    data=[[avg_p0, avg_r0, avg_p1, avg_r1]],
    index=index,
    columns=columns,
)
styled_scores = result.T.style
dfi.export(styled_scores, '../Results/05_finalAverageScores.png')