In [46]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import gensim
import nltk
from nltk.tokenize import word_tokenize
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
import string
import joblib

from IPython.core.interactiveshell import InteractiveShell 
# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and syntax warnings that can point at real problems.
warnings.filterwarnings('ignore')


# Load the persisted classifier, the word2vec model and the review data.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files
# that come from a trusted source.
mod = joblib.load('./Data/best_model.pkl')
w2v_mod = joblib.load('./Data/w2v_model.pkl')
data = pd.read_csv('./Data/2.5k_reviews.csv')

# The stop words are taken from the column labels pandas parses out of the
# CSV; each label has its double quotes and surrounding whitespace removed.
stopwords = pd.read_csv("./Data/stop_words.csv")
stopwords = list(map(lambda label: label.replace('"', "").strip(), stopwords.columns))

## Get an initial impression: classify one randomly sampled review

In [7]:
# Draw a single review at random to spot-check the saved classifier.
test_case = data.sample(n=1)

# feature engineering
def preprocess(text):
    """Normalise one review into a space-separated string of kept tokens.

    The text is lower-cased, every character listed in
    ``string.punctuation`` is deleted, the remainder is split with
    ``word_tokenize`` and tokens found in the module-level ``stopwords``
    collection are dropped.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_remover)
    kept_tokens = []
    for token in word_tokenize(cleaned):
        if token not in stopwords:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Clean the sampled review and keep its tokenised form for inspection.
X = test_case.review.apply(preprocess)
sentences = [sentence.split() for sentence in X]

# Embed with the pre-trained word2vec model loaded from ./Data/w2v_model.pkl.
# Constructing a new gensim Word2Vec without a corpus leaves it with no
# vocabulary, so every token lookup misses and each review collapses to the
# all-zero vector, which makes the classifier's prediction meaningless.
w2v_model = w2v_mod
def vectorize(sentence):
    """Return the mean word2vec embedding of a pre-processed sentence.

    Parameters
    ----------
    sentence : str
        Space-separated tokens, as produced by ``preprocess``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens known to the
        module-level ``w2v_model``. When no token is known, an all-zero
        vector with the model's dimensionality is returned.
    """
    keyed_vectors = w2v_model.wv
    known_vecs = [keyed_vectors[word] for word in sentence.split() if word in keyed_vectors]
    if not known_vecs:
        # Size the fallback from the model instead of a hard-coded 100 so the
        # result always has the same width as real embeddings.
        return np.zeros(keyed_vectors.vector_size)
    return np.array(known_vecs).mean(axis=0)
# Embed the cleaned review and compare the classifier's prediction with the
# label stored in the dataset.
w = np.array([vectorize(sentence) for sentence in X])

new_X = w
new_y = test_case.sentiment.values

y_new_pred_rf = mod.predict(new_X)

# Compare the scalar prediction by value. `is 1` tests object identity, which
# is never true for the array returned by predict, so the message used to say
# "negative" regardless of the predicted class.
predicted_label = "positive" if y_new_pred_rf[0] == 1 else "negative"
if y_new_pred_rf[0] == new_y[0]:
    print("Classified "+predicted_label+" case correctly!\n")
else:
    print("Fail: classified "+predicted_label+" case wrong\n")
print(test_case.review.values[0])

Fail: classified negative case wrong

Sorry to say that I cannot give this place more stars, but I have dined there many times and typically have been quite pleased. This last time was definitely a disappointment, as my chicken with pesto sauce was almost tasteless and the chicken had apparently no seasoning whatsoever; I've had better packaged frozen food!Perhaps the chef(?) was preoccupied with the large(20+) party in the adjoining room.....a poor excuse at best.....


In [14]:
# First line: predicted label; second line: true label from the dataset.
# Original note: the prediction came out negative although the label says
# positive (??).
print(y_new_pred_rf, new_y, sep="\n")

[0]
[1]


## Average precision and recall over repeated random samples

In [55]:
# Testing model on random sample from dataframe

def predictNonLinear(model, N=100):
    """Score ``model`` on ``N`` reviews sampled at random from ``data``.

    Each sampled review is cleaned, embedded as the mean of its word2vec
    vectors (taken from the module-level ``w2v_mod``) and classified by
    ``model``. Per-class precision and recall are read from scikit-learn's
    classification report.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict``.
    N : int, default 100
        Number of rows drawn with ``data.sample``.

    Returns
    -------
    tuple of float
        ``(precision_0, recall_0, precision_1, recall_1)``.
    """
    test_case = data.sample(N)
    keyed_vectors = w2v_mod.wv

    # feature engineering
    def preprocess(text):
        """Lower-case, strip punctuation, tokenise and drop stop words."""
        text = text.lower()
        text = ''.join([char for char in text if char not in string.punctuation])
        tokens = [token for token in word_tokenize(text) if token not in stopwords]
        return ' '.join(tokens)

    def vectorize(sentence):
        """Mean embedding of the known tokens; zeros when none are known."""
        known_vecs = [keyed_vectors[word] for word in sentence.split() if word in keyed_vectors]
        if not known_vecs:
            # Match the model's dimensionality rather than assuming 100.
            return np.zeros(keyed_vectors.vector_size)
        return np.array(known_vecs).mean(axis=0)

    X = test_case.review.apply(preprocess)
    new_X = np.array([vectorize(sentence) for sentence in X])
    new_y = test_case.sentiment.values

    # Use the classifier that was passed in; the global `mod` was used here
    # before, so the `model` argument had no effect.
    y_new_pred = model.predict(new_X)

    # Listing both labels guarantees the "0" and "1" entries exist even when a
    # small sample happens to contain a single class (their scores are then 0).
    y_pred_report = classification_report(new_y, y_new_pred, labels=[0, 1], output_dict=True)

    return (y_pred_report["0"]["precision"], y_pred_report["0"]["recall"],
            y_pred_report["1"]["precision"], y_pred_report["1"]["recall"])

In [56]:
# Quick smoke run on a 10-review sample; the cell displays the returned tuple.
predictNonLinear(mod, N=10)

(0.75, 1.0, 1.0, 0.8571428571428571)

In [83]:
# Repeat the random-sample evaluation `nums` times (150 reviews per run) and
# report the mean per-class precision and recall.
nums = 20
avg_p0 = avg_r0 = avg_p1 = avg_r1 = 0
for i in range(nums):
    p0, r0, p1, r1 = predictNonLinear(mod, 150)
    # Running totals; they are turned into means after the loop.
    avg_p0, avg_r0 = avg_p0 + p0, avg_r0 + r0
    avg_p1, avg_r1 = avg_p1 + p1, avg_r1 + r1

avg_p0, avg_r0, avg_p1, avg_r1 = (
    total / nums for total in (avg_p0, avg_r0, avg_p1, avg_r1)
)
print("Average Precision (0): "+str(avg_p0), "Average Recall (0): "+str(avg_r0), sep="; ")
print("Average Precision (1): "+str(avg_p1), "Average Recall (1): "+str(avg_r1), sep="; ")

Average Precision (0): 0.6058750804742254; Average Recall (0): 0.9766755814658181
Average Precision (1): 0.9873315302036992; Average Recall (1): 0.7378452622874492
