In [70]:
import pandas as pd
import numpy as np
import sklearn 
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
%matplotlib inline

# 1ST ITERATION OF CLASSIFIER

In [71]:
# Read in Data
imdb = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_imdb.txt',delimiter='\t',header=None)
amazon = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_amazon.txt',delimiter='\t',header=None)
yelp = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_yelp.txt',delimiter='\t',header=None)

# Change column names to make data uniform
imdb.columns = ['review', 'positive']
yelp.columns = ['review', 'positive']
amazon.columns = ['review', 'positive']

In [72]:
keywords = ['terrible', 'awful', 'worst', 'bad', 'stupid', 'poor', 'worse', 'attempt', 'crap', 'fail']
for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(str(key),case=False)
    yelp[str(key)] = yelp.review.str.contains(str(key),case=False)
    amazon[str(key)] = amazon.review.str.contains(str(key),case=False)

In [73]:
# Define functions to save space below

# Define function to create a binary model from data selected
def fit_binary_model(data,target):
    bnb = BernoulliNB()
    bnb.fit(data[keywords], target)
    pred = bnb.predict(data)
    print('Out of {} predictions, {} were misclassified giving {}% accuracy'.format(data.shape[0],
                                                                                    (pred != target).sum(),
                                                                                    round((100-((pred != target).sum() / data.shape[0])*100),1)))
# Define function to generate confusion matrix of results generated by model
def get_conf_matrix(data,target):
    bnb = BernoulliNB()
    bnb.fit(data, target)
    pred = bnb.predict(data)
    matrix = pd.DataFrame(confusion_matrix(target,pred))
    matrix.columns=("Negative Reviews","Positive Reviews")
    matrix.index = ("Number Predicted Negative","Number Predicted Positive")
    return matrix

# Create function to compare outputs from models after defining new keywords below
def compare_models():
    # Run model for IMDB Reviews
    print('-'*100,)
    print("FOR THE IMDB DATASET")
    input_df = imdb[keywords]
    target_df = imdb['positive']
    fit_binary_model(input_df,target_df)
    print(get_conf_matrix(input_df,target_df))

    # Run model for Yelp Reviews
    print('-'*100,)
    print("FOR THE YELP DATASET")
    input_df = yelp[keywords]
    target_df = yelp['positive']
    fit_binary_model(input_df,target_df)
    print(get_conf_matrix(input_df,target_df))

    # Run model for Amazon Reviews
    print('-'*100,)
    print("FOR THE AMAZON DATASET")
    input_df = amazon[keywords]
    target_df = amazon['positive']
    fit_binary_model(input_df,target_df)
    print(get_conf_matrix(input_df,target_df))

In [74]:
compare_models()

----------------------------------------------------------------------------------------------------
FOR THE IMDB DATASET
Out of 748 predictions, 268 were misclassified giving 64.2% accuracy
                           Negative Reviews  Positive Reviews
Number Predicted Negative               105               257
Number Predicted Positive                11               375
----------------------------------------------------------------------------------------------------
FOR THE YELP DATASET
Out of 1000 predictions, 446 were misclassified giving 55.4% accuracy
                           Negative Reviews  Positive Reviews
Number Predicted Negative                55               445
Number Predicted Positive                 1               499
----------------------------------------------------------------------------------------------------
FOR THE AMAZON DATASET
Out of 1000 predictions, 434 were misclassified giving 56.6% accuracy
                           Negative Reviews  Positi

# 2ND ITERATION OF CLASSIFIER

In [75]:
# Read in Data again
imdb = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_imdb.txt',delimiter='\t',header=None)
amazon = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_amazon.txt',delimiter='\t',header=None)
yelp = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_yelp.txt',delimiter='\t',header=None)

# Change column names to make data uniform
imdb.columns = ['review', 'positive']
yelp.columns = ['review', 'positive']
amazon.columns = ['review', 'positive']

In [76]:
keywords = ['terrible', 'awful', 'worst', 'bad', 'stupid', 'poor', 'worse', 'attempt', 'crap', 'fail', 
            'annoying', 'cheap','lousy','torture', 'ridiculous', 'not', 'unbelievable', 'skip', 'shame']
for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(str(key),case=False)
    yelp[str(key)] = yelp.review.str.contains(str(key),case=False)
    amazon[str(key)] = amazon.review.str.contains(str(key),case=False)

In [77]:
compare_models()

----------------------------------------------------------------------------------------------------
FOR THE IMDB DATASET
Out of 748 predictions, 225 were misclassified giving 69.9% accuracy
                           Negative Reviews  Positive Reviews
Number Predicted Negative               164               198
Number Predicted Positive                27               359
----------------------------------------------------------------------------------------------------
FOR THE YELP DATASET
Out of 1000 predictions, 362 were misclassified giving 63.8% accuracy
                           Negative Reviews  Positive Reviews
Number Predicted Negative               161               339
Number Predicted Positive                23               477
----------------------------------------------------------------------------------------------------
FOR THE AMAZON DATASET
Out of 1000 predictions, 343 were misclassified giving 65.7% accuracy
                           Negative Reviews  Positi

# 3RD ITERATION OF CLASSIFIER

In [78]:
# Read in Data again
imdb = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_imdb.txt',delimiter='\t',header=None)
amazon = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_amazon.txt',delimiter='\t',header=None)
yelp = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_yelp.txt',delimiter='\t',header=None)

# Change column names to make data uniform
imdb.columns = ['review', 'positive']
yelp.columns = ['review', 'positive']
amazon.columns = ['review', 'positive']

In [79]:
keywords = ['terrible', 'awful', 'worst', 'bad', 'stupid', 'poor', 'worse', 'attempt', 'crap', 'fail', 
            'annoying', 'cheap','lousy','torture', 'ridiculous', 'not', 'unbelievable', 'skip', 'shame', 
            'not even', 'miss', 'terrific', 'best', 'great', 'fun']

for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(str(key),case=False)
    yelp[str(key)] = yelp.review.str.contains(str(key),case=False)
    amazon[str(key)] = amazon.review.str.contains(str(key),case=False)

In [80]:
compare_models()

----------------------------------------------------------------------------------------------------
FOR THE IMDB DATASET
Out of 748 predictions, 223 were misclassified giving 70.2% accuracy
                           Negative Reviews  Positive Reviews
Number Predicted Negative               165               197
Number Predicted Positive                26               360
----------------------------------------------------------------------------------------------------
FOR THE YELP DATASET
Out of 1000 predictions, 359 were misclassified giving 64.1% accuracy
                           Negative Reviews  Positive Reviews
Number Predicted Negative               160               340
Number Predicted Positive                19               481
----------------------------------------------------------------------------------------------------
FOR THE AMAZON DATASET
Out of 1000 predictions, 343 were misclassified giving 65.7% accuracy
                           Negative Reviews  Positi

# 4TH ITERATION OF CLASSIFIER

In [81]:
# Read in Data again
imdb = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_imdb.txt',delimiter='\t',header=None)
amazon = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_amazon.txt',delimiter='\t',header=None)
yelp = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_yelp.txt',delimiter='\t',header=None)

# Change column names to make data uniform
imdb.columns = ['review', 'positive']
yelp.columns = ['review', 'positive']
amazon.columns = ['review', 'positive']

In [82]:
keywords = ['terrible', 'awful', 'worst', 'bad', 'stupid', 'poor', 'worse', 'attempt', 'crap', 'fail', 
            'annoying', 'cheap','lousy', 'unfortunate', 'boring', 'sucks', 'sucked', 'waste',
            'torture', ' lack', 'lame', 'ridiculous', 'not', 'unbelievable', 'skip', 'shame', 
           'not even', 'miss', 'terrific', 'best', 'great', 'fun']
for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(str(key),case=False)
    yelp[str(key)] = yelp.review.str.contains(str(key),case=False)
    amazon[str(key)] = amazon.review.str.contains(str(key),case=False)

In [83]:
compare_models()

----------------------------------------------------------------------------------------------------
FOR THE IMDB DATASET
Out of 748 predictions, 202 were misclassified giving 73.0% accuracy
                           Negative Reviews  Positive Reviews
Number Predicted Negative               184               178
Number Predicted Positive                24               362
----------------------------------------------------------------------------------------------------
FOR THE YELP DATASET
Out of 1000 predictions, 340 were misclassified giving 66.0% accuracy
                           Negative Reviews  Positive Reviews
Number Predicted Negative               179               321
Number Predicted Positive                19               481
----------------------------------------------------------------------------------------------------
FOR THE AMAZON DATASET
Out of 1000 predictions, 321 were misclassified giving 67.9% accuracy
                           Negative Reviews  Positi

# FINAL ITERATION OF CLASSIFIER

In [84]:
# Read in Data again
imdb = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_imdb.txt',delimiter='\t',header=None)
amazon = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_amazon.txt',delimiter='\t',header=None)
yelp = pd.read_csv('C:\\Users\halln\Desktop\THINKFUL\Datasets\sentiment_detector_model\data_yelp.txt',delimiter='\t',header=None)

# Change column names to make data uniform
imdb.columns = ['review', 'positive']
yelp.columns = ['review', 'positive']
amazon.columns = ['review', 'positive']

In [85]:
keywords = ['terrible', 'awful', 'worst', 'bad', 'stupid', 'poor', 'worse', 'attempt', 'crap', 'fail', 'annoying', 'cheap',
           'painful', 'avoid', 'slow', 'pretentious', 'problem', 'embarrassing', 'bored', 'horrible', 'lousy', 'unfortunate', 
           'boring', 'sucks', 'sucked', 'waste', ' mess ', 'wasting', 'mediocre', 'sloppy',
           'disappoint', 'garbage', 'whine', 'whiny', 'plot', 'hate ', 'hated', 'negative', 'nobody', 'flaw',
           'script', 'insult', 'do not', 'torture', ' lack', 'lame', 'ridiculous', 'not', 'unbelievable', 'skip', 'shame', 
           'not even', 'miss', 'excellent', 'amazing', 'love', 'incredible', 'fantastic', 'terrific', 'best', 'great', 'fun']
for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(str(key),case=False)
    yelp[str(key)] = yelp.review.str.contains(str(key),case=False)
    amazon[str(key)] = amazon.review.str.contains(str(key),case=False)

In [86]:
compare_models()

----------------------------------------------------------------------------------------------------
FOR THE IMDB DATASET
Out of 748 predictions, 158 were misclassified giving 78.9% accuracy
                           Negative Reviews  Positive Reviews
Number Predicted Negative               233               129
Number Predicted Positive                29               357
----------------------------------------------------------------------------------------------------
FOR THE YELP DATASET
Out of 1000 predictions, 291 were misclassified giving 70.9% accuracy
                           Negative Reviews  Positive Reviews
Number Predicted Negative               229               271
Number Predicted Positive                20               480
----------------------------------------------------------------------------------------------------
FOR THE AMAZON DATASET
Out of 1000 predictions, 273 were misclassified giving 72.7% accuracy
                           Negative Reviews  Positi