## Classifying whether feedback left on a website is either positive or negative.

In [3]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
amazon = pd.read_csv("amazon_cells_labelled.txt",delimiter="\t",header=None)
amazon.columns = ['feedback','score']
amazon.head(5)
amazon.shape
# score 1 (positive), score 0 (negative)

(1000, 2)

In [5]:
# import negative sentiment words data to create list
# source citation: 
#    Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." 
#;       Proceedings of the ACM SIGKDD International Conference on Knowledge 
#;       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, 
#;       Washington, USA, 
neg_words = pd.read_csv("negative-words.txt",delimiter='\t',encoding="ISO-8859-1",skiprows=34,header=None)
neg_words.columns = ["Negative Words"]

In [6]:
# change score to boolean values (looking for instances where negative messages return True)
amazon['score'] = (amazon['score'] == 0)
amazon.head(3)

Unnamed: 0,feedback,score
0,So there is no way for me to plug it in here i...,True
1,"Good case, Excellent value.",False
2,Great for the jawbone.,False


In [7]:
# create list of keywords + "!" 
keywords = list(neg_words.values.flatten())
keywords.append("no")
keywords.append("never")
keywords.append("not")
len(keywords)

4786

In [8]:
#strip punctuations from feedback messages
def strip_punctuation(message):
    from string import punctuation
    return ''.join(m for m in message if m not in punctuation)

# compare two lists to see if feedback contains negative word
def neg_message_check(df,col_name,alist):
    import re
    message_list = list(df[col_name].values.flatten())
    new_message_list = []
    for message in message_list:
        new_message = strip_punctuation(message.lower())
        #escape_message = re.escape(new_message)
        new_message_list.append(new_message)
        
    nm = pd.Series(new_message_list)
    df["modified_feedback"] = nm.values
    
    for key in alist:
        escaped_key = re.escape(key)
        df[str(key)] = df.modified_feedback.str.contains("" + str(escaped_key) + "",case=False)
        #amazon[str(key)] = amazon.modified_feedback.apply(lambda sentence: any(word in sentence for word in alist))

In [9]:
neg_message_check(amazon,"feedback",keywords)

In [10]:
data = amazon[keywords]
target = amazon["score"]

In [11]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

bnb.fit(data,target)

y_pred = bnb.predict(data)

print("Number of mislabeled points out of total {} points: {}".format(data.shape[0],(target != y_pred).sum()))

Number of mislabeled points out of total 1000 points: 218
