In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

from collections import defaultdict

In [2]:
import nltk
import random
#from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [40]:
# fonction for cleaning data

In [3]:
def show():
    sns.despine()
    plt.show()
    
def clean_format(w):
    w = w.lower().replace('.', '').replace(',', '').replace('!', '')
    #.replace('+', '').replace('(', '').replace(')', '')
    return w

In [4]:
yelp_df = pd.read_csv('yelp_review1.csv',nrows =10000)
yelp_df.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [5]:
print(list(yelp_df))

['review_id', 'user_id', 'business_id', 'stars', 'date', 'text', 'useful', 'funny', 'cool']


In [41]:
# choose text and star two colmns

In [6]:
documents = [(t, star) for t,star in zip(yelp_df['text'], yelp_df['stars'])]
save_documents = open('documents.pickle',"wb")
pickle.dump(documents, save_documents)
save_documents.close()

In [42]:
# delete stopwords

In [7]:
from nltk.corpus import stopwords
 
data = "This is nonsense. I hate this place. The food is bad and the service is terrible"
stopWords = list(set(stopwords.words('english')))

# stopWords[:5]
print(' '.join([w for w in data.split() if w not in stopWords ]))

This nonsense. I hate place. The food bad service terrible


In [43]:
# Use the stemmer to stem all the words
ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

print([ps.stem(w) for w in example_words])

['python', 'python', 'python', 'python', 'pythonli']


In [None]:
# Extract all occurrences of words and record the number of occurrences

In [9]:
all_words = []

for (t, star) in documents:
    for word in t.split():
        w = clean_format(word)
        all_words.append(ps.stem(w))
        
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print("good appeared: " + str(all_words['good']) + " times")

[('the', 65354), ('and', 44737), ('a', 36044), ('i', 34420), ('to', 30637), ('wa', 22561), ('of', 20322), ('it', 18047), ('is', 15896), ('for', 15601), ('in', 14641), ('with', 11745), ('my', 11421), ('that', 11393), ('but', 10306)]
good appeared: 5827 times


In [10]:
words_features = [s for (s,_) in list(all_words.most_common(5000))]

In [11]:
print(words_features[:10])

['the', 'and', 'a', 'i', 'to', 'wa', 'of', 'it', 'is', 'for']


In [12]:
def find_features(document):
    words = set(document.split())
    features = {}
    for w in words:
        w = clean_format(w)
        w = ps.stem(w)
        features[w] = (w in words_features)
    return features

featuresets = [(find_features(doc), star) for (doc,star) in documents]

print(featuresets[0])

({'a': True, 'but': True, "30'": False, 'help': True, 'bologna': False, 'sinc': True, 'veri': True, 'simpl': True, 'been': True, 'around': True, 'with:': False, 'amaz': True, 'mustard': True, 'serv': True, 'salami': True, 'with': True, "it'": True, 'the': True, 'they': True, 'nonetheless': True, 'friendli': True, 'wa': True, 'staff': True, 'place': True, 'super': True, 'same': True, 'thing': True, 'start': True, 'still': True, 'and': True, 'sandwich': True}, 5)


In [13]:
len(featuresets)

10000

In [14]:
# POS to NEG encode
temp = []
pos_count = 0
for i in range(len(featuresets)):
    if featuresets[i][1] >= 4:
        temp.append((featuresets[i][0], 'pos'))
        pos_count = pos_count + 1
    elif featuresets[i][1] <= 2:
        temp.append((featuresets[i][0], 'neg'))
print("pos rate: ", pos_count / len(temp))
print("len of temp: ", len(temp))

pos rate:  0.7870593915982617
len of temp:  8284


In [15]:
training_set = temp[:6000]
testing_set = temp[6000:]

print(len(training_set))
print(len(testing_set))

6000
2284


In [16]:
np.random.seed(4747)

clf = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Classifier accuracy: ", nltk.classify.accuracy(clf, testing_set) * 100)
clf.show_most_informative_features(15)

Naive Bayes Classifier accuracy:  42.16287215411559
Most Informative Features
                   worst = True              neg : pos    =     45.1 : 1.0
                 disgust = True              neg : pos    =     44.2 : 1.0
              underwhelm = True              neg : pos    =     31.2 : 1.0
                   appal = True              neg : pos    =     28.7 : 1.0
                 horrend = True              neg : pos    =     28.7 : 1.0
                  poorli = True              neg : pos    =     27.7 : 1.0
            unprofession = True              neg : pos    =     27.7 : 1.0
              condescend = True              neg : pos    =     26.2 : 1.0
                 downhil = True              neg : pos    =     24.7 : 1.0
                    ined = True              neg : pos    =     20.2 : 1.0
                    wors = True              neg : pos    =     19.7 : 1.0
                  crappi = True              neg : pos    =     18.7 : 1.0
                   cro

In [17]:
np.random.seed(4747)

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

###############
save_classifier = open('originalnaivebayes5k.pickle',"wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

save_classifier = open("MNB_classifier5k.pickle","wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

save_classifier = open("BernoulliNB_classifier5k.pickle","wb")
pickle.dump(BernoulliNB_classifier, save_classifier)
save_classifier.close()

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

save_classifier = open("LogisticRegression_classifier5k.pickle","wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

save_classifier = open("LinearSVC_classifier5k.pickle","wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

PolySVC_classifier = SklearnClassifier(SVC(kernel = 'poly'))
PolySVC_classifier.train(training_set)
print("PolySVC_classifier accuracy percent:", (nltk.classify.accuracy(PolySVC_classifier, testing_set))*100)

save_classifier = open("PolySVC_classifier5k.pickle","wb")
pickle.dump(PolySVC_classifier, save_classifier)
save_classifier.close()

RadialSVC_classifier = SklearnClassifier(SVC(kernel = 'rbf'))
RadialSVC_classifier.train(training_set)
print("RadialSVC_classifier accuracy percent:", (nltk.classify.accuracy(RadialSVC_classifier, testing_set))*100)

save_classifier = open("RadialSVC_classifier5k.pickle","wb")
pickle.dump(RadialSVC_classifier, save_classifier)
save_classifier.close()

#NuSVC_classifier = SklearnClassifier(NuSVC())
#NuSVC_classifier.train(training_set)
#print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

#save_classifier = open("NuSVC_classifier5k.pickle","wb")
#pickle.dump(LinearSVC_classifier, save_classifier)
#save_classifier.close()

SGDC_classifier = SklearnClassifier(SGDClassifier())
SGDC_classifier.train(training_set)
print("SGDClassifier accuracy percent:",nltk.classify.accuracy(SGDC_classifier, testing_set)*100)

save_classifier = open("SGDC_classifier5k.pickle","wb")
pickle.dump(SGDC_classifier, save_classifier)
save_classifier.close()

Original Naive Bayes Algo accuracy percent: 42.16287215411559
Most Informative Features
                   worst = True              neg : pos    =     45.1 : 1.0
                 disgust = True              neg : pos    =     44.2 : 1.0
              underwhelm = True              neg : pos    =     31.2 : 1.0
                   appal = True              neg : pos    =     28.7 : 1.0
                 horrend = True              neg : pos    =     28.7 : 1.0
                  poorli = True              neg : pos    =     27.7 : 1.0
            unprofession = True              neg : pos    =     27.7 : 1.0
              condescend = True              neg : pos    =     26.2 : 1.0
                 downhil = True              neg : pos    =     24.7 : 1.0
                    ined = True              neg : pos    =     20.2 : 1.0
                    wors = True              neg : pos    =     19.7 : 1.0
                  crappi = True              neg : pos    =     18.7 : 1.0
            

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression_classifier accuracy percent: 91.24343257443083




LinearSVC_classifier accuracy percent: 89.88616462346761
PolySVC_classifier accuracy percent: 83.53765323992994
RadialSVC_classifier accuracy percent: 89.31698774080562
SGDClassifier accuracy percent: 90.06129597197898


In [18]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        chosen_class = votes.count(mode(votes))
        return chosen_class / len(votes)
    
voted_classifier = VoteClassifier(MNB_classifier,
                                  LogisticRegression_classifier, 
                                  LinearSVC_classifier) 
#                                   MNB_classifier, 
#                                   NuSVC_classifier)
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

voted_classifier accuracy percent: 90.8493870402802


In [36]:
def sentiment(text):
    feats = find_features(text)
    return voted_classifier.classify(feats),voted_classifier.confidence(feats)

In [37]:
sentiment("This place is awesome! Definitely recommend it to friends. ")

('pos', 1.0)

In [38]:
sentiment("This restaurant is terrible. The service was bad. The food was not delicious.")

('neg', 0.6666666666666666)