In [178]:
import pandas as pd
import sklearn
import nltk
import re

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, accuracy_score

In [179]:
# Load the tab-separated review dump. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are read as ordinary text rather than delimiters.
# NOTE(review): the path is absolute and machine-specific; the cell only runs
# where this exact file exists.
df = pd.read_csv(r"C:\Users\hp\Downloads\a1_RestaurantReviews_HistoricDump.tsv", sep = '\t', quoting  = 3)

In [180]:
# Preview the first rows of the raw frame (review text plus the Liked label).
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [181]:
# (rows, columns) of the raw frame.
df.shape

(900, 2)

In [182]:
# Class balance: number of reviews per Liked value.
df['Liked'].value_counts()

1    496
0    404
Name: Liked, dtype: int64

In [183]:
# Make sure the NLTK stop-word corpus is available locally, then load the
# English list into a plain Python list so it can be edited afterwards.
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
# Display the full list for inspection.
stopwords_list

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [184]:
# Keep the negations "no" and "not" out of the stop-word list: dropping them
# from a review would flip its sentiment ("not good" -> "good").
# The membership guard makes the cell safe to re-run; a bare list.remove()
# raises ValueError the second time because the word is already gone.
# NOTE(review): no visible cell actually applies stopwords_list to the reviews.
for negation in ('no', 'not'):
    if negation in stopwords_list:
        stopwords_list.remove(negation)

In [185]:
# Look at one raw review (index label 6) to see the kind of noise to clean.
df['Review'][6]

"Honeslty it didn't taste THAT fresh.)"

## Data Preprocessing

In [186]:
# pip install spacy

In [187]:
#Method to remove html tags
from bs4 import BeautifulSoup
def remove_html_tags(text):
    """Parse ``text`` with the 'html.parser' backend and return only its text content."""
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

##Case-Standardization
def to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

##Standardizing Accent Characters
import unicodedata
def standardize_accented_chars(text):
    """Replace accented characters with their plain ASCII base letters.

    The text is NFKD-normalised (which splits a letter from its combining
    accent), then every non-ASCII code point is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

##Dealing with URLs
def remove_url(text):
    """Delete every token that starts with ``http:`` or ``https:``.

    The match runs up to the next whitespace character; the surrounding
    whitespace itself is left in place.
    """
    url_pattern = r'https?:\S*'
    return re.sub(url_pattern, '', text)

##Expanding Contractions
import contractions
def expand_contractions(text):
    """Expand contractions word by word with ``contractions.fix``.

    The text is split on whitespace and re-joined with single spaces, so
    runs of whitespace in the input are collapsed.
    """
    return ' '.join(contractions.fix(token) for token in text.split())

##Removing Mentions and Hashtags
def remove_mentions_and_tags(text):
    """Strip ``@mention`` and ``#hashtag`` tokens from ``text``.

    Each marker and the non-whitespace run after it is deleted; mentions are
    handled first, then hashtags. Surrounding whitespace is kept.
    """
    for marker_pattern in (r'@\S*', r'#\S*'):
        text = re.sub(marker_pattern, '', text)
    return text
                  
##Removing Special Characters
def remove_special_characters(text):
    """Drop every character that is not a letter, digit, whitespace or basic punctuation.

    Kept characters: ASCII letters, digits, whitespace and ``. , ! ? / : ; " '``.

    The upper-case range is ``A-Z``. The previous ``A-z`` spanned ASCII 65-122
    and therefore also whitelisted ``[ \\ ] ^ _`` and the backtick, which were
    never removed.
    """
    # Negated character class: anything outside the whitelist is deleted.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)

##Removing Digits
def remove_numbers(text):
    """Remove digits from ``text``.

    Implemented as a whitelist: only ASCII letters, whitespace and
    ``. , ! ? / : ; " '`` survive, so any other character is removed along
    with the digits.

    The upper-case range is ``A-Z``. The previous ``A-z`` also let
    ``[ \\ ] ^ _`` and the backtick through.
    """
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)
                  
##Removing Punctuation
import string
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)
                  
def get_stem(text):
    """Porter-stem every whitespace-separated word of ``text``.

    The words are re-joined with single spaces, so runs of whitespace in the
    input are collapsed.
    """
    # Use the PorterStemmer class imported at the top of the notebook rather
    # than the `nltk.porter` attribute, which is not an import this file makes.
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())

def get_lemma(text):
    """Lemmatize every whitespace-separated word of ``text`` with WordNet.

    Each word goes through ``WordNetLemmatizer.lemmatize`` with its default
    arguments; the results are re-joined with single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in text.split():
        lemmas.append(wordnet.lemmatize(token))
    return ' '.join(lemmas)

In [188]:
##Using Porter Stemming
cleaned_reviews1 = []
# Initialised here because the lemmatization cell below appends to this list.
cleaned_reviews2 = []
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# Not used by this cell; kept in case other cells reference these names.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

# Cleaning steps, applied to each review in exactly this order.
stem_steps = (
    remove_html_tags,
    to_lowercase,
    standardize_accented_chars,
    remove_url,
    expand_contractions,
    remove_mentions_and_tags,
    remove_special_characters,
    remove_numbers,
    remove_punctuation,
    get_stem,
)

for review in df['Review']:
    for step in stem_steps:
        review = step(review)
    cleaned_reviews1.append(review)

# df.insert raises ValueError when the column already exists, which made this
# cell fail on a second run. Overwrite in place when it is already there;
# otherwise insert at position 1, right after 'Review'.
if "cleaned_reviews1" in df.columns:
    df["cleaned_reviews1"] = cleaned_reviews1
else:
    df.insert(1, "cleaned_reviews1", cleaned_reviews1)




In [189]:
##Using Lemmatizer
# Start from an empty list inside this cell. Appending to a list created in
# another cell doubled its length whenever this cell was re-run.
cleaned_reviews2 = []

# Cleaning steps, applied to each review in exactly this order.
lemma_steps = (
    remove_html_tags,
    to_lowercase,
    standardize_accented_chars,
    remove_url,
    expand_contractions,
    remove_mentions_and_tags,
    remove_special_characters,
    remove_numbers,
    remove_punctuation,
    get_lemma,
)

for review in df['Review']:
    for step in lemma_steps:
        review = step(review)
    cleaned_reviews2.append(review)

# df.insert raises ValueError when the column already exists. Overwrite in
# place on a re-run; otherwise insert at position 1, right after 'Review'.
if "cleaned_reviews2" in df.columns:
    df["cleaned_reviews2"] = cleaned_reviews2
else:
    df.insert(1, "cleaned_reviews2", cleaned_reviews2)



In [190]:
# Compare the raw review with its lemmatized and stemmed versions.
df.head()

Unnamed: 0,Review,cleaned_reviews2,cleaned_reviews1,Liked
0,Wow... Loved this place.,wow loved this place,wow love thi place,1
1,Crust is not good.,crust is not good,crust is not good,0
2,Not tasty and the texture was just nasty.,not tasty and the texture wa just nasty,not tasti and the textur wa just nasti,0
3,Stopped by during the late May bank holiday of...,stopped by during the late may bank holiday of...,stop by dure the late may bank holiday off ric...,1
4,The selection on the menu was great and so wer...,the selection on the menu wa great and so were...,the select on the menu wa great and so were th...,1


In [191]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer; max_features caps the vocabulary at 1420 terms.
cv = CountVectorizer(max_features = 1420)

In [192]:
# Fitting one vectorizer on both corpora threw away the vocabulary behind X1,
# because the second fit_transform overwrote the first. Give the stemmed text
# its own unfitted copy so both fitted vectorizers stay available.
from sklearn.base import clone

cv_stem = clone(cv)
X1 = cv_stem.fit_transform(df['cleaned_reviews1']).toarray()
# `cv` still ends up fitted on the lemmatized text, exactly as before.
X2 = cv.fit_transform(df['cleaned_reviews2']).toarray()
# Select the label by name rather than by "last column", so adding columns
# to df cannot silently change the target.
y = df['Liked'].values

In [193]:
# Sanity check: both feature matrices and the label vector must have the same number of rows.
print('X1 Shape:', X1.shape)
print('X2 Shape:', X2.shape)
print('y Shape:', y.shape)

X1 Shape: (900, 1420)
X2 Shape: (900, 1420)
y Shape: (900,)


In [194]:
# Saving BoW dictionary to later use in prediction
# NOTE: `cv` was last fitted on df['cleaned_reviews2'], so the pickle holds
# the vocabulary of the lemmatized reviews.
import pickle
bow_path = 'c1_BoW_Sentiment_Model.pkl'
# The context manager flushes and closes the file even if dump() raises;
# the previous bare open() never closed its handle.
with open(bow_path, "wb") as bow_file:
    pickle.dump(cv, bow_file)

In [195]:
from sklearn.model_selection import train_test_split
# Split both feature matrices and the labels in one call, so every output
# shares the same shuffled row indices. Two separate calls overwrote
# y_train/y_test and only stayed aligned because the seeds matched.
(X1_train, X1_test,
 X2_train, X2_test,
 y_train, y_test) = train_test_split(X1, X2, y, test_size=0.333, random_state=42)

In [196]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# Baseline models, all built with their default constructor arguments.
classifiers = [
    GaussianNB(),
    MultinomialNB(),
    BernoulliNB(),
    SVC(),
    LogisticRegression(),
]
# Train and score each model on the features built from cleaned_reviews1 (stemmed text).
for classifier in classifiers:
    y_pred = classifier.fit(X1_train, y_train).predict(X1_test)
    score = accuracy_score(y_test, y_pred)
    print("Accuracy score of {} is:{}".format(classifier, score))
    

Accuracy score of GaussianNB() is:0.7233333333333334
Accuracy score of MultinomialNB() is:0.8033333333333333
Accuracy score of BernoulliNB() is:0.73
Accuracy score of SVC() is:0.7566666666666667
Accuracy score of LogisticRegression() is:0.7733333333333333


In [197]:
# Refit the same estimator objects on the features built from cleaned_reviews2
# (lemmatized text). After this loop each classifier holds the X2-trained model.
for classifier in classifiers:
    y_pred = classifier.fit(X2_train, y_train).predict(X2_test)
    score = accuracy_score(y_test, y_pred)
    print("Accuracy score of {} is:{}".format(classifier, score))

Accuracy score of GaussianNB() is:0.71
Accuracy score of MultinomialNB() is:0.8033333333333333
Accuracy score of BernoulliNB() is:0.7466666666666667
Accuracy score of SVC() is:0.7433333333333333
Accuracy score of LogisticRegression() is:0.7666666666666667
