# Introduction
The previous stage includes reviews text preprocessing, which is in Process_Reviews.ipynb<br>
The following stage is aim to analysis reviews sentiment

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from gensim.models import Word2Vec
import spacy
nlp = spacy.load('en_core_web_sm')
import string
import sys
import os
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" 

import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\AA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load reviews

In [4]:
file = "../Datasets/Cleaned_Text_Dataset.csv"
df = pd.read_csv(file)
del df["Unnamed: 0"]
df.head()

Unnamed: 0,review_id,business_id,text,target,words
0,fxWnU4OqONBNoQhEcyazSg,krTHKI0YOpASr4gz2CVWFw,"This location used to be good, several years a...",0.0,this location use to be good several year ago ...
1,FhtER9SGsEYkEhRcs09rsQ,krTHKI0YOpASr4gz2CVWFw,I love Cosi but this Cosi is going down hill f...,0.0,love cosi but this cosi be go down hill fast a...
2,0KlwfaHZyvao41_3S47dyg,w9hS5x1F52Id-G1KTrAOZg,Was not a fan of their cheesesteak. Their wiz ...,0.0,be not fan of their cheesesteak their wiz sauc...
3,2qeje7dttkvREbccHev6Pg,7lwe7n-Yc-V9E_HfLAeylg,"It pains me to write this, but I fear I must.....",0.0,it pain to write this but fear must use to rea...
4,1OR23O0giNcxNbFAi4jgcg,DsKzHnkLKnxZTVsFpts4oA,Cocktails were nice however the bartender Paul...,0.0,cocktail be nice however the bartender paul be...


# Sentiment polarity

我们将情绪分为四个类别，negative、neutral、positive and compound。【摘抄】The first three are easy to understand and for the compound score, it is a combination of positive and negative scores and ranges from -1 to 1: below 0 is negative and above 0 is positive. I am going to use the compound score to measure the sentiment.前三个很容易理解，对于复合分数，它是正分数和负分数的组合，范围从 -1 到 1：低于 0 为负，高于 0 为正。我将使用复合分数来衡量情绪。

In [5]:
# Instantiate new SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
# Generate sentiment scores
sentiment_scores = df['words'].apply(sid.polarity_scores)
sentiment = sentiment_scores.apply(lambda x: x['compound'])
print(df['words'][0])
print(sentiment_scores.head())

this location use to be good several year ago about year ago it start to go downhill and now it be just terrible there be people work on saturday evening nearly all the table be full and there be people wait to order and to pay the sandwich that be make for be on hard bread burn and barely edible they need to either shape up or ship out
0    {'neg': 0.08, 'neu': 0.881, 'pos': 0.039, 'com...
1    {'neg': 0.253, 'neu': 0.702, 'pos': 0.046, 'co...
2    {'neg': 0.11, 'neu': 0.813, 'pos': 0.077, 'com...
3    {'neg': 0.22, 'neu': 0.62, 'pos': 0.16, 'compo...
4    {'neg': 0.113, 'neu': 0.734, 'pos': 0.153, 'co...
Name: words, dtype: object


# Split training set and testing set

In [6]:
X = df['words'] 
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify=y)
print("X_train:\n", X_train.head())
print("\ny_train:\n", y_train.head())

X_train:
 1833    have read the review that this remind someone ...
3046    sit by the huge window you have somewhat of ni...
5958    tatiana the hostess sit we at wonderful table ...
8688    really like this place it be one of the good p...
2672    let set scene for you it be thursday and my bi...
Name: words, dtype: object

y_train:
 1833    0.0
3046    0.0
5958    1.0
8688    1.0
2672    0.0
Name: target, dtype: float64


# Cross-Validation

列举我们将要实验的n-gram，【摘抄】GridSearchCV是Sklearn model_selection包的一个模块，用于超参数调整。 给定一组不同的超参数，GridSearchCV 循环浏览所有可能的超参数值和组合，并在训练数据集上拟合模型。 在这个过程中，它能够确定产生最佳精度的超参数的最佳值和组合（从给定的参数集中）【摘抄】在机器学习模型中，需要人工选择的参数称为超参数。比如随机森林中决策树的个数，人工神经网络模型中隐藏层层数和每层的节点个数，正则项中常数大小等等，他们都需要事先指定。超参数选择不恰当，就会出现欠拟合或者过拟合的问题。而在选择超参数的时候，有两个途径，一个是凭经验微调，另一个就是选择不同大小的参数，带入模型中，挑选表现最好的参数。微调的一种方法是手工调制超参数，直到找到一个好的超参数组合，这么做的话会非常冗长，你也可能没有时间探索多种组合，所以可以使用Scikit-Learn的GridSearchCV来做这项搜索工作。<br>
这里用的是后者<br>
可以提一下交叉验证，cross validation

In [7]:
param_grid = {'c_vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]}

### Bag-of-words model(wordcounts) and Vectorisation

### Logistic Regression model 
找出哪个n-gram在逻辑回归模型中表现更好

In [8]:
lr_pipeline = Pipeline([
    ('c_vectorizer', CountVectorizer()),
    ('lr', LogisticRegression(random_state=42))
])

gs_lr = GridSearchCV(lr_pipeline, refit=True, cv=2, param_grid=param_grid, scoring='f1', n_jobs=-1)
gs_lr.fit(X_train, y_train)

print('optimal n-gram: ', gs_lr.best_estimator_.get_params()['c_vectorizer__ngram_range'])
print("optimal parameter: ", gs_lr.best_params_)
print("optimal score: ", gs_lr.best_score_)

print('classification report')
predictions = gs_lr.predict(X_test)
report = classification_report(y_test, predictions, digits=4, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report

optimal n-gram:  (1, 2)
optimal parameter:  {'c_vectorizer__ngram_range': (1, 2)}
optimal score:  0.9185685098212045
classification report


Unnamed: 0,precision,recall,f1-score,support
0.0,0.942963,0.929197,0.936029,1370.0
1.0,0.930166,0.943755,0.936911,1369.0
accuracy,0.936473,0.936473,0.936473,0.936473
macro avg,0.936564,0.936476,0.93647,2739.0
weighted avg,0.936567,0.936473,0.93647,2739.0


### Support Vector Machine model 
找出哪个n-gram在支持向量机SVM模型中表现更好

In [9]:
svm_pipe = Pipeline([
    ('c_vectorizer', CountVectorizer()),
    ('svm', svm.SVC(max_iter=-1, random_state=42))
])

gs_svm = GridSearchCV(svm_pipe, refit=True, cv=2, param_grid=param_grid, scoring='f1', n_jobs=-1)
gs_svm.fit(X_train, y_train)

print('optimal n-gram: ', gs_svm.best_estimator_.get_params()['c_vectorizer__ngram_range'])
print("optimal parameter: ", gs_svm.best_params_)
print("optimal score: ", gs_svm.best_score_)

print('classification report')
predictions = gs_svm.predict(X_test)
report = classification_report(y_test, predictions, digits=4, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report

optimal n-gram:  (1, 2)
optimal parameter:  {'c_vectorizer__ngram_range': (1, 2)}
optimal score:  0.8775782788523114
classification report


Unnamed: 0,precision,recall,f1-score,support
0.0,0.90386,0.905839,0.904849,1370.0
1.0,0.905564,0.903579,0.90457,1369.0
accuracy,0.90471,0.90471,0.90471,0.90471
macro avg,0.904712,0.904709,0.90471,2739.0
weighted avg,0.904712,0.90471,0.90471,2739.0


【这里要改】对比上面逻辑回归和SVM交叉验证的结果，逻辑回归的最佳性能更好，因此我们选择最佳性能更好的逻辑回归结果，它的最优参数n-gram是（1，2）

# Supervised Learning Classifiers

In [10]:
print("X_train:\n", X_train.head())
print("\ny_train:\n", y_train.head())

X_train:
 1833    have read the review that this remind someone ...
3046    sit by the huge window you have somewhat of ni...
5958    tatiana the hostess sit we at wonderful table ...
8688    really like this place it be one of the good p...
2672    let set scene for you it be thursday and my bi...
Name: words, dtype: object

y_train:
 1833    0.0
3046    0.0
5958    1.0
8688    1.0
2672    0.0
Name: target, dtype: float64


### Bag-of-words model(TF-IDF) and Vectorisation
【摘抄】we can use TF_IDF vectorizing to find the weighted words that occur more frequently in the document that leads to creation of the bag of words model我们可以使用 TF_IDF 向量化来找到文档中出现频率更高的加权词，从而创建词袋模型

In [11]:
# Create a list of stop words with stopwords library 
# and adding extra stopwords that is not potentially useful 
my_stop_words = set(stopwords.words('english') + 
                    list(ENGLISH_STOP_WORDS) + 
                    ['super', 'duper', 've', 'like', 'got', 
                     'Cleveland', 'just', 'don', 'really', 
                     'said', 'told', 'ok','came', 'went', 
                     'did', 'didn', 'good'])
#exclude_stopwords = ['no','none']
#for word in exclude_stopwords:
#    my_stop_words.remove(word)

In [12]:
def my_tokenizer(sentence):
    # to remove any space from beginning and the end of text
    listofwords = sentence.strip().split()
    listof_words = []    
    for word in listofwords:
        if not word in my_stop_words:
            lemm_word = WordNetLemmatizer().lemmatize(word)
            # remove the stop words
            for punctuation_mark in string.punctuation:
                word = word.replace(punctuation_mark, '').lower()
            if len(word)>0:
                listof_words.append(word)
    return listof_words

从上面cross-validation得到最优的n-gram的结果(1,2)，在这里使用

In [13]:
vect_1 = TfidfVectorizer(min_df=100,
                         tokenizer=my_tokenizer,
                         stop_words=list(my_stop_words), 
                         ngram_range=(1,2)).fit(X_train)
X_train1 = vect_1.transform(X_train)
X_test1 = vect_1.transform(X_test)
# the below line for future coeff
X_train1_df = pd.DataFrame(X_train1.toarray(), columns=vect_1.get_feature_names_out())

In [14]:
new_df_words = pd.DataFrame(columns=vect_1.get_feature_names_out(), data=X_train1.toarray())
new_df_words

Unnamed: 0,able,absolutely,actually,add,ago,amazing,ambiance,apologize,app,appetizer,...,work,worth,wow,wrap,write,wrong,year,yelp,yes,yummy
0,0.0,0.0,0.131165,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.114851,0.120613,0.0,0.0,0.0,0.134618,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6386,0.0,0.0,0.000000,0.000000,0.514717,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
6387,0.0,0.0,0.000000,0.184572,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.154578,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
6388,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
6389,0.0,0.0,0.113224,0.000000,0.000000,0.0,0.0,0.144276,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.348614,0.0,0.0,0.0,0.0


【可以转csv，用作推荐】

In [15]:
#counting most repetitive words 
word_counts = np.array(np.sum(X_train1, axis=0)).reshape((-1,))
words = np.array(vect_1.get_feature_names_out())
words_df = pd.DataFrame({"words":words, "counts":word_counts})
words_df.sort_values(by="counts",ascending=False).head(15)

Unnamed: 0,words,counts
172,food,371.535738
357,place,337.507768
332,order,280.974762
194,great,253.023882
498,time,236.286287
83,come,234.206358
430,service,223.709631
284,make,189.489262
397,restaurant,175.841039
510,try,169.702436


### Logistic Regression model

In [16]:
# Fitting Logistic regression to the training set
logreg = LogisticRegression(solver='lbfgs',multi_class='auto',random_state=1)
logreg.fit(X_train1, y_train)

# Predicting the test set results
y_pred_logreg = logreg.predict(X_test1)

# Training score
print(f"Score on training set: {logreg.score(X_train1,y_train)}")
print(f"Score on test set: {logreg.score(X_test1,y_test)}")

Score on training set: 0.9145673603504929
Score on test set: 0.8981380065717415


In [17]:
print('confusion matrix')
con_mat_lr = confusion_matrix(y_test, y_pred_logreg)
df_cm_lr = pd.DataFrame(con_mat_lr, columns = ['Predicted 0','Predicted 1'], index = ['True 0','True 1'])
display(df_cm_lr)
print('classification report')
report = classification_report(y_test, y_pred_logreg, output_dict=True)
df_report = pd.DataFrame(report).transpose()
df_report

confusion matrix


Unnamed: 0,Predicted 0,Predicted 1
True 0,1213,157
True 1,122,1247


classification report


Unnamed: 0,precision,recall,f1-score,support
0.0,0.908614,0.885401,0.896858,1370.0
1.0,0.888177,0.910884,0.899387,1369.0
accuracy,0.898138,0.898138,0.898138,0.898138
macro avg,0.898395,0.898143,0.898122,2739.0
weighted avg,0.898399,0.898138,0.898122,2739.0


In [18]:
# find the most informative words
log_odds = logreg.coef_[0]
coeff = pd.DataFrame(log_odds, X_train1_df.columns, columns=['coef']).sort_values(by='coef', ascending=False)
coeff

Unnamed: 0,coef
delicious,6.107103
amazing,5.101902
love,4.924239
great,4.687776
definitely,3.929150
...,...
horrible,-3.660362
bland,-3.683585
terrible,-3.789638
disappointing,-4.170694


### Naive Bayes model

# Testing Sentiment Classifer

In [19]:
def remove_special(text):
    # remove the URL
    text = re.sub(r"http\S+", "", text)
    # remove mentions
    text = re.sub("@[^\s]*", "", text)
    # remove hashtags
    text = re.sub("#[^\s]*", "", text)
    
    return text

nlp = spacy.load('en_core_web_sm')
def spacy_process(text):
    doc = nlp(text)
    # Lemmatization with Spacy
    lemma_list = []
    for token in doc:
        lemma_list.append(token.lemma_)
            
    #Filter the stopwords, remove non-letters and lower case 
    #filtered_sentence =[]
    #for word in lemma_list:
        #lexeme = nlp.vocab[word]
        #if lexeme.is_stop == False:
            #filtered_sentence.append(word)
    lower_words = []
    for word in lemma_list:
        filtered_aplha_char = re.sub("[^\w]" , " " , word)
        filtered_single_n = re.sub("n\sn", " " ,filtered_aplha_char)
        text_letters_only = re.sub("[^a-zA-Z]", " ", filtered_single_n)
        text_words_lower = text_letters_only.lower()
        remove_single_char = re.sub(r'\b\w\b', '', text_words_lower)
        lower_words.append(remove_single_char)
    text_final = " ".join(lower_words)
    return  " ".join(text_final.split())


def remove_stopwords(text):
    # Stopwords
    my_stop_words = set(stopwords.words('english') + 
                        list(ENGLISH_STOP_WORDS) + 
                        ['super', 'duper', 'place'])
    exclude_stopwords = ['no','none']
    for word in exclude_stopwords:
        my_stop_words.remove(word)
    
    word_tokens = word_tokenize(text)
    tokens_list = list()
    for word in word_tokens:
        if word.isalpha() and word not in my_stop_words:
            tokens_list.append(word)
    return tokens_list

def clean_data(text, needed_format):
    text = remove_special(text)
    words_sentence = spacy_process(text)
    tokens_list = remove_stopwords(words_sentence)
    if needed_format == 'list':
        return tokens_list
    elif needed_format == 'string':
        return words_sentence

In [20]:
# read the restaurant info file
file = "../Datasets/Filtered_Restaurant_Dataset.csv"
restaurant_df = pd.read_csv(file)
restaurant_df.head(1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,general_category
0,hSbwd-VP4THYYvSKQQr6Ow,George's Famous Roast Pork and Beef,1007 S 9th St,Philadelphia,PA,19147.0,39.937345,-75.158118,4.0,27,0,"{'RestaurantsReservations': 'False', 'Restaura...","Restaurants, Delis","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'...",Restaurants


In [21]:
def testing_model(res_review):
    review_bid = res_review['business_id']
    review_rid = res_review['review_id']
    review_res = restaurant_df[restaurant_df['business_id'] == review_bid]
    review_res = review_res.reset_index(drop=True)
    review = res_review['text']
    review_cleaned = res_review['words']
    review_res_name = review_res['name'][0]
    print('Restaurant : ' + review_res_name)
    print('-'*100)
    print("Original review is:\n", review)
    print("\nCleaned review is:\n", review_cleaned)
    result = logreg.predict(vect_1.transform([review_cleaned]))
    print('-'*100)
    print("\nLogistic Regression model: ", result)
    if result == 0:
        print("\nThis review has negetive sentiment\n")
    elif result == 1:
        print("\nThis review has positive sentiment\n")

In [22]:
sample_res_review = df.loc[5111]
testing_model(sample_res_review)
sample_res_review['target']

Restaurant : Horizons
----------------------------------------------------------------------------------------------------
Original review is:
 Ok. Now that was amazing. Cucumber avocado soup, vietnamese tempeh tacos, Pan seared tofu, soy cheesecake and wine from the southern hemisphere. Highly recommended

Cleaned review is:
 ok now that be amazing cucumber avocado soup vietnamese tempeh tacos pan sear tofu soy cheesecake and wine from the southern hemisphere highly recommend
----------------------------------------------------------------------------------------------------

Logistic Regression model:  [1.]

This review has positive sentiment



1.0