# Introduction
The previous stage, text preprocessing of the reviews, is in Preprocess_Reviews.ipynb.<br>
This stage aims to analyse the sentiment of the reviews.

In [167]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import FreqDist
import seaborn as sns
import re
import spacy
nlp = spacy.load('en_core_web_sm')
import string
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import sys
import os
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" 

import matplotlib.pyplot as plt
%matplotlib inline

# Load reviews

In [168]:
# Read the preprocessed reviews and discard the "Unnamed: 0" column
# (presumably a row index saved with the CSV — verify against the
# preprocessing notebook).
file = "Cleaned_Text_Dataset.csv"
df = pd.read_csv(file).drop(columns="Unnamed: 0")
df.head()

Unnamed: 0,review_id,business_id,text,target,cleaned_text
0,Gt_h5A-A5Nt50EbaPCorvw,F8yozE3NWnImNApHO347gQ,First impression was good but their food is ho...,0.0,impression good food horrible cash idea over p...
1,0dELc-isYD5Av7KNsIvcRA,WxB8498ejPtHE7wFa89_fA,"I've been to this location countless times, an...",0.0,location countless time time food service ambi...
2,Bmrq4dvmlVMGU3roXzeDgQ,-mIlmp5l4hKlp1tvHRdvTg,Seems popular- but not that delish. Short sub ...,0.0,popular delish short sub maybe typical length ...
3,tCP2EjtzGJ7JEHlj_8i1xw,hcxea89M_U__LADtu3C0kA,The service was great! The place was very beau...,0.0,service great place beautiful small tight rest...
4,3gM_kcsqfU9eqmE19kL4tw,3BJxm-HnvzdwD1zjmSbmyQ,I was at this restaurant recently and was so u...,0.0,restaurant recently unhappy purchase wonton so...


# Tokenization and Bag-of-Words (BoW)

In [None]:
# Build a corpus-wide word-frequency table (bag of words) from the cleaned reviews.
# Missing reviews are skipped so every item handed to the tokenizer is a string.
text = df['cleaned_text'].dropna().astype(str)

# NOTE(review): `my_stop_words` was never defined in this notebook. spaCy's
# built-in English stop-word set is used as a stand-in; replace it with the
# custom list from the preprocessing stage if one was intended.
my_stop_words = nlp.Defaults.stop_words

# word_tokenize expects a single string, so tokenize review by review
# instead of passing the whole Series. Keep purely alphabetic tokens that
# are not stop words.
tokens = [
    word
    for review in text
    for word in word_tokenize(review)
    if word.isalpha() and word not in my_stop_words
]

# Frequency distribution over all kept tokens; the 20 most common are tabulated.
token_dist = FreqDist(tokens)
dist = pd.DataFrame(token_dist.most_common(20), columns=['Word', 'Frequency'])
dist

为了方便接下来的使用，我需要将dataframe里cleaned_text转为token list

In [160]:
# NOTE: this cell is disabled. The code below is wrapped in a string literal,
# so nothing in it is executed: `list_of_token` is not defined and
# df['cleaned_text'] is left unchanged. The cell only evaluates to that string.
"""
def list_of_token(text):
    return [w for w in text.split()]
df['cleaned_text'] = df['cleaned_text'].apply(lambda text: list_of_token(text))
df
"""

"\ndef list_of_token(text):\n    return [w for w in text.split()]\ndf['cleaned_text'] = df['cleaned_text'].apply(lambda text: list_of_token(text))\ndf\n"

In [161]:
# Features are the cleaned review texts; labels are the sentiment targets.
X, y = df['cleaned_text'], df['target']

# Hold out 30% of the reviews for testing. The split is stratified on the
# target and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_train.head()

4882    similar reviewer avoid place year simply veget...
4931    happy pop major change work more long table go...
275     month track burger fi like iroquois brave prov...
6592    husband dine wedding anniversary note occasion...
7150    love reading terminal shop fish head pick hoag...
Name: cleaned_text, dtype: object

In [162]:
y_train.head()

4882    1.0
4931    1.0
275     0.0
6592    1.0
7150    1.0
Name: target, dtype: float64

建立评估函数，它可以生成混淆矩阵（confusion matrix）的热力图（heatmap），使评估结果更直观。

In [163]:
def evaluate(y_test, predictions, title='Heatmap of confusion matrix for Test data', ax=None):
    """Plot the confusion matrix of a set of predictions as an annotated heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    predictions : array-like
        Predicted labels, aligned with ``y_test``.
    title : str, optional
        Plot title. Defaults to the title used for the test-set heatmap.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted a new figure is created, so repeated
        calls in one cell do not draw on top of an earlier heatmap.

    Returns
    -------
    None
        The function only draws the figure.
    """
    cf_matrix = confusion_matrix(y_test, predictions)
    if ax is None:
        _, ax = plt.subplots()
    # fmt='d' prints the cell counts as integers.
    sns.heatmap(cf_matrix, annot=True, fmt='d', cmap="Blues", ax=ax)
    ax.set_title(title)
    # confusion_matrix puts true labels on rows and predicted labels on columns.
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

# Vectorisation and Validation

列举我们将要实验的 n-gram 范围。<br>
【摘抄】GridSearchCV 是 Sklearn model_selection 包的一个模块，用于超参数调整。给定一组不同的超参数，GridSearchCV 循环浏览所有可能的超参数值和组合，并在训练数据集上拟合模型。在这个过程中，它能够确定产生最佳精度的超参数的最佳值和组合（从给定的参数集中）。<br>
【摘抄】在机器学习模型中，需要人工选择的参数称为超参数。比如随机森林中决策树的个数，人工神经网络模型中隐藏层层数和每层的节点个数，正则项中常数大小等等，它们都需要事先指定。超参数选择不恰当，就会出现欠拟合或者过拟合的问题。而在选择超参数的时候，有两个途径，一个是凭经验微调，另一个就是选择不同大小的参数，代入模型中，挑选表现最好的参数。微调的一种方法是手工调制超参数，直到找到一个好的超参数组合，这么做的话会非常冗长，你也可能没有时间探索多种组合，所以可以使用 Scikit-Learn 的 GridSearchCV 来做这项搜索工作。<br>
这里用的是后者<br>
可以提一下交叉验证，cross validation

In [164]:
# Candidate n-gram ranges for the vectorizer step: every (min_n, max_n)
# pair with 1 <= min_n <= max_n <= 3, in ascending order.
param_grid = {
    'c_vectorizer__ngram_range': [
        (min_n, max_n)
        for min_n in range(1, 4)
        for max_n in range(min_n, 4)
    ],
}

### Logistic Regression model 
找出哪个n-gram在逻辑回归模型中表现更好

In [165]:
# Bag-of-words counts feeding a logistic regression classifier.
lr_pipeline = Pipeline(steps=[
    ('c_vectorizer', CountVectorizer()),
    ('lr', LogisticRegression(random_state=42)),
])

# Grid-search the candidate n-gram ranges with 2-fold cross-validation,
# scored by F1. refit=True retrains the best configuration so the search
# object can be used directly for prediction.
gs_lr = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=2,
    refit=True,
    n_jobs=-1,
)
gs_lr.fit(X_train, y_train)

best_lr_params = gs_lr.best_estimator_.get_params()
print('Optimal n-gram:', best_lr_params['c_vectorizer__ngram_range'])
print("最优参数: ", gs_lr.best_params_)
print("最佳性能: ", gs_lr.best_score_)

# Score the refitted best model on the held-out test set.
predictions = gs_lr.predict(X_test)
print(classification_report(y_test, predictions, digits=4))
#evaluate(y_test, predictions)

Optimal n-gram: (1, 2)
最优参数:  {'c_vectorizer__ngram_range': (1, 2)}
最佳性能:  0.9572187853968808
              precision    recall  f1-score   support

         0.0     0.9591    0.9917    0.9751      1088
         1.0     0.9914    0.9577    0.9743      1088

    accuracy                         0.9747      2176
   macro avg     0.9753    0.9747    0.9747      2176
weighted avg     0.9753    0.9747    0.9747      2176



### Support Vector Machine model 
找出哪个n-gram在支持向量机SVM模型中表现更好

In [166]:
# Bag-of-words counts feeding a support vector classifier.
svm_pipe = Pipeline(steps=[
    ('c_vectorizer', CountVectorizer()),
    ('svm', svm.SVC(max_iter=-1, random_state=42)),
])

# Same n-gram search as for logistic regression: 2-fold cross-validation,
# scored by F1, with the best configuration refitted for prediction.
gs_svm = GridSearchCV(
    estimator=svm_pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=2,
    refit=True,
    n_jobs=-1,
)
gs_svm.fit(X_train, y_train)

best_svm_params = gs_svm.best_estimator_.get_params()
print('Best ngram_range:', best_svm_params['c_vectorizer__ngram_range'])
print("最优参数: ", gs_svm.best_params_)
print("最佳性能: ", gs_svm.best_score_)

# Score the refitted best model on the held-out test set.
predictions = gs_svm.predict(X_test)
print(classification_report(y_test, predictions, digits=4))
#evaluate(y_test, predictions)

Best ngram_range: (1, 2)
最优参数:  {'c_vectorizer__ngram_range': (1, 2)}
最佳性能:  0.9494188226471691
              precision    recall  f1-score   support

         0.0     0.9662    0.9724    0.9693      1088
         1.0     0.9722    0.9660    0.9691      1088

    accuracy                         0.9692      2176
   macro avg     0.9692    0.9692    0.9692      2176
weighted avg     0.9692    0.9692    0.9692      2176



对比上面逻辑回归和SVM交叉验证的结果，逻辑回归的最佳性能更好，因此我们选择最佳性能更好的逻辑回归结果，它的最优参数n-gram是（1，2）<br>
接下来使用逻辑回归和 TF-IDF 向量化（ngram_range 为 (1, 2)），并用 GridSearchCV 调整逻辑回归的超参数。

In [None]:
# Regularisation strengths tried for every penalty/solver combination.
C_VALUES = [1, 10, 20, 30, 40]

# The search space is a list of sub-grids so that only valid combinations
# are fitted:
#   * 'l2' works with both 'lbfgs' and 'saga' and does not use l1_ratio;
#   * 'elasticnet' is only supported by 'saga' and requires a numeric
#     l1_ratio in [0, 1] (the string 'none' is not a valid value).
grid = [
    {
        'lr__penalty': ['l2'],
        'lr__solver': ['lbfgs', 'saga'],
        'lr__C': C_VALUES,
    },
    {
        'lr__penalty': ['elasticnet'],
        'lr__solver': ['saga'],
        'lr__l1_ratio': [0, 0.1],
        'lr__C': C_VALUES,
    },
]

# Pipeline for TfidfVectorizer - with the best ngram_range - and Logistic Regression Classifier
lr_pipe_tfidf_2 = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('lr', LogisticRegression(max_iter=1500, random_state=42)),
])

# Parallelism is set on the search (as in the other grid searches) so the
# candidate fits run concurrently.
logreg_cv_3 = GridSearchCV(lr_pipe_tfidf_2, grid, scoring='f1', cv=2, n_jobs=-1)
logreg_cv_3.fit(X_train, y_train)

# Read the winning settings from the refitted estimator; l1_ratio is
# reported as None when the best model uses the 'l2' penalty.
best_tfidf_lr_params = logreg_cv_3.best_estimator_.get_params()
print('Best l1_ratio:', best_tfidf_lr_params['lr__l1_ratio'])
print('Best C:', best_tfidf_lr_params['lr__C'])
print('Best penalty:', best_tfidf_lr_params['lr__penalty'])
print('Best solver:', best_tfidf_lr_params['lr__solver'])

# Confusion-matrix heatmap of the tuned model on the held-out test set.
predictions = logreg_cv_3.predict(X_test)
evaluate(y_test, predictions)

### Using TF_IDF
【摘抄】We can use TF-IDF vectorization to find the weighted words that occur more frequently in a document, which leads to the creation of the bag-of-words model.<br>
我们可以使用 TF-IDF 向量化来找到文档中出现频率更高的加权词，从而创建词袋模型。