In [1]:
# For sentiment analysis, I will attempt to produce the best sentiment classifier, i.e. one that classifies the comments
# correctly. The primary metric for the classification task is precision; recall and F1 are reported alongside it.
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Load both pre-processed review tables in one statement:
#   df  <- new_df.pkl   (reviews left unlemmatized)
#   df2 <- new_df2.pkl  (reviews lemmatized)
# NOTE(review): unpickling can execute arbitrary code, so both files must come
# from a trusted source.
df, df2 = (pd.read_pickle(pickle_path) for pickle_path in ("new_df.pkl", "new_df2.pkl"))

In [2]:
# Preview the unlemmatized reviews frame loaded from new_df.pkl.
df

Unnamed: 0,review_num,review,rating
0,Review1,always love to visit here there are so many op...,5.0
1,Review2,a nice tourist attraction to visit and enjoy l...,4.0
2,Review3,good place for breakfast especially those who ...,4.0
3,Review4,great fun and interesting food some not to my ...,4.0
4,Review5,so good best butter chicken and naan amazing...,5.0
...,...,...,...
1674,Review1675,we really like this old hawker center unfortu...,3.0
1675,Review1676,it was still under renovation during my visit ...,3.0
1676,Review1677,went to best satay number on boon tat st exce...,5.0
1677,Review1678,is the place for satay at promptly pm the st...,5.0


In [3]:
# Build one document-term matrix (DTM) per corpus from the current df and df2,
# so that both matrices reflect the latest review text. Each vectorizer removes
# English stop words before counting.
# NOTE(review): each vocabulary is fitted on the full 'review' column, i.e.
# before the train/test splits made in later cells - confirm this is intended.
cv, cv2 = (CountVectorizer(stop_words = 'english') for _ in range(2))

data_cv = cv.fit_transform(df['review'])      # DTM of the unlemmatized reviews
data_cv2 = cv2.fit_transform(df2['review'])   # DTM of the lemmatized reviews


In [4]:
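# Only run this code chunk if you want to view the precision, recall and f1 scores when stop words are not removed (this only
# affects the models trained on the DTMs - Naive Bayes, random forest and logistic regression - not TextBlob). If this code
# chunk is to be run, comment out the code in code chunk 3 (the chunk that builds the DTMs with stop_words = 'english').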

# cv = CountVectorizer()
# data_cv = cv.fit_transform(df['review'])

# cv2 = CountVectorizer()
# data_cv2 = cv2.fit_transform(df2['review'])

In [5]:
# This will be the continuation of the pre-processing before conducting the sentiment analysis
# Now that I have the ratings, I want another column that showcases the satisfactory level. In this case, when rating >= 2.5,
# the satisfactory level will be given a value of 1 and 0 otherwise.
def sentiment(x):
    """Map a numeric rating to a binary satisfaction label.

    Returns 1 when ``x >= 2.5`` and 0 otherwise (including when the
    comparison is false because ``x`` is NaN).
    """
    return 1 if x >= 2.5 else 0
# Attach the binary label derived from each rating, then discard the raw
# 'rating' column so that only the label remains in both frames.
df = df.assign(true_satisfactory_level = df['rating'].apply(sentiment)).drop(columns = ['rating'])
df2 = df2.assign(true_satisfactory_level = df2['rating'].apply(sentiment)).drop(columns = ['rating'])

# Snapshots taken before any further columns are added; df_nb and df2_nb supply
# the labels when the classifiers are trained later in the notebook.
df_nb, df2_nb = df.copy(), df2.copy()

df

Unnamed: 0,review_num,review,true_satisfactory_level
0,Review1,always love to visit here there are so many op...,1
1,Review2,a nice tourist attraction to visit and enjoy l...,1
2,Review3,good place for breakfast especially those who ...,1
3,Review4,great fun and interesting food some not to my ...,1
4,Review5,so good best butter chicken and naan amazing...,1
...,...,...,...
1674,Review1675,we really like this old hawker center unfortu...,1
1675,Review1676,it was still under renovation during my visit ...,1
1676,Review1677,went to best satay number on boon tat st exce...,1
1677,Review1678,is the place for satay at promptly pm the st...,1


In [6]:
# Preview df_nb, the copy of the labelled unlemmatized frame used for model training.
df_nb

Unnamed: 0,review_num,review,true_satisfactory_level
0,Review1,always love to visit here there are so many op...,1
1,Review2,a nice tourist attraction to visit and enjoy l...,1
2,Review3,good place for breakfast especially those who ...,1
3,Review4,great fun and interesting food some not to my ...,1
4,Review5,so good best butter chicken and naan amazing...,1
...,...,...,...
1674,Review1675,we really like this old hawker center unfortu...,1
1675,Review1676,it was still under renovation during my visit ...,1
1676,Review1677,went to best satay number on boon tat st exce...,1
1677,Review1678,is the place for satay at promptly pm the st...,1


In [7]:
# Preview df2_nb, the copy of the labelled lemmatized frame used for model training.
df2_nb

Unnamed: 0,review_num,review,true_satisfactory_level
0,Review1,always love to visit here there be so many opt...,1
1,Review2,a nice tourist attraction to visit and enjoy l...,1
2,Review3,good place for breakfast especially those who ...,1
3,Review4,great fun and interest food some not to my tas...,1
4,Review5,so good best butter chicken and naan amazingly...,1
...,...,...,...
1674,Review1675,we really like this old hawker center unfortun...,1
1675,Review1676,it be still under renovation during my visit i...,1
1676,Review1677,go to best satay number on boon tat st excelle...,1
1677,Review1678,be the place for satay at promptly pm the stre...,1


In [8]:
# Sentiment analysis compares two approaches: TextBlob's built-in scores
# (this cell) and statistical models such as Naive Bayes (later cells).
def pol(x):
    """Return TextBlob's polarity score for the text ``x``."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return TextBlob's subjectivity score for the text ``x``."""
    return TextBlob(x).sentiment.subjectivity


# Polarity lies between -1 and 1: -1 is negative sentiment, 1 is positive sentiment.
# Score the unlemmatized reviews first, then the lemmatized ones.
df['polarity'] = df['review'].apply(pol)
df['subjectivity'] = df['review'].apply(sub)

df2['polarity'] = df2['review'].apply(pol)
df2['subjectivity'] = df2['review'].apply(sub)

df

Unnamed: 0,review_num,review,true_satisfactory_level,polarity,subjectivity
0,Review1,always love to visit here there are so many op...,1,0.233333,0.266667
1,Review2,a nice tourist attraction to visit and enjoy l...,1,0.208571,0.520000
2,Review3,good place for breakfast especially those who ...,1,0.266667,0.575000
3,Review4,great fun and interesting food some not to my ...,1,0.602000,0.566000
4,Review5,so good best butter chicken and naan amazing...,1,0.560000,0.690000
...,...,...,...,...,...
1674,Review1675,we really like this old hawker center unfortu...,1,0.050000,0.341667
1675,Review1676,it was still under renovation during my visit ...,1,0.511429,0.674286
1676,Review1677,went to best satay number on boon tat st exce...,1,0.400000,0.655000
1677,Review1678,is the place for satay at promptly pm the st...,1,0.040625,0.531250


In [9]:
# Preview the lemmatized frame with its TextBlob polarity and subjectivity columns.
df2

Unnamed: 0,review_num,review,true_satisfactory_level,polarity,subjectivity
0,Review1,always love to visit here there be so many opt...,1,0.285714,0.357143
1,Review2,a nice tourist attraction to visit and enjoy l...,1,0.208571,0.520000
2,Review3,good place for breakfast especially those who ...,1,0.266667,0.575000
3,Review4,great fun and interest food some not to my tas...,1,0.627500,0.582500
4,Review5,so good best butter chicken and naan amazingly...,1,0.560000,0.690000
...,...,...,...,...,...
1674,Review1675,we really like this old hawker center unfortun...,1,0.060000,0.330000
1675,Review1676,it be still under renovation during my visit i...,1,0.511429,0.674286
1676,Review1677,go to best satay number on boon tat st excelle...,1,0.350000,0.593750
1677,Review1678,be the place for satay at promptly pm the stre...,1,0.040625,0.531250


In [10]:
def predicted(x):
    """Turn a polarity score into a predicted satisfaction label.

    Returns 1 when ``x >= 0`` (so a neutral score of exactly 0 counts as
    satisfied) and 0 otherwise.
    """
    return 1 if x >= 0 else 0
# Derive the TextBlob prediction from the polarity score, then drop the two raw
# score columns so that only the true and predicted labels remain.
df = (
    df.assign(predicted_satisfactory_level = df['polarity'].apply(predicted))
      .drop(columns = ['polarity', 'subjectivity'])
)
df2 = (
    df2.assign(predicted_satisfactory_level = df2['polarity'].apply(predicted))
       .drop(columns = ['polarity', 'subjectivity'])
)

In [11]:
# Preview the unlemmatized frame with the true and TextBlob-predicted labels side by side.
df

Unnamed: 0,review_num,review,true_satisfactory_level,predicted_satisfactory_level
0,Review1,always love to visit here there are so many op...,1,1
1,Review2,a nice tourist attraction to visit and enjoy l...,1,1
2,Review3,good place for breakfast especially those who ...,1,1
3,Review4,great fun and interesting food some not to my ...,1,1
4,Review5,so good best butter chicken and naan amazing...,1,1
...,...,...,...,...
1674,Review1675,we really like this old hawker center unfortu...,1,1
1675,Review1676,it was still under renovation during my visit ...,1,1
1676,Review1677,went to best satay number on boon tat st exce...,1,1
1677,Review1678,is the place for satay at promptly pm the st...,1,1


In [12]:
# Preview the lemmatized frame with the true and TextBlob-predicted labels side by side.
df2

Unnamed: 0,review_num,review,true_satisfactory_level,predicted_satisfactory_level
0,Review1,always love to visit here there be so many opt...,1,1
1,Review2,a nice tourist attraction to visit and enjoy l...,1,1
2,Review3,good place for breakfast especially those who ...,1,1
3,Review4,great fun and interest food some not to my tas...,1,1
4,Review5,so good best butter chicken and naan amazingly...,1,1
...,...,...,...,...
1674,Review1675,we really like this old hawker center unfortun...,1,1
1675,Review1676,it be still under renovation during my visit i...,1,1
1676,Review1677,go to best satay number on boon tat st excelle...,1,1
1677,Review1678,be the place for satay at promptly pm the stre...,1,1


In [13]:
# Metrics obtained when TextBlob is used as the classifier.
# Note: these scores are computed over every review in the frame, whereas the
# trained models in later cells are scored on a 20% test split.
reports = []
for corpus_name, labelled in (("unlemmatized", df), ("lemmatized", df2)):
    actual = labelled['true_satisfactory_level']
    guessed = labelled['predicted_satisfactory_level']
    reports.append("\n".join([
        "Precision score for " + corpus_name + " data is " + str(precision_score(actual, guessed)),
        "Recall score for " + corpus_name + " data is " + str(recall_score(actual, guessed)),
        "F1 score for " + corpus_name + " data is " + str(f1_score(actual, guessed)),
    ]))
# A blank line separates the unlemmatized report from the lemmatized one.
print("\n\n".join(reports))

# TextBlob does quite a good job at classifying the sentiment of the people who
# posted a review on Lau Pa Sat, since it has quite a high f1 score. The
# unlemmatized text scores slightly higher on f1, so not lemmatizing the words
# in the sentences seems to have been a good idea.

# Recorded results
# Precision score for unlemmatized data is 0.9686299615877081
# Recall score for unlemmatized data is 0.9515723270440252
# F1 score for unlemmatized data is 0.9600253807106599

# Precision score for lemmatized data is 0.9688109161793372
# Recall score for lemmatized data is 0.9377358490566038
# F1 score for lemmatized data is 0.953020134228188

Precision score for unlemmatized data is 0.9686299615877081
Recall score for unlemmatized data is 0.9515723270440252
F1 score for unlemmatized data is 0.9600253807106599

Precision score for lemmatized data is 0.9688109161793372
Recall score for lemmatized data is 0.9377358490566038
F1 score for lemmatized data is 0.953020134228188


In [14]:
# Check whether a multinomial Naive Bayes classifier trained on the
# document-term matrices does better on this problem than TextBlob.
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Same recipe for both corpora: the unlemmatized DTM first, then the lemmatized one.
for corpus_name, dtm, labelled in (("unlemmatized", data_cv, df_nb),
                                   ("lemmatized", data_cv2, df2_nb)):
    nb = MultinomialNB()
    # Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        dtm, labelled['true_satisfactory_level'], test_size = 0.2, random_state = 20)
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    for metric_name, metric in (("Precision", precision_score),
                                ("Recall", recall_score),
                                ("F1", f1_score)):
        print(metric_name + " score for " + corpus_name + " data is " + str(metric(y_test, y_pred)))

# Recorded results

# When stop words are removed
# Precision score for unlemmatized data is 0.9492537313432836
# Recall score for unlemmatized data is 0.9968652037617555
# F1 score for unlemmatized data is 0.9724770642201835

# Precision score for lemmatized data is 0.9494047619047619
# Recall score for lemmatized data is 1.0
# F1 score for lemmatized data is 0.9740458015267175

# When stop words are not removed
# Precision score for unlemmatized data is 0.9494047619047619
# Recall score for unlemmatized data is 1.0
# F1 score for unlemmatized data is 0.9740458015267175

# Precision score for lemmatized data is 0.9494047619047619
# Recall score for lemmatized data is 1.0
# F1 score for lemmatized data is 0.9740458015267175

Precision score for unlemmatized data is 0.9492537313432836
Recall score for unlemmatized data is 0.9968652037617555
F1 score for unlemmatized data is 0.9724770642201835
Precision score for lemmatized data is 0.9494047619047619
Recall score for lemmatized data is 1.0
F1 score for lemmatized data is 0.9740458015267175


In [15]:
# Evaluate a random forest classifier on the same document-term matrices and
# the same 80/20 split that the Naive Bayes cell uses.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Same recipe for both corpora: the unlemmatized DTM first, then the lemmatized one.
for corpus_name, dtm, labelled in (("unlemmatized", data_cv, df_nb),
                                   ("lemmatized", data_cv2, df2_nb)):
    # A fresh forest is bound to `rf` for each corpus. The second forest used
    # to be assigned to `nb`, which overwrote the fitted Naive Bayes model and
    # left the new forest unused. random_state makes the forest reproducible
    # across reruns.
    rf = RandomForestClassifier(random_state = 20)
    X_train, X_test, y_train, y_test = train_test_split(
        dtm, labelled['true_satisfactory_level'], test_size = 0.2, random_state = 20)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print("Precision score for " + corpus_name + " data is " + str(precision_score(y_test, y_pred)))
    print("Recall score for " + corpus_name + " data is " + str(recall_score(y_test, y_pred)))
    print("F1 score for " + corpus_name + " data is " + str(f1_score(y_test, y_pred)))

# Recorded results (produced by an earlier run with an unseeded forest, so a
# rerun may differ slightly).
# NOTE(review): a recall of exactly 1.0 together with a precision of 0.9494...
# is what predicting 1 for every test row would give if 319 of 336 test rows
# are positive - confirm with a confusion matrix before drawing conclusions.

# When stop words are removed
# Precision score for unlemmatized data is 0.9494047619047619
# Recall score for unlemmatized data is 1.0
# F1 score for unlemmatized data is 0.9740458015267175

# Precision score for lemmatized data is 0.9494047619047619
# Recall score for lemmatized data is 1.0
# F1 score for lemmatized data is 0.9740458015267175

# When stop words are not removed
# Precision score for unlemmatized data is 0.9494047619047619
# Recall score for unlemmatized data is 1.0
# F1 score for unlemmatized data is 0.9740458015267175

# Precision score for lemmatized data is 0.9494047619047619
# Recall score for lemmatized data is 1.0
# F1 score for lemmatized data is 0.9740458015267175

Precision score for unlemmatized data is 0.9494047619047619
Recall score for unlemmatized data is 1.0
F1 score for unlemmatized data is 0.9740458015267175
Precision score for lemmatized data is 0.9494047619047619
Recall score for lemmatized data is 1.0
F1 score for lemmatized data is 0.9740458015267175


In [16]:
# Evaluate a logistic regression classifier on the same document-term matrices
# and the same 80/20 split that the other models use.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Same recipe for both corpora: the unlemmatized DTM first, then the lemmatized one.
for corpus_name, dtm, labelled in (("unlemmatized", data_cv, df_nb),
                                   ("lemmatized", data_cv2, df2_nb)):
    lr = LogisticRegression()
    # Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        dtm, labelled['true_satisfactory_level'], test_size = 0.2, random_state = 20)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    for metric_name, metric in (("Precision", precision_score),
                                ("Recall", recall_score),
                                ("F1", f1_score)):
        print(metric_name + " score for " + corpus_name + " data is " + str(metric(y_test, y_pred)))

# Recorded results

# When stop words are removed
# Precision score for unlemmatized data is 0.9520958083832335
# Recall score for unlemmatized data is 0.9968652037617555
# F1 score for unlemmatized data is 0.9739663093415007

# Precision score for lemmatized data is 0.9491017964071856
# Recall score for lemmatized data is 0.9937304075235109
# F1 score for lemmatized data is 0.9709035222052068

# When stop words are not removed
# Precision score for unlemmatized data is 0.9579579579579579
# Recall score for unlemmatized data is 1.0
# F1 score for unlemmatized data is 0.9785276073619632

# Precision score for lemmatized data is 0.9520958083832335
# Recall score for lemmatized data is 0.9968652037617555
# F1 score for lemmatized data is 0.9739663093415007

Precision score for unlemmatized data is 0.9520958083832335
Recall score for unlemmatized data is 0.9968652037617555
F1 score for unlemmatized data is 0.9739663093415007
Precision score for lemmatized data is 0.9491017964071856
Recall score for lemmatized data is 0.9937304075235109
F1 score for lemmatized data is 0.9709035222052068


In [17]:
# Conclusion

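# From the data, we can see that TextBlob does quite a decent job in terms of F1 score. However, other models do better on
# some metrics: using RandomForestClassifier, we obtain a higher recall and f1 score than using TextBlob, although TextBlob's
# precision is higher. Two caveats apply: the TextBlob scores are computed on all reviews while the trained models are
# scored on a 20% test split, and a recall of exactly 1.0 with a precision of about 0.949 is consistent with a model that
# predicts "satisfied" for every test review, so the confusion matrix should be checked before preferring the random forest.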

# The f1 score is the harmonic mean of precision and recall, so a high f1 score means that both false positives and false
# negatives are infrequent. This is quite useful when we want to improve the business, as we want to reduce both the false
# positive and the false negative counts, which may affect the business in both tangible and intangible terms.