In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

In [3]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time
import os
import spacy, nltk, re
from nltk.corpus import stopwords
nltk.download('vader_lexicon')
nltk.download('movie_reviews')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from pycorenlp import StanfordCoreNLP
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob import Blobber

from sklearn import metrics
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, SVMSMOTE, BorderlineSMOTE
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


[nltk_data] Downloading package vader_lexicon to C:\Users\Zhifang
[nltk_data]     Xie\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package movie_reviews to C:\Users\Zhifang
[nltk_data]     Xie\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Zhifang
[nltk_data]     Xie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using TensorFlow backend.


In [5]:
cons_data = pd.read_csv('cons_data.csv', index_col=0)

In [6]:
cons_data.head()

Unnamed: 0,review_number,cons,facility_ind,security_ind,pricing_ind,location_ind,fb_ind,housekeep_ind,frontoff_ind,others,pos_ind
0,0,bedroom very bland,1,0,0,0,0,0,0,0,0
1,0,no soft furnishings,1,0,0,0,0,0,0,0,0
2,0,too little furniture in a big space,1,0,0,0,0,0,0,0,0
3,0,uncomfortable couch area and needs colour or i...,1,0,0,0,0,0,0,0,0
4,0,hotel could do with a residents bar for a rela...,0,0,0,0,1,0,0,0,0


In [7]:
cons_data.shape

(8139, 11)

In [8]:
cons_data.isnull().sum()

review_number    0
cons             1
facility_ind     0
security_ind     0
pricing_ind      0
location_ind     0
fb_ind           0
housekeep_ind    0
frontoff_ind     0
others           0
pos_ind          0
dtype: int64

In [9]:
cons_data.dropna(subset=['cons'], inplace=True)
cons_data.shape

(8138, 11)

In [10]:
cons_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8138 entries, 0 to 8138
Data columns (total 11 columns):
review_number    8138 non-null int64
cons             8138 non-null object
facility_ind     8138 non-null int64
security_ind     8138 non-null int64
pricing_ind      8138 non-null int64
location_ind     8138 non-null int64
fb_ind           8138 non-null int64
housekeep_ind    8138 non-null int64
frontoff_ind     8138 non-null int64
others           8138 non-null object
pos_ind          8138 non-null int64
dtypes: int64(9), object(2)
memory usage: 762.9+ KB


In [11]:
corpus = cons_data.cons
corpus = corpus.str.lower().apply(lambda x: str(x).encode('ascii', errors='ignore').decode())
corpus = corpus.replace(r'http\S+', ' ', regex=True)
corpus = corpus.replace(r'[^A-Za-z]', ' ', regex=True)
corpus = corpus.replace(r' ', np.nan).dropna()
corpus = corpus.astype(str).apply(nltk.word_tokenize)
lemma = nltk.WordNetLemmatizer() 
corpus = corpus.apply(lambda x: ' '.join([lemma.lemmatize(word) for word in x]))  

In [12]:
corpus.shape

(8117,)

In [13]:
df = cons_data[cons_data.index.isin(corpus.index)].copy()
df.head()

Unnamed: 0,review_number,cons,facility_ind,security_ind,pricing_ind,location_ind,fb_ind,housekeep_ind,frontoff_ind,others,pos_ind
0,0,bedroom very bland,1,0,0,0,0,0,0,0,0
1,0,no soft furnishings,1,0,0,0,0,0,0,0,0
2,0,too little furniture in a big space,1,0,0,0,0,0,0,0,0
3,0,uncomfortable couch area and needs colour or i...,1,0,0,0,0,0,0,0,0
4,0,hotel could do with a residents bar for a rela...,0,0,0,0,1,0,0,0,0


In [14]:
df.pos_ind.value_counts()

0    7722
1     395
Name: pos_ind, dtype: int64

In [15]:
df['neg_ind'] = 1 - df.pos_ind

#### Split of Dataset

In [16]:
train, val = train_test_split(df, test_size=0.2, stratify=df.neg_ind, random_state=42)

In [17]:
X_train = train.cons
y_train = train.neg_ind
X_val = val.cons
y_val = val.neg_ind

#### NLTK Sentiment Analysis

In [18]:
sid = SentimentIntensityAnalyzer()
nltk_scores_train = train.cons.map(sid.polarity_scores)

In [19]:
nltk_scores = val.cons.map(sid.polarity_scores)
nltk_ind = nltk_scores.apply(lambda x: 0 if x.get('compound')>0.05 else 1)
print('Validation set f1 score using NLTK sentiment analysis: ', metrics.f1_score(val.neg_ind, nltk_ind))

Validation set f1 score using NLTK sentiment analysis:  0.8497257769652651


In [20]:
print('Confusion matrix: \n', metrics.confusion_matrix(y_val, nltk_ind))
print('Classification report: \n', metrics.classification_report(y_val, nltk_ind))
print('ROC_AUC score: ', metrics.roc_auc_score(y_val, nltk_ind))

Confusion matrix: 
 [[  51   28]
 [ 383 1162]]
Classification report: 
               precision    recall  f1-score   support

           0       0.12      0.65      0.20        79
           1       0.98      0.75      0.85      1545

    accuracy                           0.75      1624
   macro avg       0.55      0.70      0.52      1624
weighted avg       0.93      0.75      0.82      1624

ROC_AUC score:  0.6988365900618573


#### TextBlob Sentiment Analysis

In [35]:
textblob_scores = val.cons.apply(lambda x: TextBlob(x).sentiment)
textblob_ind = textblob_scores.apply(lambda x: 0 if x.polarity>0.05 else 1)
print('Validation set f1 score using TextBlob sentiment analysis: ', metrics.f1_score(val.neg_ind, textblob_ind))

Validation set f1 score using TextBlob sentiment analysis:  0.7983257229832572


In [36]:
print('Confusion matrix: \n', metrics.confusion_matrix(y_val, textblob_ind))
print('Classification report: \n', metrics.classification_report(y_val, textblob_ind))
print('ROC_AUC score: ', metrics.roc_auc_score(y_val, textblob_ind))

Confusion matrix: 
 [[  45   34]
 [ 496 1049]]
Classification report: 
               precision    recall  f1-score   support

           0       0.08      0.57      0.15        79
           1       0.97      0.68      0.80      1545

    accuracy                           0.67      1624
   macro avg       0.53      0.62      0.47      1624
weighted avg       0.93      0.67      0.77      1624

ROC_AUC score:  0.6242923272295275


#### TextBlob (NB analyzer) Sentiment Analysis

In [22]:
tb = Blobber(analyzer=NaiveBayesAnalyzer())

In [44]:
blobbernb_scores = val.cons.apply(lambda x: tb(x).sentiment.classification)
blobbernb_ind = blobbernb_scores.map({'pos':0, 'neg':1})
print('Validation set f1 score using Textblob (NB Analyzer) Sentiment Analysis: ', metrics.f1_score(val.neg_ind, blobbernb_ind))

Validation set f1 score using Textblob (NB Analyzer) Sentiment Analysis:  0.5132075471698112


In [24]:
print('Confusion matrix: \n', metrics.confusion_matrix(y_val, blobbernb_ind))
print('Classification report: \n', metrics.classification_report(y_val, blobbernb_ind))
print('ROC_AUC score: ', metrics.roc_auc_score(y_val, blobbernb_ind))

Confusion matrix: 
 [[  48   31]
 [1001  544]]
Classification report: 
               precision    recall  f1-score   support

           0       0.05      0.61      0.09        79
           1       0.95      0.35      0.51      1545

    accuracy                           0.36      1624
   macro avg       0.50      0.48      0.30      1624
weighted avg       0.90      0.36      0.49      1624

ROC_AUC score:  0.4798492482897055


#### Logistic Regression

In [25]:
rus = RandomUnderSampler()
ros = RandomOverSampler()
smote = SMOTE(kind='regular')

In [26]:
tfidf_vect = TfidfVectorizer()
lr = LogisticRegression(max_iter=1000)

In [27]:
pipe = Pipeline([('tfidf', tfidf_vect),('rus', rus),('lr', lr)])
cv_score = cross_val_score(pipe, X_train, y_train, scoring='f1', cv=5).mean()
print('Cross validation score for RandomUnderSampling: ', cv_score)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)
val_score = metrics.f1_score(y_val, y_pred)
print('Validation set f1-score for RandomUnderSampling: ', val_score)

Cross validation score for RandomUnderSampling:  0.9562918945902323
Validation set f1-score for RandomUnderSampling:  0.9577371048252913


In [28]:
pipe = Pipeline([('tfidf', tfidf_vect),('ros', ros),('lr', lr)])
cv_score = cross_val_score(pipe, X_train, y_train, scoring='f1', cv=5).mean()
print('Cross validation score for RandomOverSampling: ', cv_score)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)
val_score = metrics.f1_score(y_val, y_pred)
print('Validation set f1-score for RandomOverSampling: ', val_score)

Cross validation score for RandomOverSampling:  0.9717413992287985
Validation set f1-score for RandomOverSampling:  0.9676994067237971


In [29]:
pipe = Pipeline([('tfidf', tfidf_vect),('smote', smote),('lr', lr)])
cv_score = cross_val_score(pipe, X_train, y_train, scoring='f1', cv=5).mean()
print('Cross validation score for SMOTE: ', cv_score)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_val)
val_score = metrics.f1_score(y_val, y_pred)
print('Validation set f1-score for SMOTE: ', val_score)

Cross validation score for SMOTE:  0.9726273081226958
Validation set f1-score for SMOTE:  0.9707525468287873


In [30]:
ngram_range = [(1,1),(1,2),(1,3),(2,2),(2,3)]
use_idf = [True, False]
C = [0.01,0.1,1,10,100]
b_model = None
b_score = 0

for (nr, ui, c) in [(nr, ui, c) for nr in ngram_range for ui in use_idf for c in C]:
    tfidf_vect = TfidfVectorizer(ngram_range=nr, use_idf=ui)
    smote = SMOTE(sampling_strategy=1, kind='regular')
    lr = LogisticRegression(C=c, max_iter=1000)
    pipe = Pipeline([('tfidf', tfidf_vect),('smote', smote),('lr', lr)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)
    val_score = metrics.f1_score(y_val, y_pred)
    if val_score > b_score:
        b_score = val_score
        b_model = pipe
        b_pred = y_pred

In [31]:
print('Validation set f1 score from the best model: ', b_score)
print('Best parameters: ')
print('ngram_range:',b_model.get_params()['tfidf__ngram_range'])
print('use_idf:', b_model.get_params()['tfidf__use_idf'])
print('C:',b_model.get_params()['lr__C'])

Validation set f1 score from the best model:  0.9812297734627832
Best parameters: 
ngram_range: (1, 3)
use_idf: False
C: 100


In [32]:
print('Confusion matrix: \n', metrics.confusion_matrix(y_val, b_pred))
print('Classification report: \n', metrics.classification_report(y_val, b_pred)) 
print('ROC_AUC score: ', metrics.roc_auc_score(y_val, b_pred))

Confusion matrix: 
 [[  50   29]
 [  29 1516]]
Classification report: 
               precision    recall  f1-score   support

           0       0.63      0.63      0.63        79
           1       0.98      0.98      0.98      1545

    accuracy                           0.96      1624
   macro avg       0.81      0.81      0.81      1624
weighted avg       0.96      0.96      0.96      1624

ROC_AUC score:  0.8070705829339232


#### Ensemble

In [33]:
sentiment_ind = pd.DataFrame(np.column_stack((b_pred, nltk_ind, textblob_ind)),columns=['LR model', 'NLTK', 'TextBlob']).mode(axis=1)

In [35]:
print('Validation set f1 score from ensemble model: ', metrics.f1_score(y_val, sentiment_ind))

Validation set f1 score from ensemble model:  0.9026178010471204


In [36]:
print('Confusion matrix: \n', metrics.confusion_matrix(y_val, sentiment_ind))
print('Classification report: \n', metrics.classification_report(y_val, sentiment_ind))
print('ROC_AUC score: ', metrics.roc_auc_score(y_val, sentiment_ind))

Confusion matrix: 
 [[  52   27]
 [ 252 1293]]
Classification report: 
               precision    recall  f1-score   support

           0       0.17      0.66      0.27        79
           1       0.98      0.84      0.90      1545

    accuracy                           0.83      1624
   macro avg       0.58      0.75      0.59      1624
weighted avg       0.94      0.83      0.87      1624

ROC_AUC score:  0.7475605259923804


In [39]:
df.drop('pos_ind', axis=1, inplace=True)
df.to_csv('data_sentiment_analysis.csv')