# Load data

We first start by loading the raw data for Hotel reviews and Amazon reviews.

In [21]:
import pandas as pd

# read data Amazon reviews and hotel reviews
Amazonreviews_df = pd.read_csv("AmazonReviews.csv")
reviews_df = pd.read_csv("Hotel_Reviews.csv")

# append the positive and negative text reviews
reviews_df["review"] = reviews_df["Negative_Review"] + reviews_df["Positive_Review"]

# create the label
reviews_df["is_neagtive_review"] = reviews_df["Reviewer_Score"].apply(lambda x: 1 if x < 5 else 0)

# select only relevant columns for Hotel reviews
reviews_df = reviews_df[["review", "is_neagtive_review"]]

# select only relevant columns for Amazon reviews
Amazonreviews_df = pd.DataFrame(Amazonreviews_df,columns = ['Score','Text'])
Amazonreviews_df.rename(columns = {'Score':'Rating','Text':'Review'},inplace = True)

#function to do sentiment for amazon reviews if the review rating is
# less or equal to 3 it is neagtive, if it bigger than 3 it is positive

def sentimnent_for_amazon(rating):
    if(rating<=3):
        return 0
    else :
        return 1

#appllying the function on the rating colomun
Amazonreviews_df['is_neagtive_review'] = Amazonreviews_df['Rating'].apply(sentimnent_for_amazon)
Amazonreviews_df.drop(['Rating'],axis = 1, inplace = True)
print('Hotel Reviews')
print(reviews_df.head())

print('\n')

print('Amazon Reviews')
print(Amazonreviews_df.head())



Hotel Reviews
                                              review  is_neagtive_review
0   I am so angry that i made this post available...                   1
1  No Negative No real complaints the hotel was g...                   0
2   Rooms are nice but for elderly a bit difficul...                   0
3   My room was dirty and I was afraid to walk ba...                   1
4   You When I booked with your company on line y...                   0


Amazon Reviews
                                              Review  is_neagtive_review
0  I have bought several of the Vitality canned d...                   1
1  Product arrived labeled as Jumbo Salted Peanut...                   0
2  This is a confection that has been around a fe...                   1
3  If you are looking for the secret ingredient i...                   0
4  Great taffy at a great price.  There was a wid...                   1


In [22]:
reviews_df['predicted_review']=reviews_df.apply(lambda x: 1 if 'angry' in x['review'] else 0, axis=1)
Amazonreviews_df['predicted_review']=Amazonreviews_df.apply(lambda x: 1 if 'disappointed' in x['Review'] else 0, axis=1)

In [23]:
print(reviews_df[reviews_df['predicted_review']==reviews_df['is_neagtive_review']].shape)
Amazonreviews_df[Amazonreviews_df['predicted_review']==Amazonreviews_df['is_neagtive_review']].shape

(493387, 3)


(122689, 3)

In [24]:

print(Amazonreviews_df.shape)
print(reviews_df.shape)

(568454, 3)
(515738, 3)


In [25]:
49340/51574

0.9566836002636988

In [26]:
print('Hotel Reviews')
print(reviews_df['is_neagtive_review'].value_counts())

print('\n')

print('Amazon Reviews')
print(Amazonreviews_df['is_neagtive_review'].value_counts())

Hotel Reviews
0    493457
1     22281
Name: is_neagtive_review, dtype: int64


Amazon Reviews
1    443777
0    124677
Name: is_neagtive_review, dtype: int64


In [27]:
print('Hotel Reviews positive and negative ratio')
print(reviews_df['is_neagtive_review'].value_counts(normalize=True))

print('\n')

print('Amazon Reviews positive and negative ratio')
print(Amazonreviews_df['is_neagtive_review'].value_counts(normalize=True))


Hotel Reviews positive and negative ratio
0    0.956798
1    0.043202
Name: is_neagtive_review, dtype: float64


Amazon Reviews positive and negative ratio
1    0.780674
0    0.219326
Name: is_neagtive_review, dtype: float64


In [9]:
reviews_df['new_stupid_model']=0

In [None]:
reviews_df[reviews_df['new_stupid_model']==reviews_df['is_neagtive_review']].shape

# Sample data

In [28]:
reviews_df = reviews_df.sample(frac = 0.1, replace = False, random_state=42)
Amazonreviews_df = Amazonreviews_df.sample(frac = 0.1, replace = False, random_state=42)

Reviews data is sampled in order to speed up computations because data is very large around 500k entries.

# Clean data

If the user doesn't leave any negative feedback comment, this will appear as "No Negative" in our data. This is the same for the positive comments with the default value "No Positive". We have to remove those parts from our texts.

The next step consists in cleaning the text data with various operations:

In [29]:
# remove 'No Negative' or 'No Positive' from text
reviews_df["review"] = reviews_df["review"].apply(lambda x: x.replace("No Negative", "").replace("No Positive", ""))
Amazonreviews_df["Review"] = Amazonreviews_df["Review"].apply(lambda x: x.replace("No Negative", "").replace("No Positive", ""))

In [30]:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Harith\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Harith\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Harith\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Harith\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

To clean textual data, we call our custom 'clean_text' function that performs several transformations:
- lower the text
- tokenize the text (split the text into words) and remove the punctuation
- remove useless words that contain numbers
- remove useless stop words like 'the', 'a' ,'this' etc.
- Part-Of-Speech (POS) tagging: assign a tag to every word to define if it corresponds to a noun, a verb etc. using the WordNet lexical database
- lemmatize the text: transform every word into their root form (e.g. rooms -> room, slept -> sleep)

Now that we have cleaned our data, we can do some feature engineering for our modelization part.

In [31]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
reviews_df["review_clean"] = reviews_df["review"].apply(lambda x: clean_text(x))
Amazonreviews_df["review_clean"] = Amazonreviews_df["Review"].apply(lambda x: clean_text(x))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Harith\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Harith\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Feature engineering

We first start by adding sentiment analysis features because we can guess that customers reviews are highly linked to how they felt about their stay at the hotel. We use Vader, which is a part of the NLTK module designed for sentiment analysis. Vader uses a lexicon of words to find which ones are positives or negatives. It also takes into accout the context of the sentences to determine the sentiment scores. For each text, Vader retuns 4 values:
- a neutrality score
- a positivity score
- a negativity score
- an overall score that summarizes the previous scores

We will integrate those 4 values as features in our dataset.

In [32]:
# add sentiment anaylsis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
reviews_df["sentiments"] = reviews_df["review"].apply(lambda x: sid.polarity_scores(x))
reviews_df = pd.concat([reviews_df.drop(['sentiments'], axis=1), reviews_df['sentiments'].apply(pd.Series)], axis=1)

Amazonreviews_df["sentiments"] = Amazonreviews_df["Review"].apply(lambda x: sid.polarity_scores(x))
Amazonreviews_df = pd.concat([Amazonreviews_df.drop(['sentiments'], axis=1), Amazonreviews_df['sentiments'].apply(pd.Series)], axis=1)

Next, we add some simple metrics for every text:
- number of characters in the text
- number of words in the text

In [33]:
# add number of characters column
reviews_df["nb_chars"] = reviews_df["review"].apply(lambda x: len(x))
Amazonreviews_df["nb_chars"] = Amazonreviews_df["Review"].apply(lambda x: len(x))

# add number of words column
reviews_df["nb_words"] = reviews_df["review"].apply(lambda x: len(x.split(" ")))
Amazonreviews_df["nb_words"] = Amazonreviews_df["Review"].apply(lambda x: len(x.split(" ")))

In [34]:
#Hotel Reviews
# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

Hotel_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews_df["review_clean"].apply(lambda x: x.split(" ")))]

# train a Doc2Vec model with our text data
model = Doc2Vec(Hotel_documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector data
Hotel_doc2vec_df = reviews_df["review_clean"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
Hotel_doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in Hotel_doc2vec_df.columns]
reviews_df = pd.concat([reviews_df, Hotel_doc2vec_df], axis=1)

The next step consist in extracting vector representations for every review. The module Gensim creates a numerical vector representation of every word in the corpus by using the contexts in which they appear (Word2Vec). This is performed using shallow neural networks. What's interesting is that similar words will have similar representation vectors.

Each text can also be transformed into numerical vectors using the word vectors (Doc2Vec). Same texts will also have similar representations and that is why we can use those vectors as training features.

We first have to train a Doc2Vec model by feeding in our text data. By applying this model on our reviews, we can get those representation vectors.

In [35]:
#Amazon Reviews 
# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

Amazon_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(Amazonreviews_df["review_clean"].apply(lambda x: x.split(" ")))]

# train a Doc2Vec model with our text data
model = Doc2Vec(Amazon_documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector data
Amazon_doc2vec_df = Amazonreviews_df["review_clean"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
Amazon_doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in Amazon_doc2vec_df.columns]
Amazonreviews_df = pd.concat([Amazonreviews_df, Amazon_doc2vec_df], axis=1)

In [63]:
reviews_df.head()

Unnamed: 0,review,is_neagtive_review,predicted_review,review_clean,neg,neu,pos,compound,nb_chars,nb_words,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4
488440,Would have appreciated a shop in the hotel th...,0,0,would appreciate shop hotel sell drinking wate...,0.049,0.617,0.334,0.9924,599,113,-0.197011,0.223198,-0.294666,0.063046,-0.393775
274649,No tissue paper box was present at the room,0,0,tissue paper box present room,0.216,0.784,0.0,-0.296,44,10,-0.038253,0.15103,0.091067,-0.079733,-0.01262
374688,Pillows Nice welcoming and service,0,0,pillow nice welcome service,0.0,0.345,0.655,0.6908,36,7,0.024773,0.067819,0.078592,-0.100598,0.012816
404352,Everything including the nice upgrade The Hot...,0,0,everything include nice upgrade hotel revamp s...,0.0,0.621,0.379,0.9153,155,27,0.049246,0.202737,-0.032925,-0.078608,0.031952
451596,Lovely hotel v welcoming staff,0,0,lovely hotel welcome staff,0.0,0.23,0.77,0.7717,32,7,-0.064352,0.148502,0.015996,-0.077464,-0.042423


In [117]:
Amazonreviews_df.head()

Unnamed: 0,Review,is_neagtive_review,predicted_review,review_clean,neg,neu,pos,compound,nb_chars,nb_words,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4
165256,Having tried a couple of other brands of glute...,1,0,tried couple brand gluten-free sandwich cooky ...,0.0,0.768,0.232,0.9684,485,87,0.255631,0.543345,0.316511,0.141402,-0.03142
231465,My cat loves these treats. If ever I can't fin...,1,0,cat love treat ever can't find house pop top b...,0.089,0.766,0.144,0.792,490,99,-0.078403,0.205094,0.073949,-0.509898,-0.354058
427827,A little less than I expected. It tends to ha...,0,0,little less expected tends muddy taste expect ...,0.0,0.88,0.12,0.4588,136,29,0.120996,0.146771,0.109834,-0.118422,0.041968
433954,"First there was Frosted Mini-Wheats, in origin...",0,0,first frost mini-wheats original size frost mi...,0.009,0.827,0.163,0.9923,1631,294,-0.586294,0.190088,1.158856,-0.142448,-0.280257
70260,and I want to congratulate the graphic artist ...,1,0,want congratulate graphic artist put entire pr...,0.089,0.719,0.191,0.9421,649,127,-0.073308,0.135404,0.335084,-0.43005,0.216708


# Exploratory data analysis


In order to have a better understanding of our data, let's explore it a little:

In [64]:
# show is_bad_review distribution
print(reviews_df["is_neagtive_review"].value_counts())
Amazonreviews_df["is_neagtive_review"].value_counts()

0    0.956761
1    0.043239
Name: is_neagtive_review, dtype: float64


1    0.78276
0    0.21724
Name: is_neagtive_review, dtype: float64

In [None]:
# wordcloud function

from wordcloud import WordCloud
import matplotlib.pyplot as plt

def show_wordcloud(data, title = None):
    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 200,
        max_font_size = 40, 
        scale = 3,
        random_state = 42
    ).generate(str(data))

    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud)
    plt.show()
    
# print wordcloud
show_wordcloud(reviews_df["review"])

Most of the words are indeed related to the hotels: room, staff, breakfast, etc. Some words are more related to the customer experience with the hotel stay: perfect, loved, expensive, dislike, etc.

In [None]:
# highest positive sentiment reviews (with more than 5 words)
reviews_df[reviews_df["nb_words"] >= 5].sort_values("pos", ascending = False)[["review", "pos"]].head(10)

The most positive reviews indeed correspond to some good feedbacks.

In [None]:
# lowest negative sentiment reviews (with more than 5 words)
reviews_df[reviews_df["nb_words"] >= 5].sort_values("neg", ascending = False)[["review", "neg"]].head(10)

Some errors can be found among the most negative reviews: Vader sometimes interpret 'no' or 'nothing' as negative words whereas they are sometimes used to say that there were no problems with the hotel. Fortunately, most of the reviews are indeed bad ones.

In [None]:
# plot sentiment distribution for positive and negative reviews

import seaborn as sns

for x in [0, 1]:
    subset = reviews_df[reviews_df['is_neagtive_review'] == x]
    
    # Draw the density plot
    if x == 0:
        label = "Good reviews"
    else:
        label = "Bad reviews"
    sns.distplot(subset['compound'], hist = False, label = label)

The above graph shows the distribution of the reviews sentiments among good reviews and bad ones. We can see that good reviews are for most of them considered as very positive by Vader. On the contrary, bad reviews tend to have lower compound sentiment scores.

This shows us that previously computed sentiment features will be very important in our modelling part.

# Modelling reviewer_score

We first choose which features we want to use to train our model. Then we split our data into two parts:
- one to train our model
- one to assess its performances

We will next use a Random Forest (RF) classifier for our predictions.

In [111]:
# feature selection
label = "is_neagtive_review"
Hotel_ignore_cols = [label, "review", "review_clean"]
Amazon_ignore_cols = [label, "Review", "review_clean"]
Hotel_features = [c for c in reviews_df.columns if c not in Hotel_ignore_cols]
Amazon_features = [c for c in Amazonreviews_df.columns if c not in Amazon_ignore_cols]

# split the data into train and test
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_trainamazon, X_testamazon, y_trainamazon, y_testamazon = train_test_split(Amazonreviews_df[Amazon_features], Amazonreviews_df[label], test_size = 0.2, random_state = 42)
X_trainhotel, X_testhotel, y_trainhotel, y_testhotel = train_test_split(reviews_df[Hotel_features], reviews_df[label], test_size = 0.40, random_state = 42)

In [112]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
# Q: fit
clf.fit(X_trainhotel ,y_trainhotel)
# Q: predict
y_pred = clf.predict(X_testamazon)
print(classification_report(y_pred, y_testamazon))

              precision    recall  f1-score   support

           0       0.99      0.22      0.35     11340
           1       0.00      0.31      0.00        29

    accuracy                           0.22     11369
   macro avg       0.50      0.26      0.18     11369
weighted avg       0.99      0.22      0.35     11369



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [113]:
from sklearn.naive_bayes import GaussianNB 
#Train using GaussianNB classifier 
clf = GaussianNB()
# Q: fit
clf.fit(X_trainhotel, y_trainhotel)
# Q: predict
y_pred=clf.predict(X_testamazon)
print(classification_report(y_pred, y_testamazon))


              precision    recall  f1-score   support

           0       0.60      0.18      0.27      8337
           1       0.23      0.67      0.34      3032

    accuracy                           0.31     11369
   macro avg       0.41      0.42      0.31     11369
weighted avg       0.50      0.31      0.29     11369



In [114]:
# train a random forest classifier
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf.fit(X_trainhotel, y_trainhotel)
y_pred=rf.predict(X_testamazon)
print(classification_report(y_pred, y_testamazon))
# show feature importance
feature_importances_df = pd.DataFrame({"feature": Amazon_features, "importance": rf.feature_importances_}).sort_values("importance", ascending = False)
feature_importances_df.head(20)

              precision    recall  f1-score   support

           0       0.98      0.21      0.35     11315
           1       0.00      0.17      0.00        54

    accuracy                           0.21     11369
   macro avg       0.49      0.19      0.18     11369
weighted avg       0.98      0.21      0.35     11369



Unnamed: 0,feature,importance
4,compound,0.13216
7,doc2vec_vector_0,0.121658
9,doc2vec_vector_2,0.09994
8,doc2vec_vector_1,0.091996
10,doc2vec_vector_3,0.09125
11,doc2vec_vector_4,0.090977
5,nb_chars,0.082761
1,neg,0.076208
3,pos,0.073111
2,neu,0.072171


The most important features are indeed the ones that come from the previous sentiment analysis. The vector representations of the texts also have a lot of importance in our training. Some words appear to have a fairly good importance as well.

In [124]:
# train a Support Vector Machine classifier

from sklearn import svm

clf = svm.SVC()
y_pred = clf.fit(X_trainhotel, y_trainhotel)
print(classification_report(y_pred, y_testamazon))




SVC()


In [121]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=5)

# Q: fit
clf.fit(X_trainhotel ,y_trainhotel)
# Q: predict
y_pred = clf.predict(X_testamazon)
print(classification_report(y_pred, y_testamazon))


              precision    recall  f1-score   support

           0       0.99      0.22      0.35     11330
           1       0.00      0.59      0.01        39

    accuracy                           0.22     11369
   macro avg       0.50      0.40      0.18     11369
weighted avg       0.99      0.22      0.35     11369

