Import Libraries

In [None]:
import time

import joblib
import nltk as nltk
import pandas as pd
import scipy.sparse as sp
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Fetch every NLTK resource this notebook reads (a no-op when already cached):
#   - vader_lexicon: needed by SentimentIntensityAnalyzer
#   - stopwords:     needed by stopwords.words("english")
nltk.download('vader_lexicon')
nltk.download('stopwords')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\d-kin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

1. Setup

In [47]:
# Shared VADER analyzer instance; calc_sentiment looks this module-level name up on each call.
vader_sentiment = SentimentIntensityAnalyzer()


In [48]:
# A review cell can hold one of three things:
#   * the placeholder "No Negative"  -> nothing to score, return 0.0
#   * the placeholder "No Positive"  -> nothing to score, return 0.0
#   * free text                      -> score it with the VADER analyzer
# Anything that is not a string (e.g. None, or NaN for a missing cell) is
# treated like the placeholders instead of being passed to the analyzer.
def calc_sentiment(review):
    """Return the compound sentiment score for a single review.

    Parameters
    ----------
    review : str
        Review text, or one of the placeholder strings "No Negative" /
        "No Positive". Non-string values are accepted and scored as neutral.

    Returns
    -------
    float
        0.0 for placeholders and non-string input; otherwise the "compound"
        entry of ``vader_sentiment.polarity_scores(review)``.
    """
    # Missing values would make the analyzer raise, so short-circuit them.
    if not isinstance(review, str):
        return 0.0
    # Placeholders carry no opinion; always return a float so the resulting
    # column has a single numeric type.
    if review in ("No Negative", "No Positive"):
        return 0.0
    return vader_sentiment.polarity_scores(review)["compound"]


2. Load Data

In [49]:
# Load the filtered hotel reviews CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("../../data/Hotel_Reviews_Filtered.csv")


3. Define stop-word removal routine

In [50]:
# Stop-word removal can be slow on a large amount of text.
# Ryan Han (ryanxjhan on Kaggle) compares several removal approaches in
# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal
# and the approach he recommends is the one used here.
start = time.time()  # read again by the timing cell that runs after the clean-up
cache = set(stopwords.words("english"))  # built once and reused for every review


def remove_stopwords(review):
    """Return `review` with every token that appears in `cache` removed.

    The text is split on whitespace, tokens found in the stop-word set are
    dropped (exact, case-sensitive match), and the survivors are re-joined
    with single spaces.
    """
    kept_tokens = (token for token in review.split() if token not in cache)
    return " ".join(kept_tokens)


4. Clean text

In [51]:
# Strip stop words from the negative and the positive review text
df["Negative_Review"] = df["Negative_Review"].apply(remove_stopwords)
df["Positive_Review"] = df["Positive_Review"].apply(remove_stopwords)


In [52]:
# Stop the timer that was started before the stop-word set was built and report it
end = time.time()
print(f"Removing stop words took {round(end - start, 2)} seconds")


Removing stop words took 3.25 seconds


5. Compute sentiment scores

In [53]:
# Score both review columns with calc_sentiment and store the results as new columns
print("Calculating sentiment columns for both positive and negative reviews")
start = time.time()
df["Negative_Sentiment"] = df["Negative_Review"].apply(calc_sentiment)
df["Positive_Sentiment"] = df["Positive_Review"].apply(calc_sentiment)
end = time.time()
print(f"Calculating sentiment took {round(end - start, 2)} seconds")


Calculating sentiment columns for both positive and negative reviews
Calculating sentiment took 140.06 seconds


In [54]:
# Preview each sentiment column from lowest to highest score.
# The frame is left sorted by Positive_Sentiment, so later cells see that row order.
df = df.sort_values(by="Negative_Sentiment")
print(df[["Negative_Review", "Negative_Sentiment"]])
df = df.sort_values(by="Positive_Sentiment")
print(df[["Positive_Review", "Positive_Sentiment"]])


                                          Negative_Review  Negative_Sentiment
186584  So bad experience memories I hotel The first n...             -0.9920
129503  First charged twice room booked booking second...             -0.9896
307286  The staff Had bad experience even booking Janu...             -0.9889
201953  Everything DO NOT STAY AT THIS HOTEL I never i...             -0.9886
452092  No WLAN room Incredibly rude restaurant staff ...             -0.9884
...                                                   ...                 ...
138365  Wifi terribly slow I speed test network upload...              0.9938
79215   I find anything hotel first I walked past hote...              0.9938
278506  The property great location There bakery next ...              0.9945
339189  Guys I like hotel I wish return next year Howe...              0.9948
480509  I travel lot far visited countless number hote...              0.9957

[515738 rows x 2 columns]
                                     

6. Reorder columns

In [55]:
# Cosmetic only: put the columns in an order that is easier to browse later.
# Scores and sentiment come first, then the trip tags, then the review text.
column_order = [
    "Hotel_Name",
    "Hotel_Address",
    "Total_Number_of_Reviews",
    "Average_Score",
    "Reviewer_Score",
    "Negative_Sentiment",
    "Positive_Sentiment",
    "Reviewer_Nationality",
    "Leisure_trip",
    "Couple",
    "Solo_traveler",
    "Business_trip",
    "Group",
    "Family_with_young_children",
    "Family_with_older_children",
    "With_a_pet",
    "Negative_Review",
    "Positive_Review",
]
df = df.reindex(columns=column_order)


7. Save final NLP‐ready CSV

In [56]:
# Write the enriched frame to disk without the index column
print("Saving results to Hotel_Reviews_NLP.csv")
df.to_csv("../../data/Hotel_Reviews_NLP.csv", index=False)


Saving results to Hotel_Reviews_NLP.csv


8. Load final NLP‐ready CSV

In [57]:
# Reload the NLP-ready CSV so the modelling cells work from the file on disk
df = pd.read_csv('../../data/Hotel_Reviews_NLP.csv')
# The classification target is built later as y = (Reviewer_Score >= 8).astype(int)


10. Vectorize Text

In [58]:
# Replace missing review text with empty strings so the vectorizer only sees str
pos = df["Positive_Review"].fillna("")
neg = df["Negative_Review"].fillna("")

# One document per row: positive text, a space, then negative text
corpus = pos + " " + neg

# Hash each document into a fixed 20,000-column matrix (no sign flipping, no
# row normalisation), then re-weight with TF-IDF and L2-normalise each row.
# NOTE(review): the IDF weights are fitted on every row, including rows that
# the later train/test split assigns to the test set.
hv = HashingVectorizer(n_features=20_000, alternate_sign=False, norm=None)
X_counts = hv.transform(corpus)
tf = TfidfTransformer(norm='l2')
X_text = tf.fit_transform(X_counts)


11. Assemble Features

In [59]:
# Trip-tag columns to use as features (matched by exact column name)
tag_prefixes = [
    "Leisure_trip",
    "Couple",
    "Solo_traveler",
    "Business_trip",
    "Group",
    "Family_with_young_children",
    "Family_with_older_children",
    "With_a_pet",
]
# Keep only the tags that exist in the frame, in the frame's own column order
top_tags = [column for column in df.columns if column in tag_prefixes]

# Numeric features: both sentiment scores, two hotel-level columns, then the tags
meta_columns = [
    'Negative_Sentiment',
    'Positive_Sentiment',
    'Average_Score',
    'Total_Number_of_Reviews',
] + top_tags
X_meta = df[meta_columns].to_numpy()

# Final design matrix: text features first, numeric features appended on the right
X = sp.hstack((X_text, X_meta))


12. Train & Test model

In [60]:
# Binary target: 1 when Reviewer_Score is 8 or higher, otherwise 0
y = df["Reviewer_Score"].ge(8).astype(int)

# Hold out 20% of the rows, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Train a 100-tree random forest with a fixed seed
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)


13. Evaluate and Save model

In [None]:
# Report per-class precision / recall / F1 on the held-out rows
print(classification_report(y_test, clf.predict(X_test)))

# `tfidf` has to be created here: no earlier cell defines it, so dumping it
# directly raises NameError on a fresh kernel. The text features come from
# `hv` (HashingVectorizer) followed by `tf` (the fitted TfidfTransformer), so
# the two are chained into one transformer; this keeps the saved artifact a
# (text_transformer, classifier) 2-tuple.
# NOTE: `clf` was trained on the text features plus the numeric columns
# appended to them, so a consumer must append those columns before predicting.
tfidf = make_pipeline(hv, tf)
joblib.dump((tfidf, clf), 'hotel_rating_model.pkl')


              precision    recall  f1-score   support

           0       0.76      0.60      0.67     36019
           1       0.81      0.90      0.85     67129

    accuracy                           0.80    103148
   macro avg       0.79      0.75      0.76    103148
weighted avg       0.79      0.80      0.79    103148



['hotel_rating_model.pkl']