In [1]:
# Libraries 

import pandas as pd
import numpy as np
import csv
import os
import time
import pycountry as pcty
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import re
import torch
import string
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# ***** Load the cached vader_dataset instead of re-running the code that builds it *****
filename = 'vader_dataset.csv'

# Fail fast with an actionable message: silently skipping the load would leave
# `vader_dataset` undefined and surface later as an unrelated NameError.
if not os.path.exists(filename):
    raise FileNotFoundError(
        f"'{filename}' was not found in {os.getcwd()!r}; "
        "regenerate it before running the rest of this notebook."
    )

vader_dataset = pd.read_csv(filename)

In [5]:
# Preview five randomly chosen rows of the loaded dataset
vader_dataset.sample(n=5)

Unnamed: 0,id,vader_neg,vader_neu,vader_pos,vader_compound,hotelname,comment_count,rating,rating_word,location_rating,...,contributes,helpfulvotes_guest,guest_rating_bubble,review,dateofStay,guestCountry,hotelname_encode,dateofStay_encode,guestCountry_encode,polarity_vader
60428,60943,0.089,0.636,0.275,0.962,Pullman Kuala Lumpur Bangsar,1716,4.0,1,3.5,...,98.0,76.0,3.0,luxury reasonable price stayed first week sept...,September 2014,India,16,92014,61,positive
19069,19092,0.0,0.597,0.403,0.926,Grand Hyatt Kuala Lumpur,5462,4.5,2,5.0,...,18.0,16.0,5.0,great hotel stayed girlfriend arrived late joh...,September 2013,Not Mentioned,7,92013,105,positive
21391,21416,0.0,0.548,0.452,0.9829,Shangri-La Kuala Lumpur,8548,4.5,2,4.5,...,623.0,204.0,5.0,best buffet breakfast stayed premier selection...,May 2017,Australia,20,52017,9,positive
33604,33635,0.0,0.464,0.536,0.9911,"Mandarin Oriental, Kuala Lumpur",7269,4.5,2,5.0,...,10.0,5.0,5.0,special experience husband recently stayed hot...,March 2017,United Kingdom,14,32017,142,positive
3321,3324,0.0,0.717,0.283,0.9776,Hilton Kuala Lumpur,6421,4.5,2,4.5,...,23.0,4.0,5.0,always home away home th stay hilton last four...,July 2019,Indonesia,9,72019,62,positive


In [6]:
# Narrow the dataset to the hotel name, review text and VADER polarity label
relevant_columns = ['hotelname', 'review', 'polarity_vader']
data = vader_dataset[relevant_columns]


In [10]:
# Spot-check ten randomly chosen rows of the reduced frame
data.sample(n=10)

Unnamed: 0,hotelname,review,polarity_vader
2418,Hilton Kuala Lumpur,perfect hotel kuala lumpur excellent service e...,positive
9539,8 Kia Peng Suites,best experience kl long term say family person...,positive
58950,Royale Chulan Kuala Lumpur,superb hotel fault stayed royal chulan day chr...,positive
12486,Le Meridien Kuala Lumpur,comfy stay hubby stayed night sponsored actual...,positive
21858,Shangri-La Kuala Lumpur,year vacation warm welcome staff excellent ser...,positive
18420,Grand Hyatt Kuala Lumpur,fabulous hotel superb location recently stayed...,positive
3011,Hilton Kuala Lumpur,beautiful room great breakfast hotel accessibl...,positive
18620,Grand Hyatt Kuala Lumpur,hyatt kuala lumpur top hotel room amazing view...,positive
42347,The Westin Kuala Lumpur,avoid possible stayed westin kl oct checked fr...,negative
29869,"InterContinental Kuala Lumpur, an IHG Hotel",good location friendly staff first staff kl go...,positive


In [11]:
# Show every row whose VADER polarity label is 'negative'
vader_dataset.loc[vader_dataset['polarity_vader'].eq('negative')]

Unnamed: 0,id,vader_neg,vader_neu,vader_pos,vader_compound,hotelname,comment_count,rating,rating_word,location_rating,...,contributes,helpfulvotes_guest,guest_rating_bubble,review,dateofStay,guestCountry,hotelname_encode,dateofStay_encode,guestCountry_encode,polarity_vader
40,41,0.230,0.672,0.098,-0.9166,EQ Kuala Lumpur,575,5.0,2,5.0,...,1.0,0.0,5.0,unforgetable delightful stay eq hotel booked n...,November 2022,Not Mentioned,5,112022,105,negative
203,204,0.193,0.635,0.173,-0.2263,EQ Kuala Lumpur,575,5.0,2,5.0,...,64.0,64.0,1.0,insensitive receptionist seemingly oblivious f...,December 2021,Malaysia,5,122021,84,negative
265,266,0.079,0.849,0.072,-0.1513,EQ Kuala Lumpur,575,5.0,2,5.0,...,10110.0,240.0,5.0,perfection equatorial one older name kuala lum...,February 2021,Malaysia,5,22021,84,negative
598,599,0.169,0.700,0.131,-0.3612,Lanson Place Bukit Ceylon Kuala Lumpur,1064,5.0,2,5.0,...,1.0,3.0,2.0,rude unprofessional hospitality never experien...,May 2022,Japan,12,52022,68,negative
751,752,0.144,0.766,0.090,-0.3637,Lanson Place Bukit Ceylon Kuala Lumpur,1064,5.0,2,5.0,...,87.0,85.0,1.0,high rise building noisy initially gave star s...,January 2020,United Kingdom,12,12020,142,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63535,64052,0.189,0.693,0.118,-0.4939,E&O Residences Kuala Lumpur,946,4.0,1,4.5,...,1.0,0.0,3.0,location great difficult evaluate hotel pick h...,September 2017,Not Mentioned,4,92017,105,negative
63543,64060,0.221,0.611,0.168,-0.7648,E&O Residences Kuala Lumpur,946,4.0,1,4.5,...,300.0,148.0,2.0,worn place avoid dont understand excellent rat...,August 2017,Belgium,4,82017,17,negative
63546,64063,0.430,0.506,0.064,-0.9423,E&O Residences Kuala Lumpur,946,4.0,1,4.5,...,0.0,0.0,1.0,bad experience hotel side pool side nice cozy ...,August 2017,Not Mentioned,4,82017,105,negative
63547,64064,0.102,0.855,0.043,-0.4215,E&O Residences Kuala Lumpur,946,4.0,1,4.5,...,0.0,0.0,1.0,الفندق غير جيد الفطور اقل من عادي طابور عند ال...,August 2017,Not Mentioned,4,82017,105,negative


In [12]:
# Collect each distinct hotel name once, in order of first appearance
hotel_names = pd.unique(data['hotelname'])

In [13]:
# Display the array of unique hotel names
hotel_names

array(['EQ Kuala Lumpur', 'Lanson Place Bukit Ceylon Kuala Lumpur',
       'Banyan Tree Kuala Lumpur', 'Hilton Kuala Lumpur',
       'Four Seasons Hotel Kuala Lumpur', 'The RuMa Hotel and Residences',
       '8 Kia Peng Suites', 'Le Meridien Kuala Lumpur',
       'Sofitel Kuala Lumpur Damansara', 'Grand Hyatt Kuala Lumpur',
       'W Hotels Kuala Lumpur', 'Shangri-La Kuala Lumpur',
       'Pavilion Hotel Kuala Lumpur Managed by Banyan Tree Kuala Lumpur',
       'InterContinental Kuala Lumpur, an IHG Hotel',
       'Mandarin Oriental, Kuala Lumpur', 'The St. Regis Kuala Lumpur',
       'Ascott Kuala Lumpur', 'Alila Bangsar Kuala Lumpur',
       'The Ritz-Carlton, Kuala Lumpur', 'The Westin Kuala Lumpur',
       'JW Marriott Hotel Kuala Lumpur',
       'The Majestic Hotel Kuala Lumpur', 'The Face Suites',
       'Pullman Kuala Lumpur City Centre Hotel & Residences',
       'The Gardens-A St Giles Signature Hotel & Residence',
       'Renaissance Kuala Lumpur Hotel & Convention Centre',
 

### Summed TF-IDF scores of the top 5 feature words in negative reviews for each hotel

In [16]:
# Get only the negative reviews
negative_reviews = vader_dataset[vader_dataset['polarity_vader'] == 'negative']

# Group the negative reviews by hotelname
grouped_reviews = negative_reviews.groupby('hotelname')

# Create a TfidfVectorizer object with English stopwords
vectorizer = TfidfVectorizer(stop_words='english')

# Number of highest-scoring feature words reported per hotel
TOP_N_PER_HOTEL = 5

# Loop through each hotel and report its top feature words
for hotel, group in grouped_reviews:
    # Refit on this hotel's negative reviews only, so the vocabulary and IDF
    # weights are specific to the hotel; fit_transform fits and scores the
    # reviews in a single pass.
    tfidf_scores = vectorizer.fit_transform(group['review'])
    # Feature names line up with the columns of the score matrix
    feature_names = vectorizer.get_feature_names_out()
    # Sum each word's TF-IDF score over the hotel's reviews directly on the
    # sparse matrix; the totals do not need a dense word-by-review DataFrame.
    term_totals = pd.Series(
        np.asarray(tfidf_scores.sum(axis=0)).ravel(),
        index=feature_names,
    )
    # Keep the words with the largest summed scores for the hotel
    top_features = term_totals.nlargest(TOP_N_PER_HOTEL)
    print(f"Hotel: {hotel}")
    print(top_features)
    print('\n')


Hotel: 8 Kia Peng Suites
refund    0.536141
money     0.402106
pm        0.394535
small     0.338813
quite     0.313986
dtype: float64


Hotel: Alila Bangsar Kuala Lumpur
room       2.439821
hotel      1.830454
check      1.386901
pm         1.307858
service    1.147229
dtype: float64


Hotel: Ascott Kuala Lumpur
room       3.903782
service    2.911682
hotel      2.487037
time       2.210659
bad        2.151975
dtype: float64


Hotel: Banyan Tree Kuala Lumpur
hotel     1.509718
room      1.488126
tea       1.179182
tree      1.108069
banyan    1.106643
dtype: float64


Hotel: E&O Residences Kuala Lumpur
room         3.425365
bad          2.249203
hotel        1.936291
apartment    1.784479
towel        1.512590
dtype: float64


Hotel: EQ Kuala Lumpur
hotel     0.556802
blue      0.379825
booked    0.379825
sky       0.379825
kl        0.317279
dtype: float64


Hotel: Four Seasons Hotel Kuala Lumpur
room       2.178194
hotel      1.925075
season     1.194037
day        1.177019
service 

### Summed TF-IDF scores of the top 10 feature words across all negative reviews

In [18]:
# Learn the vocabulary and IDF weights from every negative review and score
# the same reviews in one pass (equivalent to fit followed by transform)
tfidf_scores = vectorizer.fit_transform(negative_reviews['review'])
feature_names = vectorizer.get_feature_names_out()

# Dense table of TF-IDF scores: one row per feature word, one column per review id
tfidf_df = pd.DataFrame(
    tfidf_scores.T.todense(),
    index=feature_names,
    columns=negative_reviews['id'],
)

# Total each word's score over all negative reviews and keep the 10 largest
word_totals = tfidf_df.sum(axis=1)
top_features = word_totals.nlargest(10)

print(top_features)
print('\n')

room         143.729324
hotel        130.292333
service       79.574239
staff         65.854857
stay          59.798086
time          57.429851
check         52.704619
bad           52.137996
breakfast     51.232002
star          50.667537
dtype: float64


