In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import re
import string
import nltk
from tqdm import trange
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import tokenize
from nltk.probability import FreqDist
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Zipped TripAdvisor hotel-review CSV; pandas reads the archive directly.
DATA_PATH = '/content/sample_data/tripadvisor_hotel_reviews.csv.zip'
df = pd.read_csv(DATA_PATH)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


* No null values
* `Review` is an object (string) column and `Rating` is an int64 column
* 2 columns

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Rating
count,20491.0
mean,3.952223
std,1.23303
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0




*   Rating varies from 1 to 5
*   At least 25% of reviews are rated 3 or lower (25th percentile = 3), and at least 25% are rated 5 (75th percentile = 5)



In [None]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [None]:
# Number of reviews at each rating level, most common first.
df['Rating'].value_counts()

Unnamed: 0_level_0,count
Rating,Unnamed: 1_level_1
5,9054
4,6039
3,2184
2,1793
1,1421


So we can see that around 74% of reviews are rated 4 or higher.

In [None]:
# Count reviews per rating once and reuse it for both the labels and the slice sizes.
rating_counts = df['Rating'].value_counts()
fig = go.Figure(data=go.Pie(labels=rating_counts.index, values=rating_counts.values))
fig.update_layout(title='Rating Distribution')

In [None]:
import spacy

# Name of the spaCy pipeline that clean_text runs every review through.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

def clean_text(text):
  """Return `text` reduced to space-joined, lower-cased lemmas.

  The text is run through the module-level spaCy pipeline `nlp`. Tokens that
  spaCy flags as stop words or punctuation, or that are not purely alphabetic,
  are dropped; the lemma of every remaining token is lower-cased and kept in
  its original order.
  """
  kept = []
  for tok in nlp(text):
    if tok.is_stop or tok.is_punct or not tok.is_alpha:
      continue
    kept.append(tok.lemma_.lower())
  return ' '.join(kept)

# Stash the untouched text once so this cell can be re-run safely: cleaning
# always starts from the raw reviews instead of re-cleaning cleaned text, and
# the original wording stays available without reloading the CSV.
if 'Review_raw' not in df.columns:
  df['Review_raw'] = df['Review']

# clean_text only reads lemmas and lexical flags (is_stop / is_punct /
# is_alpha), so the dependency parser and entity recogniser are switched off
# for the duration of this pass; they are restored when the block exits.
with nlp.select_pipes(disable=['parser', 'ner']):
  df['Review'] = df['Review_raw'].apply(clean_text)

In [None]:
# Preview the reviews after cleaning (lemmatised; stop words and non-alphabetic tokens removed).
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking get good deal sta...,4
1,ok special charge diamond member hilton decide...,2
2,nice room experience hotel monaco seattle good...,3
3,unique great stay wonderful time hotel monaco ...,5
4,great stay great stay go seahawk game awesome ...,5


In [None]:
def word_count(text):
  """Return the number of whitespace-separated tokens in `text`."""
  tokens = text.split()
  return len(tokens)
# Token count of each review.
df['word_count'] = df['Review'].map(word_count)

In [None]:
def cha_count(text):
  """Return the length of `text` in characters, whitespace included."""
  n_chars = len(text)
  return n_chars
# Character count of each review.
df['cha_count'] = df['Review'].map(cha_count)

In [None]:
# Preview the frame with the new word_count and cha_count columns.
df.head()

Unnamed: 0,Review,Rating,word_count,cha_count
0,nice hotel expensive parking get good deal sta...,4,82,539
1,ok special charge diamond member hilton decide...,2,222,1454
2,nice room experience hotel monaco seattle good...,3,184,1176
3,unique great stay wonderful time hotel monaco ...,5,84,545
4,great stay great stay go seahawk game awesome ...,5,172,1102


In [None]:
def main_word(text, ngram_range=(1, 1)):
  """Count how often each vocabulary term occurs across a collection of documents.

  Args:
    text: iterable of document strings, passed to CountVectorizer.fit_transform.
    ngram_range: (min_n, max_n) n-gram sizes to count. The default (1, 1)
      counts single words, which is what CountVectorizer does when the
      argument is omitted.

  Returns:
    dict mapping each term to its total count (a Python int) over all documents.
  """
  vectorizer = CountVectorizer(ngram_range=ngram_range)
  counts = vectorizer.fit_transform(text)
  # Sum the columns on the sparse matrix itself. Calling .toarray() first would
  # allocate a dense (n_documents x n_terms) array just to add it up.
  totals = np.asarray(counts.sum(axis=0)).ravel()
  terms = vectorizer.get_feature_names_out()
  return dict(zip(terms, totals.tolist()))
# Term -> total count over every review.
word_collection = main_word(df['Review'].tolist())


In [None]:
# Rank the vocabulary by total count and keep the 20 most frequent words.
# (Counter is already imported in the notebook's first cell.)
word_counter = Counter(word_collection)
top_words = dict(word_counter.most_common(20))
print(top_words)

{'hotel': 52871, 'room': 46645, 'stay': 27395, 'good': 21418, 'great': 21288, 'staff': 16371, 'night': 14150, 'day': 13035, 'nice': 12992, 'time': 12202, 'location': 11254, 'service': 10705, 'clean': 10620, 'restaurant': 10218, 'beach': 10161, 'breakfast': 9707, 'place': 9707, 'like': 9317, 'food': 9303, 'walk': 9158}


In [None]:
# Words to leave out of the top-word list.
exclude_word = ['hotel', 'room', 'stay', 'place']

# The filter runs on the already-truncated top-20 dict, so fewer than 20
# entries remain afterwards.
final_word = dict(
    (word, count) for word, count in top_words.items() if word not in exclude_word
)

In [None]:
# Show the top words that remain after dropping the excluded terms.
final_word

{'good': 21418,
 'great': 21288,
 'staff': 16371,
 'night': 14150,
 'day': 13035,
 'nice': 12992,
 'time': 12202,
 'location': 11254,
 'service': 10705,
 'clean': 10620,
 'restaurant': 10218,
 'beach': 10161,
 'breakfast': 9707,
 'like': 9317,
 'food': 9303,
 'walk': 9158}

In [None]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Apply sentiment analysis to each review
df['Sentiment_Score'] = df['Review'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Optionally, classify the sentiment
df['Sentiment_Label'] = df['Sentiment_Score'].apply(
    lambda x: 'Positive' if x > 0.05 else ('Negative' if x < -0.05 else 'Neutral')
)

# Show result
df[['Review', 'Sentiment_Score', 'Sentiment_Label']].head()


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


Unnamed: 0,Review,Sentiment_Score,Sentiment_Label
0,nice hotel expensive parking get good deal sta...,0.9808,Positive
1,ok special charge diamond member hilton decide...,0.995,Positive
2,nice room experience hotel monaco seattle good...,0.994,Positive
3,unique great stay wonderful time hotel monaco ...,0.9949,Positive
4,great stay great stay go seahawk game awesome ...,0.9939,Positive


In [None]:
def _rating_to_label(rating):
  """Map a numeric rating to 'Positive' (>= 4), 'Neutral' (== 3) or 'Negative' (anything else)."""
  if rating >= 4:
    return 'Positive'
  if rating == 3:
    return 'Neutral'
  return 'Negative'

df['Rating_N'] = df['Rating'].apply(_rating_to_label)

In [None]:
def comp(row):
  """Return 0 when the row's 'Sentiment_Label' equals its 'Rating_N', else 1."""
  labels_agree = row['Sentiment_Label'] == row['Rating_N']
  return 0 if labels_agree else 1
# Row-wise mismatch flag: 1 where the two labels differ, 0 where they agree.
df['comp'] = df.apply(comp, axis='columns')

In [None]:
# Total number of rows where the two labels disagree (comp is 1 on a mismatch, 0 otherwise).
sum_comp=df['comp'].sum()

In [None]:
# Display the mismatch total computed from the 'comp' column.
sum_comp

np.int64(4544)

In [None]:
def main_word2(text, ngram_range=(2, 3)):
  """Count how often each multi-word phrase occurs across a collection of documents.

  Args:
    text: iterable of document strings, passed to CountVectorizer.fit_transform.
    ngram_range: (min_n, max_n) n-gram sizes to count. The default (2, 3)
      counts two- and three-word phrases.

  Returns:
    dict mapping each phrase to its total count (a Python int) over all documents.
  """
  cv = CountVectorizer(ngram_range=ngram_range)
  counts = cv.fit_transform(text)
  # Sum the columns on the sparse matrix itself. A dense copy via .toarray()
  # would need n_documents x n_phrases cells, and the phrase vocabulary is far
  # larger than the single-word one.
  totals = np.asarray(counts.sum(axis=0)).ravel()
  phrases = cv.get_feature_names_out()
  return dict(zip(phrases, totals.tolist()))
# Phrase -> total count over every review.
word_collection2 = main_word2(df['Review'].tolist())

# Keep the 20 most frequent phrases, highest count first.
# (Counter is already imported in the notebook's first cell.)
phrase_counter = Counter(word_collection2)
top_words2 = dict(phrase_counter.most_common(20))


In [None]:
# Display the 20 most frequent two- and three-word phrases.
top_words2

{'stay hotel': 2716,
 'staff friendly': 2208,
 'great location': 2160,
 'room clean': 2080,
 'punta cana': 1696,
 'minute walk': 1673,
 'hotel stay': 1618,
 'stay night': 1527,
 'hotel great': 1516,
 'great hotel': 1491,
 'recommend hotel': 1453,
 'hotel room': 1374,
 'friendly helpful': 1352,
 'highly recommend': 1315,
 'room service': 1305,
 'hotel staff': 1252,
 'place stay': 1228,
 'room small': 1218,
 'staff helpful': 1091,
 'good hotel': 1061}