# Sentiment Analysis for Hotel Reviews

## Importing Libraries

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

## Loading the Data

In [43]:
# Load the dataset
data = pd.read_csv("hotel_reviews.csv")

# Total number of rows
total_rows = data.shape[0]
print(f"Total number of rows: {total_rows}")

data.head()

Total number of rows: 7001


Unnamed: 0,Index,Name,Area,Review_Date,Rating_attribute,Rating(Out of 10),Review_Text
0,0,Hotel The Pearl,"Paharganj, New Delhi",Jul-23,Best budget friendly hotel,9.0,Hotel the pearl is perfect place to stay in De...
1,1,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Amazing place,9.0,Location of the hotel is perfect. The hotel is...
2,2,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Overall good stay. Economic.,9.0,"Location, Indian food."
3,3,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Lovely,9.0,The location and the hotel itself is great. Ne...
4,4,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Great hotel Great staff and great staying,9.0,Friendly and smiling staffs.. The reception st...


In [44]:
# Find out the initial number of dupilcate rows
num_duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 0


## Extracting Key Information from Data

In [45]:
# Get rid of unnecessary columns
columns_needed = ['Review_Text', 'Rating(Out of 10)']
data = data[columns_needed]

data.tail()

Unnamed: 0,Review_Text,Rating(Out of 10)
6996,"The room was good, comfortable and aesthetic \...",10.0
6997,good hotel,9.0
6998,good experience for me about hotel \r\nvery go...,10.0
6999,well done,10.0
7000,Nothing,2.0


In [46]:
# Total number of rows with missing values
num_missing_rows = data.isnull().any(axis=1).sum()

print(f"Number of rows with missing values: {num_missing_rows}")

Number of rows with missing values: 7


In [47]:
# Remove rows with any missing values
data = data.dropna()

# Total number of rows
total_rows = data.shape[0]
print(f"Total number of rows: {total_rows}")

Total number of rows: 6994


In [48]:
# Check if the review text is in English
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

# Apply the function to the 'Review_Text' column
data['is_english'] = data['Review_Text'].apply(is_english)

# Filter out rows that are not in English
data = data[data['is_english']].drop(columns=['is_english'])

# Total number of rows
total_rows = data.shape[0]
print(f"Total number of rows: {total_rows}")

Total number of rows: 6381


In [49]:
# Remove rows where the review text is ' no comments available for this review'
data = data[data['Review_Text'] != ' no comments available for this review']

# Remove rows where the review text is ' Nothing'
data = data[data['Review_Text'] != 'Nothing']

# Total number of rows
total_rows = data.shape[0]
print(f"Total number of rows: {total_rows}")

Total number of rows: 5650


In [50]:
# Remove \r\n from Review text
data['Review_Text'] = data['Review_Text'].str.replace(r'\r\n', ' ', regex=True)

data.tail()

Unnamed: 0,Review_Text,Rating(Out of 10)
6992,nice hotel staff behavior is good hotel locati...,10.0
6993,The overall service was really nice. The hotel...,8.0
6994,peaceful place though it’s in corner of ind area,8.0
6996,"The room was good, comfortable and aesthetic ...",10.0
6998,good experience for me about hotel very good ...,10.0


## Giving the Review Score a Sentiment

In [51]:
# Add sentiment labels based on review score
def assign_sentiment(score):
    if score > 6:
        return 'positive'
    if score >= 5:
        return 'neutral'
    return 'negative'

# Apply the function to create a new 'Sentiment' column
data['Sentiment'] = data['Rating(Out of 10)'].apply(assign_sentiment)

# Filter and display positive reviews
print("Positive Reviews:")
display(data[data['Sentiment'] == 'positive'][['Review_Text', 'Rating(Out of 10)', 'Sentiment']].head())

# Filter and display neutral reviews
print("Neutral Reviews:")
display(data[data['Sentiment'] == 'neutral'][['Review_Text', 'Rating(Out of 10)', 'Sentiment']].head())

# Filter and display negative reviews
print("Negative Reviews:")
display(data[data['Sentiment'] == 'negative'][['Review_Text', 'Rating(Out of 10)', 'Sentiment']].head())

Positive Reviews:


Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
0,Hotel the pearl is perfect place to stay in De...,9.0,positive
1,Location of the hotel is perfect. The hotel is...,9.0,positive
2,"Location, Indian food.",9.0,positive
3,The location and the hotel itself is great. Ne...,9.0,positive
4,Friendly and smiling staffs.. The reception st...,9.0,positive


Neutral Reviews:


Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
15,staff service was good,6.0,neutral
111,"experience was good,",6.0,neutral
119,The staff was really kind and let us to put ou...,6.0,neutral
199,"Wi-fi was good , staff was friendly",5.0,neutral
200,Old type furniture,6.0,neutral


Negative Reviews:


Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
96,Location was center of city,4.0,negative
120,Leaving the hotel,1.0,negative
145,· I regret taking the stay over there and ple...,1.0,negative
146,· It was a disaster. How can such a poor hote...,1.0,negative
171,Dirty bathroom and rooms .They didn’t even kep...,2.0,negative


## Splitting the Data (80% train, 20% test)

In [52]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Reset index to avoid potential indexing issues
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Total number of rows in train data
total_rows_train_data = train_data.shape[0]
print(f"Total number of rows in train data: {total_rows_train_data}")

# Total number of rows in test data
total_rows_test_data = test_data.shape[0]
print(f"Total number of rows in test data: {total_rows_test_data}")

train_data.tail()

Total number of rows in train data: 4520
Total number of rows in test data: 1130


Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
4515,"Location was very nice, safety, atmosphere was...",7.0,positive
4516,Location And concept,8.0,positive
4517,Cool,5.0,neutral
4518,It was a really good experience. The room was ...,10.0,positive
4519,Best budget friendly hotel in karol bagh.. The...,9.0,positive


## Text Preprocessing

In [53]:
# Convert train data text to lowercase
train_data['Review_Text'] = train_data['Review_Text'].str.lower()

train_data.tail()

Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
4515,"location was very nice, safety, atmosphere was...",7.0,positive
4516,location and concept,8.0,positive
4517,cool,5.0,neutral
4518,it was a really good experience. the room was ...,10.0,positive
4519,best budget friendly hotel in karol bagh.. the...,9.0,positive


In [54]:
# Remove punctuation
train_data['Review_Text'] = train_data['Review_Text'].str.translate(str.maketrans('', '', string.punctuation))

train_data.tail()

Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
4515,location was very nice safety atmosphere was g...,7.0,positive
4516,location and concept,8.0,positive
4517,cool,5.0,neutral
4518,it was a really good experience the room was c...,10.0,positive
4519,best budget friendly hotel in karol bagh the c...,9.0,positive


## Vectorize

In [55]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer to the train data and transform it
train_x_vectorized = vectorizer.fit_transform(train_data['Review_Text'])

# Fit the vectorizer to the train data and transform it
test_x_vectorized = vectorizer.transform(test_data['Review_Text'])

# View tokenized words
tokenized_words = vectorizer.get_feature_names_out()
print(f"Tokenized words: {tokenized_words[:150]}")  # Display the first 150 tokenized words

Tokenized words: ['05' '0500hrs' '05032023' '0900' '10' '100' '1000' '1000pm' '100150'
 '1010' '1015' '1028' '10am' '10pm' '1100' '110029' '1162' '11pm' '12'
 '120' '1200' '1230am' '1230pm' '1240' '12pm' '12th' '13' '130' '1300'
 '130am' '14' '144' '14aug' '15' '150' '1500' '15aug' '15mins' '16' '1600'
 '170' '1819' '1friendly' '1l' '1lit' '1min' '1person' '1st' '1the'
 '1water' '20' '200' '2000' '2018' '2021' '2022' '21' '2200' '230am' '24'
 '2400' '247' '24h' '24hour' '24hours' '24hrs' '24x7' '25' '250' '26'
 '27hour' '2am' '2km' '2nd' '2yo' '30' '300' '3000' '300m' '30second'
 '315' '32' '33' '330' '34' '345am' '38which' '39' '3rd' '3time' '4050'
 '40degc' '45' '4ft' '4hrs' '4th' '5000' '50130' '55' '598' '5hours'
 '5seconds' '5th' '60' '6000' '610' '615' '640' '650' '67' '6am' '700'
 '700m' '7623' '769' '7am' '800' '86' '8804' '8n' '900' '9000' '91' '910'
 '9415' '9906' 'aakash' 'aall' 'aashram' 'aayush' 'abandoned' 'abd'
 'abdar' 'abdul' 'abhendra' 'abhishek' 'able' 'about' 'above

## Spliting Train and Test Data to X and Y values

In [56]:
train_x = train_x_vectorized
train_y = train_data['Sentiment']

test_x = test_x_vectorized
test_y = test_data['Sentiment']

## Training and Evaluating Logistic Regression Model

In [57]:
# Initialize and train the model
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(train_x, train_y)

# Predict on test data
predictions = logreg_model.predict(test_x_vectorized)

# Evaluate model performance
print("Classification Report for Logistic Regression:")
print(classification_report(test_y, predictions))

print("Confusion Matrix:")
print(confusion_matrix(test_y, predictions))

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

    negative       0.84      0.68      0.75       173
     neutral       0.61      0.34      0.44       127
    positive       0.87      0.96      0.91       830

    accuracy                           0.85      1130
   macro avg       0.77      0.66      0.70      1130
weighted avg       0.83      0.85      0.83      1130

Confusion Matrix:
[[117   8  48]
 [ 11  43  73]
 [ 12  19 799]]
