# Sentiment Analysis for Hotel Reviews

## Importing Libraries

In [192]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string

## Loading the Data

In [193]:
# Read the hotel reviews from the CSV file in the working directory
data = pd.read_csv("hotel_reviews.csv")

# Report how many rows were loaded
total_rows = len(data)
print(f"Total number of rows: {total_rows}")

# Preview the first five rows
data.head(5)

Total number of rows: 7001


Unnamed: 0,Index,Name,Area,Review_Date,Rating_attribute,Rating(Out of 10),Review_Text
0,0,Hotel The Pearl,"Paharganj, New Delhi",Jul-23,Best budget friendly hotel,9.0,Hotel the pearl is perfect place to stay in De...
1,1,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Amazing place,9.0,Location of the hotel is perfect. The hotel is...
2,2,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Overall good stay. Economic.,9.0,"Location, Indian food."
3,3,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Lovely,9.0,The location and the hotel itself is great. Ne...
4,4,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Great hotel Great staff and great staying,9.0,Friendly and smiling staffs.. The reception st...


In [194]:
# Count the initial number of duplicate rows.
# The `Index` column appears to be a per-row identifier (0, 1, 2, ... in the
# preview of the loaded data) -- TODO confirm it is unique for every row.
# If it is, including it makes every row distinct and the check always
# reports 0, so duplicates are detected on the remaining columns only.
# errors="ignore" keeps the cell runnable when the column is not present.
num_duplicates = data.drop(columns=["Index"], errors="ignore").duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 0


## Extracting Key Information from Data

In [195]:
# Keep only the review text and its rating; all other columns are dropped
columns_needed = ['Review_Text', 'Rating(Out of 10)']
data = data.loc[:, columns_needed]

# Show the last five rows of the reduced frame
data.tail(5)

Unnamed: 0,Review_Text,Rating(Out of 10)
6996,"The room was good, comfortable and aesthetic \...",10.0
6997,good hotel,9.0
6998,good experience for me about hotel \r\nvery go...,10.0
6999,well done,10.0
7000,Nothing,2.0


In [196]:
# Flag every row that has a missing value in at least one column, then count them
rows_with_gaps = data.isna().any(axis="columns")
num_missing_rows = rows_with_gaps.sum()

print(f"Number of rows with missing values: {num_missing_rows}")

Number of rows with missing values: 7


In [197]:
# Drop every row that has a missing value in any column
data = data.dropna(axis=0, how="any")

# Row count after the drop
total_rows = len(data)
print(f"Total number of rows: {total_rows}")

Total number of rows: 6994


In [198]:
# Review texts to discard. They are matched exactly as written: the first
# literal starts with a space, and 'Nothing' has no surrounding whitespace.
placeholder_reviews = [' no comments available for this review', 'Nothing']

# Keep only the rows whose review text is not one of the placeholders
is_placeholder = data['Review_Text'].isin(placeholder_reviews)
data = data[~is_placeholder]

# Row count after filtering
total_rows = len(data)
print(f"Total number of rows: {total_rows}")

Total number of rows: 6263


In [199]:
# Remove \r\n from Review text
data['Review_Text'] = data['Review_Text'].str.replace(r'\r\n', ' ', regex=True)

data.tail()

Unnamed: 0,Review_Text,Rating(Out of 10)
6994,peaceful place though it’s in corner of ind area,8.0
6996,"The room was good, comfortable and aesthetic ...",10.0
6997,good hotel,9.0
6998,good experience for me about hotel very good ...,10.0
6999,well done,10.0


## Splitting the Data (80% train, 20% test)

In [200]:
# Hold out 20% of the rows for testing; random_state=42 fixes the split
split_parts = train_test_split(data, test_size=0.2, random_state=42)

# Discard the original row labels and renumber both parts from 0
train_data, test_data = (part.reset_index(drop=True) for part in split_parts)

# Size of the training part
total_rows_train_data = len(train_data)
print(f"Total number of rows in train data: {total_rows_train_data}")

# Size of the test part
total_rows_test_data = len(test_data)
print(f"Total number of rows in test data: {total_rows_test_data}")

train_data.tail(5)

Total number of rows in train data: 5010
Total number of rows in test data: 1253


Unnamed: 0,Review_Text,Rating(Out of 10)
5005,The property was on the main road itself. It h...,6.0
5006,Spacious rooms with plenty of sunshine,9.0
5007,Staff impecable Room very clean and bigger tha...,8.0
5008,A very friendly welcome. A spacious room and w...,10.0
5009,Satisfied service. I like it.,8.0


## Text Preprocessing

In [202]:
# Convert train data text to lowercase
# Lowercase the review text of the training rows
lowercased_reviews = train_data['Review_Text'].str.lower()
train_data['Review_Text'] = lowercased_reviews

train_data.tail(5)

Unnamed: 0,Review_Text,Rating(Out of 10)
5005,the property was on the main road itself. it h...,6.0
5006,spacious rooms with plenty of sunshine,9.0
5007,staff impecable room very clean and bigger tha...,8.0
5008,a very friendly welcome. a spacious room and w...,10.0
5009,satisfied service. i like it.,8.0


In [203]:
# Remove punctuation
# Delete punctuation from the review text of the training rows.
# The table maps the code point of every character in string.punctuation to
# None, which translate() treats as "delete this character".
punctuation_table = dict.fromkeys(map(ord, string.punctuation))
train_data['Review_Text'] = train_data['Review_Text'].str.translate(punctuation_table)

train_data.tail(5)

Unnamed: 0,Review_Text,Rating(Out of 10)
5005,the property was on the main road itself it ha...,6.0
5006,spacious rooms with plenty of sunshine,9.0
5007,staff impecable room very clean and bigger tha...,8.0
5008,a very friendly welcome a spacious room and wo...,10.0
5009,satisfied service i like it,8.0
