# Sentiment Analysis for Hotel Reviews

## Importing Libraries

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string

## Loading the Data

In [15]:
# Read the raw hotel-review CSV into a DataFrame.
# The path is relative, so the file is looked up in the kernel's working directory.

data = pd.read_csv(filepath_or_buffer="hotel_reviews.csv")

# Preview the first five rows (the default for head)
data.head(n=5)

Unnamed: 0,Index,Name,Area,Review_Date,Rating_attribute,Rating(Out of 10),Review_Text
0,0,Hotel The Pearl,"Paharganj, New Delhi",Jul-23,Best budget friendly hotel,9.0,Hotel the pearl is perfect place to stay in De...
1,1,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Amazing place,9.0,Location of the hotel is perfect. The hotel is...
2,2,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Overall good stay. Economic.,9.0,"Location, Indian food."
3,3,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Lovely,9.0,The location and the hotel itself is great. Ne...
4,4,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Great hotel Great staff and great staying,9.0,Friendly and smiling staffs.. The reception st...


In [16]:
# Count the rows that exactly repeat an earlier row (all columns are compared).
# NOTE(review): the frame still carries the `Index` column at this point, which looks
# like a per-row identifier in the preview; if so, no two rows can ever match — confirm.

duplicate_mask = data.duplicated()
num_duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 0


## Extracting Key Information from Data

In [17]:
# Keep only the review text and its numeric rating; every other column is dropped.

columns_needed = ['Review_Text', 'Rating(Out of 10)']
data = data.loc[:, columns_needed]

# Preview the last five rows of the trimmed frame
data.tail(n=5)

Unnamed: 0,Review_Text,Rating(Out of 10)
6996,"The room was good, comfortable and aesthetic \...",10.0
6997,good hotel,9.0
6998,good experience for me about hotel \r\nvery go...,10.0
6999,well done,10.0
7000,Nothing,2.0


In [18]:
# Drop every row in which at least one column value is missing.
# axis=0 / how="any" are the dropna defaults, spelled out here for the reader.

data = data.dropna(axis=0, how="any")

data.tail(n=5)

Unnamed: 0,Review_Text,Rating(Out of 10)
6996,"The room was good, comfortable and aesthetic \...",10.0
6997,good hotel,9.0
6998,good experience for me about hotel \r\nvery go...,10.0
6999,well done,10.0
7000,Nothing,2.0


In [19]:
# Replace line breaks inside the review text with a single space.
# Besides Windows-style \r\n pairs, a lone \r or \n is replaced as well, so no stray
# newline character survives in the text. \r\n is listed first in the alternation so
# that a pair yields one space rather than two.
# .assign returns a new frame instead of writing into the existing one in place.

data = data.assign(
    Review_Text=data['Review_Text'].str.replace(r'\r\n|\r|\n', ' ', regex=True)
)

data.tail()

Unnamed: 0,Review_Text,Rating(Out of 10)
6996,"The room was good, comfortable and aesthetic ...",10.0
6997,good hotel,9.0
6998,good experience for me about hotel very good ...,10.0
6999,well done,10.0
7000,Nothing,2.0


## Splitting the Data (80% train, 20% test)

In [20]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# Each part gets a fresh 0..n-1 index (the old index is discarded) to avoid
# potential indexing issues with the shuffled labels.

train_data, test_data = (
    subset.reset_index(drop=True)
    for subset in train_test_split(data, test_size=0.2, random_state=42)
)

train_data.tail(n=5)

Unnamed: 0,Review_Text,Rating(Out of 10)
5590,All good,9.0
5591,no comments available for this review,1.0
5592,Quite and safe location. Care taker was very h...,9.0
5593,no comments available for this review,5.0
5594,The rooftop is really nice. My room was good too,9.0


## Text Preprocessing

In [21]:
# Lower-case the review text of the training set.
# NOTE(review): only train_data is changed here; test_data presumably needs the same
# normalisation before evaluation — confirm it is handled further down.

lowercased_reviews = train_data['Review_Text'].str.lower()
train_data['Review_Text'] = lowercased_reviews

train_data.tail(n=5)

Unnamed: 0,Review_Text,Rating(Out of 10)
5590,all good,9.0
5591,no comments available for this review,1.0
5592,quite and safe location. care taker was very h...,9.0
5593,no comments available for this review,5.0
5594,the rooftop is really nice. my room was good too,9.0


In [22]:
# Strip every character listed in string.punctuation from the training reviews.
# The translation table maps each of those characters to None, i.e. deletes it
# without inserting a space in its place.

punctuation_table = str.maketrans('', '', string.punctuation)
train_data['Review_Text'] = train_data['Review_Text'].str.translate(punctuation_table)

train_data.tail(n=5)

Unnamed: 0,Review_Text,Rating(Out of 10)
5590,all good,9.0
5591,no comments available for this review,1.0
5592,quite and safe location care taker was very he...,9.0
5593,no comments available for this review,5.0
5594,the rooftop is really nice my room was good too,9.0
