# Sentiment Analysis for Hotel Reviews

## Importing Libraries

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pickle

## Loading the Data

In [None]:
# Load the dataset
data = pd.read_csv("hotel_reviews.csv")

# Total number of rows
total_rows = data.shape[0]
print(f"Total number of rows: {total_rows}")

data.head()

Total number of rows: 7001


Unnamed: 0,Index,Name,Area,Review_Date,Rating_attribute,Rating(Out of 10),Review_Text
0,0,Hotel The Pearl,"Paharganj, New Delhi",Jul-23,Best budget friendly hotel,9.0,Hotel the pearl is perfect place to stay in De...
1,1,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Amazing place,9.0,Location of the hotel is perfect. The hotel is...
2,2,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Overall good stay. Economic.,9.0,"Location, Indian food."
3,3,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Lovely,9.0,The location and the hotel itself is great. Ne...
4,4,Hotel The Pearl,"Paharganj, New Delhi",Aug-23,Great hotel Great staff and great staying,9.0,Friendly and smiling staffs.. The reception st...


In [4]:
# Find out the initial number of dupilcate rows
num_duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 0


## Extracting Key Information from Data

In [5]:
# Get rid of unnecessary columns
columns_needed = ['Review_Text', 'Rating(Out of 10)']
data = data[columns_needed]

data.tail()

Unnamed: 0,Review_Text,Rating(Out of 10)
6996,"The room was good, comfortable and aesthetic \...",10.0
6997,good hotel,9.0
6998,good experience for me about hotel \r\nvery go...,10.0
6999,well done,10.0
7000,Nothing,2.0


In [6]:
# Total number of rows with missing values
num_missing_rows = data.isnull().any(axis=1).sum()

print(f"Number of rows with missing values: {num_missing_rows}")

Number of rows with missing values: 7


In [7]:
# Remove rows with any missing values
data = data.dropna()

# Total number of rows
total_rows = data.shape[0]
print(f"Total number of rows: {total_rows}")

Total number of rows: 6994


In [8]:
# Check if the review text is in English
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

# Apply the function to the 'Review_Text' column
data['is_english'] = data['Review_Text'].apply(is_english)

# Filter out rows that are not in English
data = data[data['is_english']].drop(columns=['is_english'])

# Total number of rows
total_rows = data.shape[0]
print(f"Total number of rows: {total_rows}")

Total number of rows: 6386


In [9]:
# Remove rows where the review text is ' no comments available for this review'
data = data[data['Review_Text'] != ' no comments available for this review']

# Remove rows where the review text is ' Nothing'
data = data[data['Review_Text'] != 'Nothing']

# Total number of rows
total_rows = data.shape[0]
print(f"Total number of rows: {total_rows}")

Total number of rows: 5655


In [10]:
# Remove \r\n from Review text
data['Review_Text'] = data['Review_Text'].str.replace(r'\r\n', ' ', regex=True)

data.tail()

Unnamed: 0,Review_Text,Rating(Out of 10)
6992,nice hotel staff behavior is good hotel locati...,10.0
6993,The overall service was really nice. The hotel...,8.0
6994,peaceful place though it’s in corner of ind area,8.0
6996,"The room was good, comfortable and aesthetic ...",10.0
6998,good experience for me about hotel very good ...,10.0


## Giving the Review Score a Sentiment

In [11]:
# Add sentiment labels based on review score
def assign_sentiment(score):
    if score > 6:
        return 'positive'
    if score >= 5:
        return 'neutral'
    return 'negative'

# Apply the function to create a new 'Sentiment' column
data['Sentiment'] = data['Rating(Out of 10)'].apply(assign_sentiment)

# Filter and display positive reviews
print("Positive Reviews:")
display(data[data['Sentiment'] == 'positive'][['Review_Text', 'Rating(Out of 10)', 'Sentiment']].head())

# Filter and display neutral reviews
print("Neutral Reviews:")
display(data[data['Sentiment'] == 'neutral'][['Review_Text', 'Rating(Out of 10)', 'Sentiment']].head())

# Filter and display negative reviews
print("Negative Reviews:")
display(data[data['Sentiment'] == 'negative'][['Review_Text', 'Rating(Out of 10)', 'Sentiment']].head())

Positive Reviews:


Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
0,Hotel the pearl is perfect place to stay in De...,9.0,positive
1,Location of the hotel is perfect. The hotel is...,9.0,positive
2,"Location, Indian food.",9.0,positive
3,The location and the hotel itself is great. Ne...,9.0,positive
4,Friendly and smiling staffs.. The reception st...,9.0,positive


Neutral Reviews:


Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
15,staff service was good,6.0,neutral
111,"experience was good,",6.0,neutral
119,The staff was really kind and let us to put ou...,6.0,neutral
199,"Wi-fi was good , staff was friendly",5.0,neutral
200,Old type furniture,6.0,neutral


Negative Reviews:


Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
95,Nothing.,1.0,negative
96,Location was center of city,4.0,negative
120,Leaving the hotel,1.0,negative
145,· I regret taking the stay over there and ple...,1.0,negative
146,· It was a disaster. How can such a poor hote...,1.0,negative


## Splitting the Data (80% train, 20% test)

In [12]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Reset index to avoid potential indexing issues
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Total number of rows in train data
total_rows_train_data = train_data.shape[0]
print(f"Total number of rows in train data: {total_rows_train_data}")

# Total number of rows in test data
total_rows_test_data = test_data.shape[0]
print(f"Total number of rows in test data: {total_rows_test_data}")

train_data.tail()

Total number of rows in train data: 4524
Total number of rows in test data: 1131


Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
4519,· when requested they couldn’t offer me tooth...,6.0,neutral
4520,location,8.0,positive
4521,easy location,3.0,negative
4522,"Breakfast was awesome, the food was delicious....",10.0,positive
4523,Very much comfortable hotel.. Wonderful experi...,9.0,positive


## Text Preprocessing

In [13]:
# Convert train data text to lowercase
train_data['Review_Text'] = train_data['Review_Text'].str.lower()

train_data.tail()

Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
4519,· when requested they couldn’t offer me tooth...,6.0,neutral
4520,location,8.0,positive
4521,easy location,3.0,negative
4522,"breakfast was awesome, the food was delicious....",10.0,positive
4523,very much comfortable hotel.. wonderful experi...,9.0,positive


In [14]:
# Remove punctuation
train_data['Review_Text'] = train_data['Review_Text'].str.translate(str.maketrans('', '', string.punctuation))

train_data.tail()

Unnamed: 0,Review_Text,Rating(Out of 10),Sentiment
4519,· when requested they couldn’t offer me tooth...,6.0,neutral
4520,location,8.0,positive
4521,easy location,3.0,negative
4522,breakfast was awesome the food was delicious f...,10.0,positive
4523,very much comfortable hotel wonderful experien...,9.0,positive


## Vectorize

In [15]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer to the train data and transform it
train_x_vectorized = vectorizer.fit_transform(train_data['Review_Text'])

# Fit the vectorizer to the train data and transform it
test_x_vectorized = vectorizer.transform(test_data['Review_Text'])

# View tokenized words
tokenized_words = vectorizer.get_feature_names_out()
print(f"Tokenized words: {tokenized_words[:150]}")  # Display the first 150 tokenized words

Tokenized words: ['05' '0500hrs' '05032023' '09' '0900' '10' '100' '1000' '100150' '1010'
 '1015' '1028' '1075' '10am' '10pm' '11' '1100' '110029' '1162' '11am'
 '12' '120' '1200' '1230am' '1230pm' '1240' '12pm' '12th' '13' '1300' '14'
 '144' '14aug' '15' '150' '1500' '15aug' '15mins' '16' '1600' '170' '1819'
 '1am' '1friendly' '1l' '1lit' '1min' '1person' '1st' '1water' '20' '200'
 '2000' '2018' '2021' '2022' '2023' '21' '2200' '23' '230am' '24' '2400'
 '247' '24h' '24hours' '24hrs' '24th25th' '24x7' '25' '250' '26' '2612'
 '2712' '27hour' '2am' '2km' '2nd' '30' '300' '3000' '300m' '30second'
 '315' '32' '33' '330' '34' '345am' '38which' '39' '3rd' '3time' '4050'
 '40degc' '449' '45' '45kms' '4ft' '4hrs' '4th' '500' '5000' '50130' '55'
 '598' '5hours' '5seconds' '60' '6000' '610' '615' '640' '649' '650' '67'
 '6am' '6hours' '6pm' '700' '700m' '750' '7623' '769' '7am' '800' '810'
 '830am' '8n' '900' '9000' '910' '9415' 'aakash' 'aall' 'aaminties'
 'aashram' 'aayush' 'abandoned' 'abd' '

## Spliting Train and Test Data to X and Y values

In [16]:
train_x = train_x_vectorized
train_y = train_data['Sentiment']

test_x = test_x_vectorized
test_y = test_data['Sentiment']

## Training and Evaluating Logistic Regression Model

In [17]:
# Initialize and train the model
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(train_x, train_y)

# Predict on test data
predictions = logreg_model.predict(test_x_vectorized)

# Evaluate model performance
print("Classification Report for Logistic Regression:")
print(classification_report(test_y, predictions))

print("Confusion Matrix:")
print(confusion_matrix(test_y, predictions))

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

    negative       0.86      0.74      0.80       142
     neutral       0.61      0.34      0.44       111
    positive       0.91      0.98      0.94       878

    accuracy                           0.89      1131
   macro avg       0.79      0.69      0.73      1131
weighted avg       0.87      0.89      0.87      1131

Confusion Matrix:
[[105  10  27]
 [ 11  38  62]
 [  6  14 858]]


## Training and Evaluating Random Forest

In [18]:
# Initialize and train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_x, train_y)

# Predict on test data
predictions = rf_model.predict(test_x_vectorized)

# Evaluate model performance
print("Classification Report for Random Forest:")
print(classification_report(test_y, predictions))

print("Confusion Matrix:")
print(confusion_matrix(test_y, predictions))

Classification Report for Random Forest:
              precision    recall  f1-score   support

    negative       0.94      0.72      0.82       142
     neutral       0.78      0.46      0.58       111
    positive       0.90      0.99      0.94       878

    accuracy                           0.90      1131
   macro avg       0.88      0.72      0.78      1131
weighted avg       0.90      0.90      0.89      1131

Confusion Matrix:
[[102   4  36]
 [  3  51  57]
 [  3  10 865]]


## Training and Evaluating SVM

In [19]:
# Initialize and train the model
svm_model = SVC(kernel='linear')
svm_model.fit(train_x, train_y)

# Predict on test data
predictions = svm_model.predict(test_x)

# Evaluate model performance
print("Classification Report for SVM:")
print(classification_report(test_y, predictions))

print("Confusion Matrix:")
print(confusion_matrix(test_y, predictions))

Classification Report for SVM:
              precision    recall  f1-score   support

    negative       0.81      0.77      0.79       142
     neutral       0.59      0.36      0.45       111
    positive       0.91      0.96      0.94       878

    accuracy                           0.88      1131
   macro avg       0.77      0.70      0.72      1131
weighted avg       0.87      0.88      0.87      1131

Confusion Matrix:
[[109   8  25]
 [ 13  40  58]
 [ 12  20 846]]


## Hyperparameter Tuning Random Forest Model (Best Model)

In [None]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Initialize the base model
rf_base = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_base,
                           param_grid=param_grid,
                           cv=3,
                           scoring='accuracy',
                           n_jobs=-1,
                           verbose=2)

# Fit GridSearchCV on training data
grid_search.fit(train_x, train_y)

# Best model after tuning
best_rf_model = grid_search.best_estimator_

# Predict on test data
predictions = best_rf_model.predict(test_x)

# Evaluate model performance
print("Best Parameters:", grid_search.best_params_)
print("\nClassification Report for Tuned Random Forest:")
print(classification_report(test_y, predictions))

print("Confusion Matrix:")
print(confusion_matrix(test_y, predictions))


Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   2.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.7s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total

## Saving the Model

In [None]:
# Save the best (tuned) Random Forest model
with open('tuned_random_forest_model.pkl', 'wb') as f:
    pickle.dump(best_rf_model, f)

# Save Vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)