In [3]:
import warnings
warnings.filterwarnings('ignore')
# warnings.simplefilter('ignore')

# pip install seaborn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
hotels = pd.read_csv('hotel_reviews.csv')
hotels.head()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,Longitude,Latitude
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,4.915968,52.360576
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,4.915968,52.360576
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,4.915968,52.360576
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,4.915968,52.360576
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,4.915968,52.360576


In [5]:
hotels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515738 entries, 0 to 515737
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Hotel_Address                               515738 non-null  object 
 1   Additional_Number_of_Scoring                515738 non-null  int64  
 2   Review_Date                                 515738 non-null  object 
 3   Average_Score                               515738 non-null  float64
 4   Hotel_Name                                  515738 non-null  object 
 5   Reviewer_Nationality                        515738 non-null  object 
 6   Negative_Review                             515738 non-null  object 
 7   Review_Total_Negative_Word_Counts           515738 non-null  int64  
 8   Total_Number_of_Reviews                     515738 non-null  int64  
 9   Positive_Review                             515738 non-null  object 
 

In [6]:
# Duplicates handling

# calculating the number of rows before removing duplicates
rows_before = len(hotels)

# checking for and remove full duplicate rows to avoid skewing the analysis
hotels.drop_duplicates(inplace=True)

# calculating the number of rows after removing duplicates
rows_after = len(hotels)

# calculating the number of duplicates removed
duplicates_removed = rows_before - rows_after

# printing the result
if duplicates_removed > 0:
    print(f"{duplicates_removed} duplicates removed.")
else:
    print("No duplicates detected.")


526 duplicates removed.


In [7]:
# printing the unique hotel names
print(hotels['Hotel_Name'].unique())

# counting and printing the total number of unique hotels
total_unique_hotels = hotels['Hotel_Name'].nunique()
print(f"Total number of unique hotels: {total_unique_hotels}")

['Hotel Arena' 'K K Hotel George' 'Apex Temple Court Hotel' ...
 'The Berkeley' 'Holiday Inn London Kensington' 'Atlantis Hotel Vienna']
Total number of unique hotels: 1492


In [8]:
# extracting the country from the hotel address 
# if the last word is "Kingdom", take the two last words instead
hotels['Country'] = hotels['Hotel_Address'].apply(lambda x: " ".join(x.split()[-2:]) if x.split()[-1] == "Kingdom" else x.split()[-1])

# printing the unique countries
unique_countries = hotels['Country'].unique()
print(unique_countries)

# counting the total number of unique countries
total_unique_countries = hotels['Country'].nunique()
print(f"Total number of unique countries: {total_unique_countries}")


['Netherlands' 'United Kingdom' 'France' 'Spain' 'Italy' 'Austria']
Total number of unique countries: 6


In [9]:
# counting how many unique hotels are in each country
unique_hotels_per_country = hotels.groupby('Country')['Hotel_Name'].nunique()

# printing the counts
print(unique_hotels_per_country)


Country
Austria           158
France            458
Italy             162
Netherlands       105
Spain             211
United Kingdom    400
Name: Hotel_Name, dtype: int64


In [10]:
# checking if any latitude values are out of the realistic range of -90 - 90
invalid_latitude = hotels[(hotels['Latitude'] < -90) | (hotels['Latitude'] > 90)]
print(f"Rows with invalid latitude values: {len(invalid_latitude)}")

# checking if any longitude values are out of the realistic range of -180 - 180
invalid_longitude = hotels[(hotels['Longitude'] < -180) | (hotels['Longitude'] > 180)]
print(f"Rows with invalid longitude values: {len(invalid_longitude)}")

Rows with invalid latitude values: 0
Rows with invalid longitude values: 0


In [11]:
# analyzing the distribution of review scores
hotels['Reviewer_Score'].describe()

count    515212.000000
mean          8.395532
std           1.637467
min           2.500000
25%           7.500000
50%           8.800000
75%           9.600000
max          10.000000
Name: Reviewer_Score, dtype: float64

In [12]:
# сonverting Review_Date from string to datetime format
hotels['Review_Date'] = pd.to_datetime(hotels['Review_Date'], errors='coerce')

# checking the first few rows to verify the conversion
hotels['Review_Date'].head()

0   2017-08-03
1   2017-08-03
2   2017-07-31
3   2017-07-31
4   2017-07-24
Name: Review_Date, dtype: datetime64[ns]

In [13]:
# checking for impossible dates
future_dates = hotels[hotels['Review_Date'] > pd.Timestamp.now()]
print(f"Number of future dates: {len(future_dates)}")

very_old_dates = hotels[hotels['Review_Date'] < pd.Timestamp('1990-01-01')]
print(f"Number of very old dates: {len(very_old_dates)}")

Number of future dates: 0
Number of very old dates: 0


In [14]:
# selecting only numerical columns from the original DataFrame
numerical_hotels = hotels.select_dtypes(include=['int64', 'float64'])

# adding the 'Hotel_Name' column to perform groupby
numerical_hotels['Hotel_Name'] = hotels['Hotel_Name']

# creating an additional DataFrame with mean values for each hotel
hotels_unique = numerical_hotels.groupby('Hotel_Name').mean()

# rounding all mean numeric values to one decimal place
hotels_unique = hotels_unique.round(1)

# resetting index to make Hotel_Name a column again
hotels_unique.reset_index(inplace=True)

# checking the first few rows of the new DataFrame to verify
display(hotels_unique.head())


Unnamed: 0,Hotel_Name,Additional_Number_of_Scoring,Average_Score,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Longitude,Latitude
0,11 Cadogan Gardens,101.0,8.7,15.5,393.0,20.0,7.2,8.8,-0.2,51.5
1,1K Hotel,69.0,7.7,24.9,663.0,15.6,9.1,7.9,2.4,48.9
2,25hours Hotel beim MuseumsQuartier,391.0,8.8,16.2,4324.0,21.9,8.7,9.0,16.4,48.2
3,41,66.0,9.6,8.9,244.0,25.3,6.0,9.7,-0.1,51.5
4,45 Park Lane Dorchester Collection,27.0,9.4,6.8,68.0,11.5,7.2,9.6,-0.2,51.5


The difference between Average_Score and the mean of Reviewer_Score reflects the variability in guest experiences and can indicate changes in hotel quality over time. It highlights the importance of considering multiple metrics for a comprehensive evaluation.

In [15]:
# Preparations for the sentiment analysis

import numpy as np

# replacing "No Negative" and "No Positive" text with empty strings to ensure cleaner data for analysis
hotels['Negative_Review'] = hotels['Negative_Review'].replace('No Negative', '')
hotels['Positive_Review'] = hotels['Positive_Review'].replace('No Positive', '')

In [16]:
# Удаление лишних пробелов и специальных символов
hotels['Negative_Review'] = hotels['Negative_Review'].str.strip()
hotels['Positive_Review'] = hotels['Positive_Review'].str.strip()

# Приведение всех текстовых данных к нижнему регистру для унификации
hotels['Negative_Review'] = hotels['Negative_Review'].str.lower()
hotels['Positive_Review'] = hotels['Positive_Review'].str.lower()

In [17]:
# tokenization

tokens = hotels.split()
tokens = [token.lower() for token in tokens if token.isalpha()]

AttributeError: 'DataFrame' object has no attribute 'split'