In [2]:
import pandas as pd
import string
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
hotelDF = pd.read_csv('Hotel_Reviews.csv')
hotelDF.shape

(515738, 17)

In [4]:
#removing unwanted columns as we need hotel name, hotel address, tags and average score for recommendation system
unwanted_columns = ['Additional_Number_of_Scoring', 'Review_Date', 'Reviewer_Score',
       'Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given','days_since_review', 'lat', 'lng']

hotelDF.drop(columns=unwanted_columns, inplace=True)

hotelDF.columns

Index(['Hotel_Address', 'Average_Score', 'Hotel_Name', 'Tags'], dtype='object')

In [5]:
hotelDF.head()

Unnamed: 0,Hotel_Address,Average_Score,Hotel_Name,Tags
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,"[' Leisure trip ', ' Couple ', ' Duplex Double..."
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,"[' Leisure trip ', ' Couple ', ' Duplex Double..."
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,"[' Leisure trip ', ' Family with young childre..."
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,"[' Leisure trip ', ' Solo traveler ', ' Duplex..."
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,7.7,Hotel Arena,"[' Leisure trip ', ' Couple ', ' Suite ', ' St..."


In [6]:
hotelDF = hotelDF.groupby('Hotel_Name').agg({'Tags': ','.join, 'Hotel_Address': 'first', 'Average_Score': 'first'}).reset_index()

null_counts_all = hotelDF.isnull().sum()
print(null_counts_all)


Hotel_Name       0
Tags             0
Hotel_Address    0
Average_Score    0
dtype: int64


The above output shows there are no null values in the dataframe.

In [7]:
#converting the dataframe to pickeformat since it is faster to read and can handle complex nested structures
hotelDF.to_pickle('hotelDF.pkl')
#next we work on the pickle file

In [8]:
hotel_df = pd.read_pickle('hotelDF.pkl')
hotel_df.head()

Unnamed: 0,Hotel_Name,Tags,Hotel_Address,Average_Score
0,11 Cadogan Gardens,"[' Leisure trip ', ' Couple ', ' Superior Quee...",11 Cadogan Gardens Sloane Square Kensington an...,8.7
1,1K Hotel,"[' Leisure trip ', ' Couple ', ' Superior M Do...",13 Boulevard Du Temple 3rd arr 75003 Paris France,7.7
2,25hours Hotel beim MuseumsQuartier,"[' Leisure trip ', ' Solo traveler ', ' Standa...",Lerchenfelder Stra e 1 3 07 Neubau 1070 Vienna...,8.8
3,41,"[' Leisure trip ', ' Couple ', ' Executive Kin...",41 Buckingham Palace Road Westminster Borough ...,9.6
4,45 Park Lane Dorchester Collection,"[' Leisure trip ', ' Solo traveler ', ' Execut...",45 Park Lane Westminster Borough London W1K 1P...,9.4


Preprocessing the tags column to extract attributes as features that can be used to calculate similarity between hotels.
Extracting tags stored as string into a new column which contains a set of meaningful tags
removing some unnecessary tags such as submitted from a mobile device, which are not useful to help us create a recommender engine


In [9]:
exclude = set(string.punctuation)
def removespaces(hoteltags):
    new_tags = []
    newtags = ""
    for tag in hoteltags[1:][:-1].split(","):
        tag_without_spaces = "".join(character for character in tag.lower() if character not in exclude).replace(' ','')
        new_tags.append(tag_without_spaces)
    
    new_tags.remove('submittedfromamobiledevice')
    newtags = ','.join(new_tags)
    return newtags

hotel_df['new_tags'] = hotel_df['Tags'].map(removespaces)



In [29]:
print(hotel_df['new_tags'])

0       leisuretrip,couple,superiorqueenroom,stayed1ni...
1       leisuretrip,couple,superiormdoubleroom,stayed2...
2       leisuretrip,solotraveler,standarddoubleroom,st...
3       leisuretrip,couple,executivekingroomwithlounge...
4       leisuretrip,solotraveler,executivequeenroom,st...
                              ...                        
1487    leisuretrip,couple,doubleroom,stayed1night,lei...
1488    leisuretrip,solotraveler,doubleroom,stayed3nig...
1489    leisuretrip,solotraveler,doubleroom,stayed4nig...
1490    leisuretrip,couple,standarddoubleroom,stayed5n...
1491    leisuretrip,couple,pentastandardroom,stayed3ni...
Name: new_tags, Length: 1492, dtype: object


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(hotel_df['new_tags'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=hotel_df["Hotel_Name"])

top_tags = tfidf_df.apply(lambda row: row.nlargest(10).index.tolist(), axis=1).to_frame()


In [26]:
print(top_tags)

                                                                                    0
Hotel_Name                                                                           
11 Cadogan Gardens                  [superiorqueenroom, deluxekingroom, leisuretri...
1K Hotel                            [superiormdoubleroom, deluxeldoubleroom, leisu...
25hours Hotel beim MuseumsQuartier  [superiordoubleroomwithcityview, standarddoubl...
41                                  [executivekingroomwithloungeaccess, leisuretri...
45 Park Lane Dorchester Collection  [superiorkingroom, executivequeenroom, parkvie...
...                                                                               ...
citizenM London Bankside            [doubleroom, leisuretrip, couple, submittedfro...
citizenM London Shoreditch          [doubleroom, leisuretrip, stayed1night, submit...
citizenM Tower of London            [doubleroom, leisuretrip, submittedfromamobile...
every hotel Piccadilly              [standarddoubleroo

In [11]:
hotel_df = pd.merge(hotel_df, top_tags, on='Hotel_Name')

In [12]:
hotel_df.head()
hotel_df.rename(columns={0: "Top_10_Tags"}, inplace=True)
hotel_df.drop(columns='Tags', inplace=True)

In [13]:
hotel_df.head()

Unnamed: 0,Hotel_Name,Hotel_Address,Average_Score,new_tags,Top_10_Tags
0,11 Cadogan Gardens,11 Cadogan Gardens Sloane Square Kensington an...,8.7,"leisuretrip,couple,superiorqueenroom,stayed1ni...","[superiorqueenroom, deluxekingroom, leisuretri..."
1,1K Hotel,13 Boulevard Du Temple 3rd arr 75003 Paris France,7.7,"leisuretrip,couple,superiormdoubleroom,stayed2...","[superiormdoubleroom, deluxeldoubleroom, leisu..."
2,25hours Hotel beim MuseumsQuartier,Lerchenfelder Stra e 1 3 07 Neubau 1070 Vienna...,8.8,"leisuretrip,solotraveler,standarddoubleroom,st...","[superiordoubleroomwithcityview, standarddoubl..."
3,41,41 Buckingham Palace Road Westminster Borough ...,9.6,"leisuretrip,couple,executivekingroomwithlounge...","[executivekingroomwithloungeaccess, leisuretri..."
4,45 Park Lane Dorchester Collection,45 Park Lane Westminster Borough London W1K 1P...,9.4,"leisuretrip,solotraveler,executivequeenroom,st...","[superiorkingroom, executivequeenroom, parkvie..."


In [14]:
tag_list = []

def get_tag_elements(tag_string):
    global tag_list
    tag_list.extend(tag_string)
    return True

for i in hotel_df['Top_10_Tags']:
    get_tag_elements(i)

tag_set = set(tag_list)
print(tag_set)
len(tag_set)

{'marinatwinsuite', 'businesstrip', 'roomwithriverviewupperdeck', 'qualitydoubleortwinroomwithextrabed', 'doubleroomdisabilityaccess', 'comforttripleroom', 'classicroomwithgardenview', 'juniorsuitewithshower', 'standarddoubleroomdisabilityaccess', 'doublerooml', 'deluxejuniorsuite', 'deluxedoubleroomwithtwodoublebeds', 'doubleortwinroomwithparkview', 'coolcornersuite', 'onebedroomsuitewithkitchen', 'doubleortwinroomwithview', 'classicroomwithqueenbed', 'onebedfeatureroom', 'classicdoubleroomwithprivateterrace', 'superiordeluxeroom', 'superiordoubleortwinroomwithextrabed', 'superiordoubleroomwithgardenview', 'executivekingroomwithexecutiveloungeaccess', 'superiordoubleroom', 'deluxedoublewithwintergardenview', 'deluxefamilyroomwith1doubleand2singlebeds4adults', 'doubleroomcosy', 'studiowithcourtyardview', 'classicdoubleroomlespop', 'topdeluxeroom', 'juniorsuitewithtwinbeds', 'twinroomwithseaview', 'studiosuite', 'familyroomwithkingbedandsofabed', 'twoconnectingclassicdoublerooms', 'fami

1636

In [15]:
for i in tag_list:
    hotel_df[i] = 0

  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df[i] = 0
  hotel_df

In [16]:
hotel_df.head()

Unnamed: 0,Hotel_Name,Hotel_Address,Average_Score,new_tags,Top_10_Tags,superiorqueenroom,deluxekingroom,leisuretrip,submittedfromamobiledevice,couple,...,largetripleroom,superiordoubleroomwithfreepoolaccess,superiortwinroomwithfreepoolaccess,specialofferdoubleortwinroomwithparking,artkingroom,artroomwithiconicview,arttwinroom,pentastandardroom,pentaplusroom,pentajuniorsuite
0,11 Cadogan Gardens,11 Cadogan Gardens Sloane Square Kensington an...,8.7,"leisuretrip,couple,superiorqueenroom,stayed1ni...","[superiorqueenroom, deluxekingroom, leisuretri...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1K Hotel,13 Boulevard Du Temple 3rd arr 75003 Paris France,7.7,"leisuretrip,couple,superiormdoubleroom,stayed2...","[superiormdoubleroom, deluxeldoubleroom, leisu...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,25hours Hotel beim MuseumsQuartier,Lerchenfelder Stra e 1 3 07 Neubau 1070 Vienna...,8.8,"leisuretrip,solotraveler,standarddoubleroom,st...","[superiordoubleroomwithcityview, standarddoubl...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,41,41 Buckingham Palace Road Westminster Borough ...,9.6,"leisuretrip,couple,executivekingroomwithlounge...","[executivekingroomwithloungeaccess, leisuretri...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,45 Park Lane Dorchester Collection,45 Park Lane Westminster Borough London W1K 1P...,9.4,"leisuretrip,solotraveler,executivequeenroom,st...","[superiorkingroom, executivequeenroom, parkvie...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
only_top_10_tags=pd.DataFrame(hotel_df.Top_10_Tags.loc[:])

print(only_top_10_tags)

for index, row in only_top_10_tags.iterrows():
    top10tagset = set(row['Top_10_Tags'])
    
    for tag in top10tagset:
        #set corresponding column to 1
        if tag in tag_set:
            hotel_df.loc[index,tag]=1

                                            Top_10_Tags
0     [superiorqueenroom, deluxekingroom, leisuretri...
1     [superiormdoubleroom, deluxeldoubleroom, leisu...
2     [superiordoubleroomwithcityview, standarddoubl...
3     [executivekingroomwithloungeaccess, leisuretri...
4     [superiorkingroom, executivequeenroom, parkvie...
...                                                 ...
1487  [doubleroom, leisuretrip, couple, submittedfro...
1488  [doubleroom, leisuretrip, stayed1night, submit...
1489  [doubleroom, leisuretrip, submittedfromamobile...
1490  [standarddoubleroom, leisuretrip, deluxekingro...
1491  [pentastandardroom, pentaplusroom, leisuretrip...

[1492 rows x 1 columns]


In [18]:

#swap the position of avg score column and top10tags column

cols = list(hotel_df.columns)
avgscore_index, top10tags_index = cols.index('Average_Score'), cols.index('Top_10_Tags')
cols[avgscore_index], cols[top10tags_index] = cols[top10tags_index], cols[avgscore_index]

hotel_df = hotel_df[cols]

In [19]:
hotel_df.head()

Unnamed: 0,Hotel_Name,Hotel_Address,Top_10_Tags,new_tags,Average_Score,superiorqueenroom,deluxekingroom,leisuretrip,submittedfromamobiledevice,couple,...,largetripleroom,superiordoubleroomwithfreepoolaccess,superiortwinroomwithfreepoolaccess,specialofferdoubleortwinroomwithparking,artkingroom,artroomwithiconicview,arttwinroom,pentastandardroom,pentaplusroom,pentajuniorsuite
0,11 Cadogan Gardens,11 Cadogan Gardens Sloane Square Kensington an...,"[superiorqueenroom, deluxekingroom, leisuretri...","leisuretrip,couple,superiorqueenroom,stayed1ni...",8.7,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1K Hotel,13 Boulevard Du Temple 3rd arr 75003 Paris France,"[superiormdoubleroom, deluxeldoubleroom, leisu...","leisuretrip,couple,superiormdoubleroom,stayed2...",7.7,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,25hours Hotel beim MuseumsQuartier,Lerchenfelder Stra e 1 3 07 Neubau 1070 Vienna...,"[superiordoubleroomwithcityview, standarddoubl...","leisuretrip,solotraveler,standarddoubleroom,st...",8.8,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,41,41 Buckingham Palace Road Westminster Borough ...,"[executivekingroomwithloungeaccess, leisuretri...","leisuretrip,couple,executivekingroomwithlounge...",9.6,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,45 Park Lane Dorchester Collection,45 Park Lane Westminster Borough London W1K 1P...,"[superiorkingroom, executivequeenroom, parkvie...","leisuretrip,solotraveler,executivequeenroom,st...",9.4,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [20]:
similarityDF = pd.DataFrame(cosine_similarity(hotel_df.iloc[:, 4:], hotel_df.iloc[:,4:]))

In [21]:
print(similarityDF)

          0         1         2         3         4         5         6     \
0     1.000000  0.960224  0.942230  0.956785  0.956139  0.939756  0.953783   
1     0.960224  1.000000  0.934765  0.949904  0.949425  0.932638  0.947612   
2     0.942230  0.934765  1.000000  0.957320  0.945876  0.928398  0.942818   
3     0.956785  0.949904  0.957320  1.000000  0.960077  0.944010  0.957320   
4     0.956139  0.949425  0.945876  0.960077  1.000000  0.943199  0.945876   
...        ...       ...       ...       ...       ...       ...       ...   
1487  0.966259  0.961065  0.955542  0.969059  0.968552  0.941848  0.955542   
1488  0.966259  0.961065  0.955542  0.969059  0.968552  0.941848  0.955542   
1489  0.966259  0.961065  0.955542  0.969059  0.968552  0.941848  0.955542   
1490  0.965336  0.947612  0.965691  0.967900  0.956659  0.940313  0.954254   
1491  0.952326  0.946453  0.952771  0.966552  0.955028  0.938568  0.952771   

          7         8         9     ...      1482      1483    

In [22]:
def new_recommended_hotels(name, cosine_similarity):
    recommended_hotels = []
    hotel_index = hotel_df[(hotel_df.Hotel_Name == name)].index[0]

    similarity_scores = similarityDF[hotel_index]
    sorted_scores = similarity_scores.sort_values(ascending=False)

    top_index = list(sorted_scores.index)

    for i in range(1,11):
        recommended_hotels.append(hotel_df[hotel_df.index == top_index[i]]['Hotel_Name'].values[0])
    return recommended_hotels

In [23]:
new_recommended_hotels('Hotel Arena',cosine_similarity)

['Mercer Hotel Barcelona',
 'The Wittmore Adults Only',
 'The Guesthouse Vienna',
 'H tel D Aubusson',
 'Mill sime H tel',
 'Maison Souquet',
 'The Harmonie Vienna',
 'Grand H tel Du Palais Royal',
 'Primero Primera',
 'Residence Henri IV']

In [31]:
hotel_input = input('Enter name of hotel: ')
print('Hotels recommended to a user who liked ', hotel_input)
new_recommended_hotels(hotel_input,cosine_similarity)

Hotels recommended to a user who liked  Mercer Hotel Barcelona


['H tel D Aubusson',
 'Baglioni Hotel Carlton The Leading Hotels of the World',
 'Hotel Topazz',
 'Hotel Lam e',
 'Covent Garden Hotel',
 'The Guesthouse Vienna',
 'Nolinski Paris',
 'Monument Hotel',
 'Canal House',
 'Mill sime H tel']