# Hotel Recommendation - Item-based Collaborative Filtering

In [48]:
import pandas as pd
import numpy as np
import ast
import seaborn as sns

## 1. Data Preparation
- Exploring customer information (i.e. rating, author, date(s) stayed, # of days stayed) and hotel information.
- The tables were merged to give a comprehensive map of the customer, hotel and rating given to it.

In [49]:
offerings = pd.read_csv('./dataset/offerings.csv')
reviews = pd.read_csv('./dataset/reviews.csv')

In [50]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878561 entries, 0 to 878560
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   ratings            878561 non-null  object
 1   title              878561 non-null  object
 2   text               878561 non-null  object
 3   author             878561 non-null  object
 4   date_stayed        810967 non-null  object
 5   offering_id        878561 non-null  int64 
 6   num_helpful_votes  878561 non-null  int64 
 7   date               878561 non-null  object
 8   id                 878561 non-null  int64 
 9   via_mobile         878561 non-null  bool  
dtypes: bool(1), int64(3), object(6)
memory usage: 61.2+ MB


In [51]:
offerings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4333 entries, 0 to 4332
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   hotel_class  3141 non-null   float64
 1   region_id    4333 non-null   int64  
 2   url          4333 non-null   object 
 3   phone        0 non-null      float64
 4   details      0 non-null      float64
 5   address      4333 non-null   object 
 6   type         4333 non-null   object 
 7   id           4333 non-null   int64  
 8   name         4333 non-null   object 
dtypes: float64(3), int64(2), object(4)
memory usage: 304.8+ KB


In [52]:
offerings

Unnamed: 0,hotel_class,region_id,url,phone,details,address,type,id,name
0,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '147 West 4...",hotel,113317,Casablanca Hotel Times Square
1,5.0,32655,http://www.tripadvisor.com/Hotel_Review-g32655...,,,"{'region': 'CA', 'street-address': '300 S Dohe...",hotel,76049,Four Seasons Hotel Los Angeles at Beverly Hills
2,3.5,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '790 Eighth...",hotel,99352,Hilton Garden Inn Times Square
3,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '152 West 5...",hotel,93589,The Michelangelo Hotel
4,4.0,60763,http://www.tripadvisor.com/Hotel_Review-g60763...,,,"{'region': 'NY', 'street-address': '130 West 4...",hotel,217616,The Muse Hotel New York
...,...,...,...,...,...,...,...,...,...
4328,5.0,28970,http://www.tripadvisor.com/Hotel_Review-g28970...,,,"{'region': 'DC', 'street-address': '1201 24th ...",hotel,84090,Park Hyatt Washington
4329,5.0,28970,http://www.tripadvisor.com/Hotel_Review-g28970...,,,"{'region': 'DC', 'street-address': '2800 Penns...",hotel,84065,Four Seasons Washington D.C.
4330,4.0,28970,http://www.tripadvisor.com/Hotel_Review-g28970...,,,"{'region': 'DC', 'street-address': '2121 P Str...",hotel,84093,"Palomar Washington DC, a Kimpton Hotel"
4331,4.5,28970,http://www.tripadvisor.com/Hotel_Review-g28970...,,,"{'region': 'DC', 'street-address': '806 15th S...",hotel,235513,Sofitel Washington DC


In [53]:
reviews = reviews[["ratings", "author", "offering_id"]]
offerings = offerings[["id", "name"]]
reviews

Unnamed: 0,ratings,author,offering_id
0,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","{'username': 'Papa_Panda', 'num_cities': 22, '...",93338
1,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","{'username': 'Maureen V', 'num_reviews': 2, 'n...",93338
2,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...","{'username': 'vuguru', 'num_cities': 12, 'num_...",1762573
3,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...","{'username': 'Hotel-Designer', 'num_cities': 5...",1762573
4,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...","{'username': 'JamesE339', 'num_cities': 34, 'n...",1762573
...,...,...,...
878556,{'overall': 4.0},"{'username': '', 'id': '', 'location': ''}",84093
878557,"{'service': 4.0, 'cleanliness': 5.0, 'overall'...","{'username': 'dan016', 'num_reviews': 2, 'num_...",84093
878558,"{'cleanliness': 5.0, 'overall': 5.0, 'rooms': ...","{'username': '', 'id': '', 'location': ''}",84093
878559,"{'cleanliness': 5.0, 'overall': 5.0, 'rooms': ...","{'username': '', 'id': '', 'location': ''}",84093


In [54]:
reviews["ratings"] = reviews["ratings"].apply(ast.literal_eval)
reviews["author"] = reviews["author"].apply(ast.literal_eval)

In [55]:
reviews["ratings"] = reviews["ratings"].transform(lambda x: x["overall"])
reviews["author"] = reviews["author"].transform(lambda x: x["username"])

In [56]:
# Final dataset
reviews.head()

Unnamed: 0,ratings,author,offering_id
0,5.0,Papa_Panda,93338
1,5.0,Maureen V,93338
2,4.0,vuguru,1762573
3,4.0,Hotel-Designer,1762573
4,4.0,JamesE339,1762573


In [62]:
N = 500
hotel_reviews = pd.merge(reviews, offerings, left_on="offering_id", right_on="id").drop(columns=["offering_id", "id"])
hotel_reviews = hotel_reviews.sample(N)

In [63]:
review_matrix = pd.pivot_table(hotel_reviews, index="author", columns="name", values="ratings")

In [64]:
# Review Matrix (columns - hotels, rows - customers) & fill NaN with 1 (neutral rating)
review_matrix[review_matrix.isna()] = 0
review_matrix.head()

name,6 Columbus - A Thompson Hotel,Affinia 50,Affinia Manhattan,Affinia Shelburne,Amalfi Hotel Chicago,Americas Best Value Inn & Suites-SOMA,Andaz Wall Street,Andrews Hotel,Argonaut Hotel - a Kimpton Hotel,Arizona Biltmore,...,Waldorf Astoria New York,Warwick New York Hotel,Warwick Seattle Hotel,Washington Plaza,Westin San Diego,"White Swan Inn, a Joie de Vivre Hotel",Wolcott Hotel,World Center Hotel,Wyndham Garden Austin,"nyma, the New York Manhattan Hotel"
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18kimmr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1978thetrav,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2harriedmom,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AMS137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Recommendation Function

The function takes te user's highest rated hotel, calculates the cosine similarity across the user-rating matrix to recommend the most "similarly" rated hotel.

In [65]:
def recommend_hotels(user_id, k):
    rating = review_matrix.iloc[user_id].values

    rated_hotel_idx = rating.argmax()   
    hotel_names = pd.DataFrame(review_matrix.unstack().reset_index()["name"].unique())

    best_hotel = hotel_names.iloc[rated_hotel_idx].values
    rated_hotel = review_matrix.iloc[:,rated_hotel_idx]
    
    cosine_similarity = np.dot(review_matrix.T, rated_hotel) / ((np.linalg.norm(review_matrix.T, axis=1) * np.linalg.norm(rated_hotel)))
    closest_idx = pd.DataFrame(cosine_similarity).sort_values(by=0, ascending=False).iloc[1:k+1].index
    recommended = review_matrix.columns[closest_idx]

    return best_hotel, recommended

In [66]:
# Results
K= 10
for i in range(1,K):
    best_hotel, recommended = recommend_hotels(i, 5)
    print(f"User {i} - Best Hotel: {best_hotel}, Recommended Hotels: {recommended}.")
    print(" ")

User 1 - Best Hotel: ['Fairfield Inn Denver Airport'], Recommended Hotels: Index(['Beekman Tower Hotel', 'Trump SoHo New York',
       'Trump International Hotel and Tower',
       'Americas Best Value Inn & Suites-SOMA',
       'nyma, the New York Manhattan Hotel'],
      dtype='object', name='name').
 
User 2 - Best Hotel: ['Super 8 Motel Los Angeles Downtown'], Recommended Hotels: Index(['Trump SoHo New York', 'Trump International Hotel and Tower',
       'Americas Best Value Inn & Suites-SOMA',
       'nyma, the New York Manhattan Hotel', 'Amalfi Hotel Chicago'],
      dtype='object', name='name').
 
User 3 - Best Hotel: ['Le Parker Meridien'], Recommended Hotels: Index(['Trump SoHo New York', 'Trump International Hotel and Tower',
       'Americas Best Value Inn & Suites-SOMA',
       'nyma, the New York Manhattan Hotel', 'Amalfi Hotel Chicago'],
      dtype='object', name='name').
 
User 4 - Best Hotel: ['Park Central'], Recommended Hotels: Index(['Four Seasons Hotel San Francisc