In [76]:
import pandas as pd
import numpy as np  

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split

from collections import defaultdict
import os

#pd.set_option('display.float_format', lambda x: '%.0f' % x)
pd.set_option('display.float_format', lambda x: '%d' % x)

In [77]:
# Information about individual channels/hotels
data_lake_prd_314410_cz_canais = pd.read_csv('../data/lookups/data-lake-prd-314410.cz.canais.csv')
data_lake_prd_314410_cz_hoteis = pd.read_csv('../data/lookups/data-lake-prd-314410.cz.hoteis.csv')
data_lake_prd_314410_cz_cidades = pd.read_csv('../data/lookups/data-lake-prd-314410.cz.cidades.csv')


In [78]:
# List of hotel-channel combinations as of January 2025
hotel_city_chanel_combin_extract  = pd.read_csv('../data/other/hotel_city_chanel_combin_extract.csv')
hotel_city_chanel_combin_extract.dropna(inplace=True)
hotel_city_chanel_combin_extract.drop(columns=['Cidade_ID'], inplace=True)
hotel_city_chanel_combin_extract.drop_duplicates(inplace=True)

In [79]:
print(hotel_city_chanel_combin_extract.nunique())
print(hotel_city_chanel_combin_extract.shape)
print(732*14829)

Canal_ID      732
Hotel_ID    14829
dtype: int64
(333652, 2)
10854828


In [80]:
hotel_city_chanel_combin_extract['Hotel_ID'].max()

1843418434.0

In [81]:
hotel_city_chanel_combin_extract = hotel_city_chanel_combin_extract.merge(
    data_lake_prd_314410_cz_hoteis[['Hotel_ID', 'StatusHotel']],
    on='Hotel_ID',
    how='left'
)

In [82]:
hotel_city_chanel_combin_extract['StatusHotel'].value_counts()

StatusHotel
3. Ativo             255600
4. Inativo            47988
1. Demo/Teste         18288
2. Em Implantação       524
Name: count, dtype: int64

In [83]:
hotel_city_chanel_combin_extract = hotel_city_chanel_combin_extract[ 
    (hotel_city_chanel_combin_extract['StatusHotel'] == '3. Ativo') | 
    (hotel_city_chanel_combin_extract['StatusHotel'] == '4. Inativo')]

In [84]:
hotel_city_chanel_combin_extract['StatusHotel'].value_counts()

StatusHotel
3. Ativo      255600
4. Inativo     47988
Name: count, dtype: int64

In [85]:
hotel_city_chanel_combin_extract = hotel_city_chanel_combin_extract.drop(columns=['StatusHotel'])

In [86]:
hotel_city_chanel_combin_extract

Unnamed: 0,Canal_ID,Hotel_ID
0,252,11332
1,124,16573
2,676,12517
3,104,5882
4,627,5124
...,...,...
333647,1023,15882
333648,119,19278
333649,942,15845
333650,677,16184


In [87]:
unique_hotels_lookupID = data_lake_prd_314410_cz_hoteis['Hotel_ID'].unique()
unique_chanels_lookupID = data_lake_prd_314410_cz_canais['Canal_ID'].unique()

In [88]:
unique_hotels_lookupID

array([ 3728,  3765,  4876, ..., 20764, 20366, 20822], dtype=int64)

In [89]:
hotel_city_chanel_combin_extract['Canal_ID'] = hotel_city_chanel_combin_extract['Canal_ID'].astype('int64')
hotel_city_chanel_combin_extract['Hotel_ID'] = hotel_city_chanel_combin_extract['Hotel_ID'].astype('int64')

In [90]:
hotel_city_chanel_combin_extract = hotel_city_chanel_combin_extract[
    hotel_city_chanel_combin_extract['Hotel_ID'].isin(unique_hotels_lookupID)
]

hotel_city_chanel_combin_extract = hotel_city_chanel_combin_extract[
    hotel_city_chanel_combin_extract['Canal_ID'].isin(unique_chanels_lookupID)
]


In [91]:
hotel_city_chanel_combin_extract['Hotel_ID'].max()

20831

In [92]:
print(hotel_city_chanel_combin_extract.nunique())
print(hotel_city_chanel_combin_extract.shape)
print(720*6988)

Canal_ID      725
Hotel_ID    10688
dtype: int64
(303587, 2)
5031360


# Singular Value Decomposition (SVD) 

In [93]:
# Pivot the table
pivot_table = hotel_city_chanel_combin_extract.pivot_table(index='Hotel_ID', columns='Canal_ID', aggfunc='size', fill_value=0)

# Convert the table to binary (1 where the combination existed, 0 otherwise)
pivot_table = pivot_table.map(lambda x: 1 if x > 0 else 0)

In [94]:
# Count the number of 1s for each column
counts_per_channel = pivot_table.sum().sort_values(ascending=False)

In [95]:
counts_per_channel.sort_values(ascending=False)

Canal_ID
124     2123
908     2107
1195    2043
157     2041
1145    2022
        ... 
311        1
253        1
267        1
1094       1
1218       1
Length: 725, dtype: int64

In [96]:
df

Canal_ID,1,2,6,7,8,10,30,31,32,34,...,1372,1374,1375,1376,1377,1381,1382,1387,1389,1390
Hotel_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20817,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20825,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
# Step 1: Prepare the data matrix

# Loops through the rows (hotels) and columns (channels) of the wide matrix above
# Extracts ratings from the DataFrame and stores them as (hotel, channel, rating) tuples
# Creates a new Pandas DataFrame (ratings_df) with three columns:
# -hotel: The identifier of the hotel.
# -channel: The distribution channel (e.g., Expedia, Booking.com, etc.).
# -rating: The rating or score between 0 and 1.
# Prepares the data for the Surprise library. Reader(rating_scale=(0, 1)) tells Surprise that ratings range from 0 to 1.
# Dataset.load_from_df(ratings_df, reader) converts the DataFrame into a Surprise Dataset.


def prepare_data(df):
    ratings = []
    for hotel in df.index:
        for channel in df.columns:
            ratings.append((hotel, channel, df.loc[hotel, channel]))

    ratings_df = pd.DataFrame(ratings, columns=['hotel', 'channel', 'rating'])
    reader = Reader(rating_scale=(0, 1))
    return Dataset.load_from_df(ratings_df, reader)


In [98]:
# Step 2: Train the model

# Splits the Data into Training & Test Sets
# model = SVD(n_factors=100, n_epochs=100, lr_all=0.005, reg_all=0.02)
# SVD is a matrix factorization model used in collaborative filtering.
#Parameters:
# n_factors=100 → Number of latent factors (hidden features) in the model.
# n_epochs=100 → Number of training iterations.
# lr_all=0.005 → Learning rate for gradient descent.
# reg_all=0.02 → Regularization term to prevent overfitting.


def train_model(data):
    trainset, testset = train_test_split(data, test_size=0.25, random_state=42)
    model = SVD(n_factors=100, n_epochs=500, lr_all=0.005, reg_all=0.02)
    model.fit(trainset)
    return model, trainset

In [99]:
# Step 3: Generate recommendations

# Processes the prediction results from the SVD model and extracts the top N recommendations for each hotel.
# Creates a Dictionary to Store Recommendations
# top_n is a dictionary where:
# -Keys = uid (hotel ID).
# -Values = A list of tuples (iid, est), where:
# -iid = channel ID.
# -est = predicted rating.
# Processes Predictions and Stores Estimated Ratings
# for uid, iid, true_r, est, _ in predictions:
#    top_n[uid].append((iid, est))
# The predictions list contains tuples with:
# -uid: Hotel ID
# -iid: Channel ID
# -true_r: Actual rating (ignored in this function)
# -est: Predicted rating (used for ranking)
# Stores (iid, est) in top_n[uid] for each hotel.
# Sorts Channels by Predicted Rating in Descending Order
#  for uid, user_ratings in top_n.items():
#     user_ratings.sort(key=lambda x: x[1], reverse=True)
#      top_n[uid] = user_ratings[:n]
# Sorts the channels for each hotel based on estimated rating (est).
# Keeps only the top N channels with the highest predicted ratings.
# Returns the Dictionary with Top N Recommendations

def get_top_n_recommendations(predictions):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings
    
    return top_n

In [100]:
# Step 4: Recommend channels

# Generates top N recommended channels for a given hotel using a trained SVD model.
# Extracts All Available Channels from the Dataset
# Creates a Test Set for the Given Hotel
# Generates a list of test samples where:
# -hotel_id: The hotel we want recommendations for.
# -iid: Each possible channel.
# -0: A placeholder rating (it will be predicted).
# Uses the SVD model to predict ratings for all channels.
# Returns the Top N Channels for the Hotel

def recommend_channels(hotel_id, model, data):
    iids = data.df['channel'].unique()
    testset = [(hotel_id, iid, 0) for iid in iids]
    predictions = model.test(testset)
    top_n = get_top_n_recommendations(predictions)
    return top_n[hotel_id]

In [101]:
# Main execution

df = pivot_table
data = prepare_data(df)
model, trainset = train_model(data)


In [102]:
# Example usage for 1 hotel
hotel_id = df.index[10687]  # Choose a hotel

recommendations = recommend_channels(hotel_id, model, data)

print(f"Recommended channels for hotel {int(hotel_id)}:")

for channel, score in recommendations:
    print(f"{channel}: {score:.4f}")

Recommended channels for hotel 20831:
157: 0.1804
1145: 0.1670
255: 0.1571
124: 0.1529
1195: 0.1502
908: 0.1445
201: 0.1424
1210: 0.1419
833: 0.1402
810: 0.1400
607: 0.1380
1168: 0.1349
608: 0.1312
1234: 0.1305
357: 0.1301
310: 0.1212
104: 0.1205
225: 0.1133
889: 0.1123
1351: 0.1116
440: 0.1087
71: 0.1063
252: 0.1059
315: 0.1048
903: 0.1010
1: 0.0985
269: 0.0984
1070: 0.0964
396: 0.0930
256: 0.0905
399: 0.0889
646: 0.0874
289: 0.0870
92: 0.0858
76: 0.0856
172: 0.0839
292: 0.0835
418: 0.0834
677: 0.0833
89: 0.0831
368: 0.0826
346: 0.0817
532: 0.0812
812: 0.0807
314: 0.0800
127: 0.0798
1343: 0.0793
1181: 0.0791
1199: 0.0786
340: 0.0780
302: 0.0774
994: 0.0759
1102: 0.0753
672: 0.0751
330: 0.0746
221: 0.0741
358: 0.0736
125: 0.0730
277: 0.0729
128: 0.0711
594: 0.0699
938: 0.0697
533: 0.0696
390: 0.0693
826: 0.0668
320: 0.0663
1298: 0.0658
937: 0.0651
558: 0.0651
1349: 0.0644
270: 0.0643
1346: 0.0639
142: 0.0639
964: 0.0633
175: 0.0602
1129: 0.0596
976: 0.0589
1311: 0.0577
228: 0.0576
739:

In [103]:

def recommend_channels_exclude_existing(hotel_id, model, data, existing_channels):
    # Get unique channel IDs from the data
    iids = data.df['channel'].unique()
    
    # Generate test set for the given hotel
    testset = [(hotel_id, iid, 0) for iid in iids]
    
    # Get predictions for the test set
    predictions = model.test(testset)
    
    # Get top N recommendations
    top_n = get_top_n_recommendations(predictions)
    
    # Get the list of channels that the hotel already has
    existing_hotel_channels = existing_channels[existing_channels['Hotel_ID'] == hotel_id]['Canal_ID'].values
    
    # Exclude the channels that are already associated with the hotel
    filtered_recommendations = [rec for rec in top_n[hotel_id] if rec[0] not in existing_hotel_channels]
    
    return filtered_recommendations

In [104]:
# Create a dictionary to store the recommendations for each hotel
recommendations_dict = {}

# Loop through each hotel in df and get the all recommended channels
for hotel_id in df.index:
    recommendations = recommend_channels_exclude_existing(hotel_id, model, data, hotel_city_chanel_combin_extract)
    
    # Store the recommendations in the dictionary
    recommendations_dict[hotel_id] = recommendations

In [105]:
len(recommendations_dict.keys())

10688

In [106]:
# Count the number of recommendations per hotel and sort by count
recommendation_counts = {hotel_id: len(recs) for hotel_id, recs in recommendations_dict.items()}
sorted_counts = sorted(recommendation_counts.items(), key=lambda x: x[1], reverse=False)
sorted_counts

[(13992, 617),
 (7408, 621),
 (7415, 622),
 (4484, 627),
 (6846, 627),
 (10970, 627),
 (7416, 628),
 (10385, 629),
 (3139, 630),
 (17285, 630),
 (4921, 631),
 (5098, 631),
 (5411, 632),
 (10666, 632),
 (5392, 634),
 (8450, 634),
 (9234, 634),
 (17686, 634),
 (5512, 635),
 (5513, 635),
 (8621, 635),
 (4485, 636),
 (6215, 636),
 (17277, 636),
 (5208, 637),
 (7410, 637),
 (8950, 637),
 (4489, 638),
 (6580, 638),
 (18586, 638),
 (4013, 639),
 (12271, 639),
 (16701, 639),
 (2181, 640),
 (4488, 640),
 (5514, 640),
 (18046, 640),
 (1112, 641),
 (4523, 641),
 (5354, 641),
 (19088, 641),
 (6036, 642),
 (6756, 642),
 (7805, 642),
 (8991, 642),
 (9470, 642),
 (4273, 643),
 (4893, 643),
 (5975, 643),
 (7364, 643),
 (10361, 643),
 (16653, 643),
 (17284, 643),
 (2973, 644),
 (3128, 644),
 (3769, 644),
 (5054, 644),
 (6782, 644),
 (13154, 644),
 (17459, 644),
 (1485, 645),
 (3048, 645),
 (6632, 645),
 (10667, 645),
 (13264, 645),
 (13671, 645),
 (14140, 645),
 (17969, 645),
 (2080, 646),
 (3617, 646)

In [107]:
import pickle

# Save the recommendations_dict using pickle
# Pickle serializes (converts) Python objects into a binary format for storage or transfer 
# Then deserializes (restores) them back to their original form when needed. 
# Serialization (Pickling): The process of converting a Python object into a byte stream (binary data) that can be saved to a file or sent over a network.
# Deserialization (Unpickling): The process of reading a byte stream (binary data) and converting it back into a Python object.
# Pickle uses a binary format to represent Python objects (not human-readable).

with open('../out/recommendations_dict.pkl', 'wb') as f:
    pickle.dump(recommendations_dict, f)

In [108]:
# Load the recommendations_dict using pickle
with open('../out/recommendations_dict.pkl', 'rb') as f:
    loaded_recommendations_dict = pickle.load(f)

In [109]:
flattened_data = []

for hotel_id, recommendations in recommendations_dict.items():
    for channel_ID, score in recommendations:
        flattened_data.append({
            'Hotel_ID': hotel_id,
            'Channel_ID': channel_ID,
            'Score': score  
        })


flattened_data = pd.DataFrame(flattened_data)

In [110]:
pd.set_option('display.float_format', lambda x: '%.4f' % x)

flattened_data

Unnamed: 0,Hotel_ID,Channel_ID,Score
0,7,157,0.1799
1,7,1145,0.1664
2,7,255,0.1565
3,7,124,0.1528
4,7,1195,0.1497
...,...,...,...
7445208,20831,1381,0.0000
7445209,20831,1382,0.0000
7445210,20831,1387,0.0000
7445211,20831,1389,0.0000


In [111]:
flattened_data['Hotel_ID'] = flattened_data['Hotel_ID'].astype(int)
flattened_data['Channel_ID'] = flattened_data['Channel_ID'].astype(int)

In [112]:
flattened_data.to_csv('../out/svd_hotel_channel_recommendations_df_all.csv', index=False)

## Check Recommendations based on hotel similarity (original SVD or cosine similarity)

In [113]:
flattened_data = pd.read_csv('../out/svd_hotel_channel_recommendations_df_all.csv')

# SVD similariy

In [115]:
def compute_hotel_similarity_from_model(model, trainset):
    hotel_inner_ids = list(trainset.all_users())
    hotel_vectors = np.array([model.pu[uid] for uid in hotel_inner_ids])
    sim_matrix = cosine_similarity(hotel_vectors)

    # Map internal IDs back to hotel IDs
    hotel_ids = [trainset.to_raw_uid(uid) for uid in hotel_inner_ids]
    sim_df = pd.DataFrame(sim_matrix, index=hotel_ids, columns=hotel_ids)
    return sim_df

In [118]:
similarity_df = compute_hotel_similarity_from_model(model, trainset)
top10 = similarity_df.loc[hotel_id].sort_values(ascending=False).head(10)  # top 10 most similar hotels
bottom10 = similarity_df.loc[hotel_id].sort_values(ascending=True).head(10)  # top 10 most similar hotels

# Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def compute_hotel_similarity(pivot_table):
    """
    Computes cosine similarity between all hotels in the pivot table.
    
    Parameters:
    - pivot_table: DataFrame with Hotel_ID as index, Canal_IDs as binary columns
    
    Returns:
    - similarity_df: DataFrame with cosine similarity values between hotels
    """

    similarity_matrix = cosine_similarity(pivot_table)
    similarity_df = pd.DataFrame(similarity_matrix, 
                                 index=pivot_table.index, 
                                 columns=pivot_table.index)
    return similarity_df

In [None]:
def get_closest_hotels(hotel_id, similarity_df, top_n=10, direction="top"):
    """
    Returns the top_n most similar or least similar hotels to the given hotel_id.
    
    Parameters:
    - hotel_id: The ID of the hotel for which we want recommendations
    - similarity_df: DataFrame with cosine similarity values between hotels
    - top_n: Number of similar hotels to return (top or bottom)
    - direction: "top" for most similar, "bottom" for least similar
    
    Returns:
    - closest_hotels: List of hotel IDs of the most or least similar hotels
    """
    # Get the similarity scores for the given hotel_id
    similarity_scores = similarity_df[hotel_id].sort_values(ascending=False)
    
    # Exclude the hotel itself (first entry)
    similarity_scores = similarity_scores.iloc[1:]
    
    if direction == "top":
        # Get the top N most similar hotels (highest similarity)
        closest_hotels = similarity_scores.head(top_n)
    elif direction == "bottom":
        # Get the bottom N least similar hotels (lowest similarity)
        closest_hotels = similarity_scores.tail(top_n)
    else:
        raise ValueError("Direction must be 'top' or 'bottom'")
    
    return closest_hotels.index.tolist(), closest_hotels.values.tolist()


In [None]:
similarity_df = compute_hotel_similarity(pivot_table)

# Example: Get the top 10 closest hotels for hotel with ID '2'
hotel_id = 6597  # Example hotel ID
top_n = 25
similar_hotels, similarity_values = get_closest_hotels(hotel_id, similarity_df, top_n, 'top')
print("Most similar hotels to hotel", hotel_id, ":", similar_hotels)


In [None]:
def get_top_similar_hotels(similarity_df, hotel_id, top_n=5):
    """
    Get the top-N most similar hotels to a given hotel.
    
    Parameters:
    - similarity_df: DataFrame from compute_svd_similarity
    - hotel_id: raw hotel ID
    - top_n: number of similar hotels to return

    Returns:
    - Series of top similar hotel IDs with similarity scores
    """
    if hotel_id not in similarity_df.index:
        raise ValueError("Hotel ID not found in similarity matrix")
    
    similarities = similarity_df.loc[hotel_id].drop(hotel_id)  # exclude self
    return similarities.sort_values(ascending=False).head(top_n)

In [121]:
def get_channel_recommendations(hotel_id, flattened_data):
    """
    Returns the top 100 recommended channels for a given hotel.
    
    Parameters:
    - hotel_id: The ID of the hotel
    - flattened_data: DataFrame with Hotel_ID, Channel_ID, and Score columns
    
    Returns:
    - recommended_channels: Set of Channel_IDs recommended for the hotel
    """
     # Filter recommendations for the given hotel
    hotel_data = flattened_data[flattened_data['Hotel_ID'] == hotel_id]
    
    # Sort by score in descending order and take top 100
    top_channels = hotel_data.sort_values(by='Score', ascending=False).head(100)
    
    # Convert to set of Channel_IDs
    recommended_channels = set(top_channels['Channel_ID'])
    
    return recommended_channels

In [None]:
# Example: Get channel recommendations for the input hotel and closest hotels
input_hotel = hotel_id  # Example hotel ID

closest_hotels = bottom10.index.tolist() # similar_hotels  <- change to this to keep old definiton # Example closest hotels

input_hotel_channels = get_channel_recommendations(input_hotel, flattened_data)

closest_hotels_channels = {
    hotel: get_channel_recommendations(hotel, flattened_data) for hotel in closest_hotels
}

# Print the channel recommendations for the input hotel and its closest hotels
print("Input Hotel Channels:", input_hotel_channels)
for hotel, channels in closest_hotels_channels.items():
    print(f"Channels for Hotel {hotel}: {channels}")

Input Hotel Channels: {1024, 1, 533, 554, 1070, 558, 1093, 71, 76, 1102, 594, 1108, 89, 92, 607, 608, 104, 1129, 626, 627, 1145, 635, 124, 125, 127, 128, 646, 136, 142, 1168, 157, 1181, 672, 677, 1195, 172, 1199, 175, 1200, 1210, 201, 209, 1234, 221, 225, 1249, 739, 228, 231, 763, 252, 255, 256, 269, 270, 1298, 277, 791, 1311, 289, 292, 805, 810, 812, 302, 310, 314, 315, 826, 1343, 320, 833, 1346, 1349, 1351, 330, 340, 346, 865, 357, 358, 368, 889, 384, 390, 903, 908, 396, 399, 408, 414, 418, 937, 938, 940, 440, 964, 976, 987, 994}
Channels for Hotel 6193: {1024, 1, 532, 533, 1070, 558, 1093, 1102, 594, 1108, 597, 87, 89, 92, 607, 104, 1129, 626, 627, 115, 1138, 1145, 122, 635, 124, 125, 126, 127, 128, 646, 136, 142, 1167, 1168, 157, 1181, 672, 1199, 175, 689, 201, 209, 1234, 1241, 729, 221, 1249, 739, 231, 1273, 763, 252, 255, 256, 269, 783, 1298, 277, 791, 1311, 289, 292, 805, 810, 812, 302, 814, 820, 310, 826, 315, 320, 833, 1346, 1349, 1351, 330, 340, 346, 1376, 865, 357, 358, 368,

In [135]:
def find_channel_intersection(input_hotel_channels, closest_hotels_channels):
    """
    Finds the intersection of recommended channels between the input hotel and each of its closest hotels.
    
    Parameters:
    - input_hotel_channels: Set of recommended channels for the input hotel
    - closest_hotels_channels: Dictionary with hotel IDs as keys and sets of recommended channels as values
    
    Returns:
    - intersections: Dictionary with hotel IDs as keys and the intersection of recommended channels as values
    """
    intersections = {}
    
    for hotel_id, channels in closest_hotels_channels.items():
        intersection = input_hotel_channels.intersection(channels)
        intersections[hotel_id] = intersection
    
    return intersections

# Get intersections of channel recommendations
channel_intersections = find_channel_intersection(input_hotel_channels, closest_hotels_channels)

# Print the intersections
print("\nChannel Recommendations Intersections:")

for hotel, intersection in channel_intersections.items():
    print(f"Intersection with Hotel {hotel}: {intersection}")



Channel Recommendations Intersections:
Intersection with Hotel 6193: {1024, 1, 533, 1070, 558, 1093, 1102, 594, 1108, 89, 92, 607, 104, 1129, 626, 627, 1145, 635, 124, 125, 127, 128, 646, 136, 142, 1168, 157, 1181, 672, 1199, 175, 201, 209, 1234, 221, 1249, 739, 231, 763, 252, 255, 256, 269, 1298, 277, 791, 1311, 289, 292, 805, 810, 812, 302, 310, 826, 315, 320, 833, 1346, 1349, 1351, 330, 340, 346, 865, 357, 358, 368, 889, 384, 390, 396, 399, 414, 418, 937, 440, 964, 976, 994}
Intersection with Hotel 14099: {1024, 1, 533, 554, 1070, 558, 1093, 71, 76, 1102, 1108, 89, 92, 607, 104, 1129, 626, 627, 1145, 635, 128, 646, 142, 1168, 157, 1181, 677, 1195, 172, 1199, 1200, 1210, 201, 209, 221, 225, 1249, 739, 228, 231, 252, 256, 269, 1298, 277, 791, 1311, 289, 292, 805, 810, 812, 302, 310, 314, 315, 826, 320, 833, 1346, 1349, 1351, 330, 340, 346, 358, 368, 889, 384, 390, 903, 908, 396, 399, 414, 418, 937, 938, 940, 440, 964, 976, 994}
Intersection with Hotel 8998: {1, 558, 1070, 71, 1102, 5

In [136]:
# Print the length of the intersection for each hotel in the dictionary
print("Number of common channel recommendations:")

for hotel_id, intersection in channel_intersections.items():
    print(f"Hotel {hotel_id}: {len(intersection)} common channels")

Number of common channel recommendations:
Hotel 6193: 80 common channels
Hotel 14099: 83 common channels
Hotel 8998: 78 common channels
Hotel 17900: 71 common channels
Hotel 6684: 76 common channels
Hotel 2180: 83 common channels
Hotel 6703: 78 common channels
Hotel 10628: 78 common channels
Hotel 2240: 79 common channels
Hotel 10523: 78 common channels


In [137]:
# U matrix: hotel latent features
hotel_latents = np.array([model.pu[trainset.to_inner_uid(uid)] for uid in df.index if trainset.knows_user(uid)])

# Check variance across the matrix
print("Latent vector variance:", np.var(hotel_latents, axis=0).mean())

Latent vector variance: 0.00012660755141846345


In [138]:
pivot_table

Canal_ID,1,2,6,7,8,10,30,31,32,34,...,1372,1374,1375,1376,1377,1381,1382,1387,1389,1390
Hotel_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20817,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20825,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [142]:
pivot_table.sum().sum()/(10688*725)

0.03917858249019203

In [None]:
hotel_city_chanel_combin_extract.groupby('Hotel_ID')['Canal_ID'].nunique().sort_values(ascending=True)

In [None]:
merged_data = flattened_data.merge(
    data_lake_prd_314410_cz_hoteis[['Hotel_ID', 'StatusHotel', 'CategoriaHotel', 'Cidade_ID']],
    on='Hotel_ID',
    how='left'
)

merged_data

In [None]:
merged_data.groupby('StatusHotel').size().reset_index(name='Count')


In [None]:
data_lake_prd_314410_cz_cidades

In [None]:
merged_data = merged_data.merge(
    data_lake_prd_314410_cz_cidades[['Cidade_ID', 'Cidade', 'Pais']],
    on='Cidade_ID',
    how='left'
)

merged_data



In [None]:
merged_data.dropna(inplace=True)
merged_data

In [None]:
all_country_flags = hotel_city_chanel_combin_extract.merge(
    data_lake_prd_314410_cz_hoteis[['Hotel_ID', 'Cidade_ID']],
    on='Hotel_ID',
    how='left'
)


In [None]:
all_country_flags = all_country_flags.merge(
    data_lake_prd_314410_cz_cidades[['Cidade_ID', 'Pais']],
    on='Cidade_ID',
    how='left'
)

In [None]:
all_country_flags = all_country_flags[['Canal_ID', 'Pais']].drop_duplicates().sort_values(by='Canal_ID')

In [None]:
all_country_flags

In [None]:
all_country_flags.dropna(inplace=True)

In [None]:
# Step 1: Group all_country_flags by Canal_ID and aggregate the Pais values into sets for fast lookup
channel_country_map = all_country_flags.groupby('Canal_ID')['Pais'].apply(set).to_dict()

In [None]:
# Step 2: Define a function to check the match
def flag_match(row):
    channel_id = row['Channel_ID']
    pais = row['Pais']
    # Check if the channel_id exists and if pais is in the set
    return int(pais in channel_country_map.get(channel_id, set()))

In [None]:
# Step 3: Apply the function to merged_data
merged_data['bookings_same_country'] = merged_data.apply(flag_match, axis=1)

In [None]:
merged_data

In [None]:
channel_country_map.get(1389)

In [None]:
merged_data = merged_data[merged_data['bookings_same_country'] == 1]

In [None]:
similarity_df = compute_hotel_similarity(pivot_table)

# Example: Get the top 10 closest hotels for hotel with ID '2'
hotel_id = 6597  # Example hotel ID
top_n = 25
similar_hotels, similarity_values = get_closest_hotels(hotel_id, similarity_df, top_n, 'bottom')
print("Most similar hotels to hotel", hotel_id, ":", similar_hotels)



def get_channel_recommendations(hotel_id, flattened_data):
    """
    Returns the top 100 recommended channels for a given hotel.
    
    Parameters:
    - hotel_id: The ID of the hotel
    - flattened_data: DataFrame with Hotel_ID, Channel_ID, and Score columns
    
    Returns:
    - recommended_channels: Set of Channel_IDs recommended for the hotel
    """
     # Filter recommendations for the given hotel
    hotel_data = flattened_data[flattened_data['Hotel_ID'] == hotel_id]
    
    # Sort by score in descending order and take top 100
    top_channels = hotel_data.sort_values(by='Score', ascending=False).head(100)
    
    # Convert to set of Channel_IDs
    recommended_channels = set(top_channels['Channel_ID'])
    
    return recommended_channels

In [None]:
# Example: Get channel recommendations for the input hotel and closest hotels
input_hotel = hotel_id  # Example hotel ID

closest_hotels = similar_hotels  # Example closest hotels

input_hotel_channels = get_channel_recommendations(input_hotel, merged_data)

closest_hotels_channels = {
    hotel: get_channel_recommendations(hotel, merged_data) for hotel in closest_hotels
}

# Print the channel recommendations for the input hotel and its closest hotels
print("Input Hotel Channels:", input_hotel_channels)
for hotel, channels in closest_hotels_channels.items():
    print(f"Channels for Hotel {hotel}: {channels}")


In [None]:



def find_channel_intersection(input_hotel_channels, closest_hotels_channels):
    """
    Finds the intersection of recommended channels between the input hotel and each of its closest hotels.
    
    Parameters:
    - input_hotel_channels: Set of recommended channels for the input hotel
    - closest_hotels_channels: Dictionary with hotel IDs as keys and sets of recommended channels as values
    
    Returns:
    - intersections: Dictionary with hotel IDs as keys and the intersection of recommended channels as values
    """
    intersections = {}
    
    for hotel_id, channels in closest_hotels_channels.items():
        intersection = input_hotel_channels.intersection(channels)
        intersections[hotel_id] = intersection
    
    return intersections

# Get intersections of channel recommendations
channel_intersections = find_channel_intersection(input_hotel_channels, closest_hotels_channels)

# Print the intersections
print("\nChannel Recommendations Intersections:")

for hotel, intersection in channel_intersections.items():
    print(f"Intersection with Hotel {hotel}: {intersection}")


In [None]:

# Print the length of the intersection for each hotel in the dictionary
print("Number of common channel recommendations:")

for hotel_id, intersection in channel_intersections.items():
    print(f"Hotel {hotel_id}: {len(intersection)} common channels")