In [58]:
import pandas as pd
import numpy as np  
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr

In [3]:
# Information about individual channels/hotels
data_lake_prd_314410_cz_canais = pd.read_csv('../data/lookups/data-lake-prd-314410.cz.canais.csv')
data_lake_prd_314410_cz_hoteis = pd.read_csv('../data/lookups/data-lake-prd-314410.cz.hoteis.csv')
data_lake_prd_314410_cz_cidades = pd.read_csv('../data/lookups/data-lake-prd-314410.cz.cidades.csv')

# List of hotel-channel combinations as of January 2025
hotel_city_chanel_combin_extract  = pd.read_csv('../data/other/hotel_city_chanel_combin_extract.csv')
hotel_city_chanel_combin_extract.dropna(inplace=True)
hotel_city_chanel_combin_extract.drop(columns=['Cidade_ID'], inplace=True)
hotel_city_chanel_combin_extract.drop_duplicates(inplace=True)

unique_hotels_lookupID = data_lake_prd_314410_cz_hoteis['Hotel_ID'].unique()
unique_chanels_lookupID = data_lake_prd_314410_cz_canais['Canal_ID'].unique()

In [4]:
hotel_city_chanel_combin_extract = hotel_city_chanel_combin_extract.merge(
    data_lake_prd_314410_cz_hoteis[['Hotel_ID', 'StatusHotel']],
    on='Hotel_ID',
    how='left'
)

hotel_city_chanel_combin_extract = hotel_city_chanel_combin_extract[ 
    (hotel_city_chanel_combin_extract['StatusHotel'] == '3. Ativo') | 
    (hotel_city_chanel_combin_extract['StatusHotel'] == '4. Inativo')]

hotel_city_chanel_combin_extract = hotel_city_chanel_combin_extract.drop(columns=['StatusHotel'])

hotel_city_chanel_combin_extract['Canal_ID'] = hotel_city_chanel_combin_extract['Canal_ID'].astype('int64')
hotel_city_chanel_combin_extract['Hotel_ID'] = hotel_city_chanel_combin_extract['Hotel_ID'].astype('int64')

hotel_city_chanel_combin_extract = hotel_city_chanel_combin_extract[
    hotel_city_chanel_combin_extract['Hotel_ID'].isin(unique_hotels_lookupID)
]

hotel_city_chanel_combin_extract = hotel_city_chanel_combin_extract[
    hotel_city_chanel_combin_extract['Canal_ID'].isin(unique_chanels_lookupID)
]


In [5]:
# Pivot the table
pivot_table = hotel_city_chanel_combin_extract.pivot_table(index='Hotel_ID', columns='Canal_ID', aggfunc='size', fill_value=0)

# Convert the table to binary (1 where the combination existed, 0 otherwise)
pivot_table = pivot_table.map(lambda x: 1 if x > 0 else 0)

In [11]:
def compute_channel_scores_excluding_existing(pivot_table):
    """
    Compute recommendation scores for each hotel and channel, excluding channels already used.
    
    Parameters:
    - pivot_table: DataFrame with Hotel_ID as rows, Canal_ID as columns (0/1 values).
    
    Returns:
    - scores_df: DataFrame with the same shape, containing recommendation scores,
                 but zeros where the hotel already used the channel.
    """

    # Step 1: Compute cosine similarity between hotels (rows)
    similarity_matrix = cosine_similarity(pivot_table.values)
    np.fill_diagonal(similarity_matrix, 0)  # remove self-similarity

    # Step 2: Multiply similarity matrix with original matrix to get weighted channel usage
    weighted_scores = np.dot(similarity_matrix, pivot_table.values)

    # Step 3: Create a DataFrame with these scores
    scores_df = pd.DataFrame(weighted_scores, index=pivot_table.index, columns=pivot_table.columns)

    # Step 4: Remove already-used channels (mask with 0 where pivot_table == 1)
    scores_df[pivot_table == 1] = 0

    return scores_df


In [12]:
scores_df = compute_channel_scores_excluding_existing(pivot_table)

In [15]:
# Get top 10 recommended channels for a specific hotel
hotel_id = 7
top_channels = scores_df.loc[hotel_id].sort_values(ascending=False).head(10)
print(top_channels)

Canal_ID
124     64.769288
104     62.926124
908     61.474724
608     60.865617
607     60.536693
1195    60.509170
1145    60.116195
1168    59.174268
833     58.032550
810     56.638884
Name: 7, dtype: float64


In [18]:
# Convert wide format to long format
long_scores_df = scores_df.stack().reset_index()

# Rename columns
long_scores_df.columns = ['hotel_id', 'channel_id', 'score']

# Drop zero scores
long_scores_df = long_scores_df[long_scores_df['score'] > 0].reset_index(drop=True)

In [41]:
def get_top_channels_for_hotel(hotel_id, scores_df, top_n=500):
    # Get non-zero scores sorted descending
    hotel_scores = scores_df.loc[hotel_id]
    top_channels = hotel_scores[hotel_scores > 0].sort_values(ascending=False).head(top_n)
    return set(top_channels.index)


def compare_channel_overlap(hotel_id, similarity_df, scores_df, top_similar=1000, top_channels=500):
    # Get top N similar hotels excluding self
    similar_hotels = (
        similarity_df.loc[hotel_id]
        .drop(hotel_id)
        .sort_values(ascending=False)
        .head(top_similar)
        .index
        .tolist()
    )
    
    # Get the chosen hotel’s top channels
    hotel_channels = get_top_channels_for_hotel(hotel_id, scores_df, top_channels)
    
    overlaps = {}
    for sim_hotel in similar_hotels:
        sim_channels = get_top_channels_for_hotel(sim_hotel, scores_df, top_channels)
        intersection_size = len(hotel_channels.intersection(sim_channels))
        overlaps[sim_hotel] = intersection_size
    
    return overlaps


In [42]:
similarity_matrix = cosine_similarity(pivot_table)
np.fill_diagonal(similarity_matrix, 0) 
similarity_df = pd.DataFrame(similarity_matrix, index=pivot_table.index, columns=pivot_table.index)

In [43]:
hotel_id = 7

overlaps = compare_channel_overlap(hotel_id, similarity_df, scores_df)

print(f"Channel overlap (top 100) between hotel {hotel_id} and each of its top 10 similar hotels:")

for sim_hotel, overlap_count in overlaps.items():
    print(f"Hotel {sim_hotel}: {overlap_count} overlapping channels")


Channel overlap (top 100) between hotel 7 and each of its top 10 similar hotels:
Hotel 1757: 500 overlapping channels
Hotel 6765: 500 overlapping channels
Hotel 2758: 500 overlapping channels
Hotel 1377: 500 overlapping channels
Hotel 15477: 500 overlapping channels
Hotel 563: 500 overlapping channels
Hotel 2296: 500 overlapping channels
Hotel 531: 500 overlapping channels
Hotel 4558: 500 overlapping channels
Hotel 19801: 500 overlapping channels
Hotel 3592: 500 overlapping channels
Hotel 6427: 500 overlapping channels
Hotel 1784: 500 overlapping channels
Hotel 967: 500 overlapping channels
Hotel 17205: 500 overlapping channels
Hotel 8806: 500 overlapping channels
Hotel 77: 500 overlapping channels
Hotel 75: 500 overlapping channels
Hotel 19387: 500 overlapping channels
Hotel 1649: 500 overlapping channels
Hotel 1643: 500 overlapping channels
Hotel 758: 500 overlapping channels
Hotel 5482: 500 overlapping channels
Hotel 18299: 500 overlapping channels
Hotel 11334: 500 overlapping chann

In [48]:
existing_pairs = set(zip(hotel_city_chanel_combin_extract['Hotel_ID'], 
                         hotel_city_chanel_combin_extract['Canal_ID']))

In [None]:
# Define a helper function
def is_existing_channel(row):
    return (row['hotel_id'], row['channel_id']) in existing_pairs

# Filter out existing pairs
filtered_long_scores_df = long_scores_df[~long_scores_df.apply(is_existing_channel, axis=1)].copy()

In [64]:
# Step 1a: Create a matrix of shape (n_hotels, n_channels) from long_scores_df
# Pivot so rows = hotel, cols = channel, values = score
channel_rec_matrix = long_scores_df.pivot(index='hotel_id', columns='channel_id', values='score').fillna(0)

# Step 1b: Compute pairwise cosine similarity between hotels based on channel rec scores
channel_similarity_matrix = cosine_similarity(channel_rec_matrix)


In [65]:
# If hotel_similarity_df is a DataFrame, convert it to numpy array aligned with channel_rec_matrix rows
hotel_ids = channel_rec_matrix.index
hotel_similarity_matrix = similarity_df.loc[hotel_ids, hotel_ids].values

In [66]:
# Step 1c: Flatten the upper triangles of both similarity matrices
def flatten_upper_tri(matrix):
    triu_indices = np.triu_indices_from(matrix, k=1)
    return matrix[triu_indices]

hotel_sim_flat = flatten_upper_tri(hotel_similarity_matrix)
channel_sim_flat = flatten_upper_tri(channel_similarity_matrix)

# Step 1d: Compute correlation between hotel similarity and channel recommendation similarity
corr, pval = spearmanr(hotel_sim_flat, channel_sim_flat)
print(f"Spearman correlation between hotel similarity and channel rec similarity: {corr:.4f} (p-value={pval:.4g})")

Spearman correlation between hotel similarity and channel rec similarity: -0.2300 (p-value=0)


In [67]:
from sklearn.metrics import jaccard_score

top_n = 100

# Step 2a: For each hotel, get the top N recommended channels
top_channels_per_hotel = (
    long_scores_df
    .groupby('hotel_id')
    .apply(lambda df: set(df.nlargest(top_n, 'score')['channel_id']))
)

# Step 2b: Create a binary matrix (hotel x channel), 1 if channel in top-N, else 0
all_channels = sorted(long_scores_df['channel_id'].unique())
hotel_ids = sorted(long_scores_df['hotel_id'].unique())

binary_matrix = pd.DataFrame(0, index=hotel_ids, columns=all_channels, dtype=int)

for hotel_id, channels in top_channels_per_hotel.items():
    binary_matrix.loc[hotel_id, list(channels)] = 1

# Step 2c: Compute pairwise Jaccard similarity between hotels
def pairwise_jaccard(matrix):
    n = matrix.shape[0]
    result = np.zeros((n, n))
    for i in range(n):
        for j in range(i+1, n):
            result[i, j] = jaccard_score(matrix.iloc[i], matrix.iloc[j])
            result[j, i] = result[i, j]
    return result

jaccard_similarity_matrix = pairwise_jaccard(binary_matrix)

# Step 2d: Flatten matrices as before
hotel_sim_flat = flatten_upper_tri(hotel_similarity_matrix)
jaccard_sim_flat = flatten_upper_tri(jaccard_similarity_matrix)

# Step 2e: Correlate hotel similarity and Jaccard similarity of top-N channels
corr_jaccard, pval_jaccard = spearmanr(hotel_sim_flat, jaccard_sim_flat)
print(f"Spearman correlation between hotel similarity and top-{top_n} channel Jaccard similarity: {corr_jaccard:.4f} (p-value={pval_jaccard:.4g})")


  .apply(lambda df: set(df.nlargest(top_n, 'score')['channel_id']))


KeyboardInterrupt: 

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Step 3a: Cluster hotels by latent vectors (use hotel_similarity_df as proxy)

# Convert similarity to distance
hotel_distance = 1 - hotel_similarity_matrix

# Use MDS or PCA to embed distance matrix into Euclidean space (optional, here use directly)
# For simplicity, use spectral embedding or PCA here (optional)

# Or cluster directly on the latent vectors if available (you might have them in model)

# Let's assume you have latent vectors in a DataFrame hotel_latent_vectors:
# hotel_latent_vectors = pd.DataFrame(..., index=hotel_ids)

# Example: cluster by hotel latent vectors (if available)
# km_latent = KMeans(n_clusters=10, random_state=42).fit_predict(hotel_latent_vectors)

# If not available, cluster by hotel similarity matrix using spectral clustering:
from sklearn.cluster import SpectralClustering

n_clusters = 10
spectral = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=42)
clusters_hotels = spectral.fit_predict(hotel_similarity_matrix)

# Step 3b: Cluster hotels by their channel recommendations (use channel similarity matrix)
km_channel = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(channel_rec_matrix)

# Step 3c: Compute Adjusted Rand Index between clusters
ari = adjusted_rand_score(clusters_hotels, km_channel)
print(f"Adjusted Rand Index between hotel clusters and channel recommendation clusters: {ari:.4f}")
