In [1]:
import os
import glob
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Directory holding the processed per-cluster CSV exports, together with the
# wildcard pattern that matches every cluster file inside it.
main_dir, file_name = 'data/processed/', 'cluster_*.csv'
file_path = os.path.join(main_dir, file_name)

In [3]:
# glob.glob() returns matches in arbitrary, filesystem-dependent order. The
# loaded frames are later paired by position with a hard-coded list of names,
# so sort the paths to make that pairing reproducible across machines and runs.
# NOTE(review): confirm the sorted file order matches the intended name order.
csv_files = sorted(glob.glob(file_path))

In [4]:
# Read every matched CSV into its own DataFrame, using the 'Customer ID'
# column as the row index.
dataframes = [
    pd.read_csv(csv_path, index_col='Customer ID')
    for csv_path in csv_files
]

# Recommendations

In [5]:
# Maps each custom dataframe name to that dataframe's customer-by-customer
# cosine-similarity DataFrame; filled in by the loop further down.
cosine_sim_dfs = dict()

In [6]:
# Columns selected from every cluster DataFrame before computing similarity.
df_features = [
    'Age',
    'Total Spend',
    'Items Purchased',
    'Average Rating',
    'Discount Applied',
    'Days Since Last Purchase',
    'Satisfaction Level',
]

In [7]:
# Custom names given, by position, to the loaded cluster DataFrames.
dataframe_names = [
    'houston_df',
    'san_francisco_df',
    'los_angeles_df',
    'new_york_df',
    'chicago_df',
    'miami_df',
]

In [8]:
# Fail loudly when the number of loaded frames and names differ: zip() would
# otherwise stop at the shorter sequence and silently skip the remainder.
if len(dataframes) != len(dataframe_names):
    raise ValueError(
        "Expected {} cluster dataframes but loaded {}; cannot pair "
        "dataframes with names.".format(len(dataframe_names), len(dataframes))
    )

# Build one customer-by-customer cosine-similarity matrix per cluster DataFrame.
for df, name in zip(dataframes, dataframe_names):
    # Keep only the feature columns used for the comparison.
    df_selected = df[df_features]

    # NOTE(review): the features are passed to cosine_similarity as-is. If the
    # columns are on different scales, the large-magnitude ones will dominate
    # the score -- confirm the upstream processing normalises them.
    # Label both axes with the customer index and store the result under the
    # custom name.
    cosine_sim_dfs[name] = pd.DataFrame(
        cosine_similarity(df_selected),
        index=df.index,
        columns=df.index,
    )

In [9]:
# Preview the first five rows of the 'houston_df' similarity matrix.
cosine_sim_dfs['houston_df'].head(n=5)

Customer ID,106,112,118,124,130,136,142,148,154,160,...,393,399,405,411,417,423,429,435,441,447
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
106,1.0,0.989287,0.996221,0.970983,0.998062,0.997887,0.994077,0.99275,0.970983,0.998062,...,0.964883,0.995582,0.962225,0.996426,0.95945,0.987328,0.956079,0.983623,0.95269,0.983889
112,0.989287,1.0,0.979097,0.989243,0.989621,0.993188,0.993315,0.977898,0.989243,0.989621,...,0.980399,0.978502,0.975931,0.98374,0.973478,0.968388,0.968342,0.968988,0.965275,0.964541
118,0.996221,0.979097,1.0,0.952845,0.991694,0.99029,0.98998,0.991688,0.952845,0.991694,...,0.952597,0.9991,0.950818,0.995693,0.949299,0.995011,0.946773,0.98714,0.944592,0.992542
124,0.970983,0.989243,0.952845,1.0,0.968201,0.983619,0.975385,0.943864,1.0,0.968201,...,0.993361,0.949714,0.990982,0.969882,0.987774,0.94473,0.984653,0.959513,0.980821,0.942125
130,0.998062,0.989621,0.991694,0.968201,1.0,0.995346,0.991342,0.995921,0.968201,1.0,...,0.957082,0.992964,0.952565,0.990442,0.949505,0.979424,0.944349,0.972675,0.940715,0.974754


In [10]:
def get_recommendations(dataframe_key, user_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``user_id``.

    Args:
        dataframe_key: Key into ``cosine_sim_dfs`` selecting which similarity
            matrix to query (e.g. ``'houston_df'``).
        user_id: Column label of the customer to find neighbours for.
        top_n: Maximum number of similar customers to return.

    Returns:
        A Series of similarity scores indexed by customer label, sorted from
        most to least similar, with ``user_id`` itself excluded. It holds
        fewer than ``top_n`` entries when the matrix has fewer other customers.

    Raises:
        KeyError: If ``dataframe_key`` or ``user_id`` is not present.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Similarity matrix for the requested dataframe.
    sim_df = cosine_sim_dfs[dataframe_key]

    # Similarity of every customer to the given user.
    sim_scores = sim_df[user_id]

    # Exclude the user by label instead of skipping the first sorted row:
    # another customer can tie with the self-similarity (e.g. an identical
    # feature vector scores exactly 1.0), in which case the user is not
    # guaranteed to sort first and would be returned as its own neighbour.
    other_scores = sim_scores.drop(index=user_id, errors="ignore")

    # A stable sort keeps tied customers in their original order, so the
    # result is deterministic.
    ranked = other_scores.sort_values(ascending=False, kind="mergesort")

    return ranked.head(top_n)

In [11]:
# Example: the three customers most similar to customer 106 in 'houston_df'.
print(get_recommendations(dataframe_key='houston_df', user_id=106, top_n=3))

Customer ID
363    0.999852
387    0.998250
292    0.998062
Name: 106, dtype: float64
