In [1]:
import os
import glob
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Glob pattern that matches every per-cluster CSV inside the processed-data folder.
main_dir, file_name = 'data/processed/', 'cluster_*.csv'
file_path = os.path.join(main_dir, file_name)

In [3]:
# glob.glob() returns matches in an arbitrary, filesystem-dependent order.
# The files are later paired positionally with a hard-coded list of names,
# so sort them to make that pairing deterministic across runs and machines.
# NOTE(review): confirm the name list is written in this sorted file order.
csv_files = sorted(glob.glob(file_path))

In [4]:
# Read each matched CSV into a DataFrame indexed by its 'Customer ID' column.
dataframes = list(
    map(lambda csv_path: pd.read_csv(csv_path, index_col='Customer ID'), csv_files)
)

# Recommendations

In [5]:
# Registry of cosine-similarity DataFrames, one entry per loaded DataFrame.
cosine_sim_dfs = dict()

In [6]:
# Columns selected from each DataFrame before computing similarities.
df_features = [
    'Age',
    'Total Spend',
    'Items Purchased',
    'Average Rating',
    'Discount Applied',
    'Days Since Last Purchase',
    'Satisfaction Level',
]

In [7]:
# Custom names used as keys for the per-DataFrame results.
dataframe_names = [
    f'{city}_df'
    for city in ('houston', 'san_francisco', 'los_angeles', 'new_york', 'chicago', 'miami')
]

In [8]:
# Build one row-by-row cosine-similarity matrix per loaded DataFrame and store
# it in `cosine_sim_dfs` under the matching custom name.
# zip() silently stops at the shorter sequence, which would leave DataFrames
# unnamed (or names unused), so a length mismatch is rejected up front.
if len(dataframes) != len(dataframe_names):
    raise ValueError(
        f"Expected {len(dataframe_names)} dataframes to match dataframe_names, "
        f"got {len(dataframes)}."
    )

for df, name in zip(dataframes, dataframe_names):
    # Keep only the columns that take part in the similarity computation.
    df_selected = df[df_features]

    # Pairwise cosine similarity between all rows of the selected features.
    cosine_sim = cosine_similarity(df_selected)

    # Label both axes with the DataFrame's index so scores can be looked up by label.
    cosine_sim_df = pd.DataFrame(cosine_sim, index=df.index, columns=df.index)

    # Store the cosine similarity DataFrame with the custom name.
    cosine_sim_dfs[name] = cosine_sim_df

In [13]:
# Preview the first five rows of the matrix stored under 'houston_df'.
cosine_sim_dfs['houston_df'].head(5)

Customer ID,102,108,114,120,126,132,138,144,150,156,...,395,401,407,413,419,425,431,437,443,449
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102,1.0,0.994434,0.933122,0.998069,0.992366,0.927146,0.999586,0.990228,0.997043,0.992366,...,0.996687,0.997851,0.997411,0.999786,0.997212,0.999058,0.999301,0.999788,0.996924,0.997774
108,0.994434,1.0,0.937373,0.996513,0.999054,0.933784,0.995696,0.998485,0.997539,0.999054,...,0.997174,0.990966,0.998787,0.996175,0.994047,0.993427,0.993914,0.992294,0.991954,0.986619
114,0.933122,0.937373,1.0,0.929917,0.940153,0.996808,0.934083,0.93939,0.930563,0.940153,...,0.931387,0.938819,0.940655,0.934467,0.929015,0.936405,0.937028,0.931399,0.927333,0.930781
120,0.998069,0.996513,0.929917,1.0,0.993495,0.928153,0.998945,0.992248,0.999764,0.993495,...,0.998611,0.994836,0.996816,0.998591,0.997733,0.995252,0.996142,0.997111,0.99675,0.991884
126,0.992366,0.999054,0.940153,0.993495,1.0,0.934136,0.994007,0.999793,0.994992,1.0,...,0.993319,0.990121,0.998486,0.994562,0.989364,0.993076,0.992037,0.989776,0.986864,0.984962


In [10]:
def get_recommendations(dataframe_key, user_id, top_n=3):
    """Return the ``top_n`` entries most similar to ``user_id``.

    Parameters
    ----------
    dataframe_key : hashable
        Key of the similarity matrix to use in the module-level
        ``cosine_sim_dfs`` dict.
    user_id : hashable
        Column label of the user to look up in that matrix.
    top_n : int, default 3
        Maximum number of similar users to return.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by the other users' labels, sorted from
        most to least similar. ``user_id`` itself is never included.

    Raises
    ------
    KeyError
        If ``dataframe_key`` is not in ``cosine_sim_dfs`` or ``user_id`` is
        not a column of the selected matrix.
    """
    # similarity matrix
    sim_df = cosine_sim_dfs[dataframe_key]

    # similarity scores for the given user
    sim_scores = sim_df[user_id]

    # Remove the queried user by label instead of assuming it sorts first:
    # another user with an identical score (e.g. 1.0) can tie with the
    # self-similarity, in which case slicing off position 0 would drop a real
    # neighbour and return the user as their own recommendation.
    other_scores = sim_scores.drop(labels=user_id, errors="ignore")

    # A stable sort keeps tied scores in their original index order.
    ranked = other_scores.sort_values(ascending=False, kind="mergesort")

    return ranked.iloc[:max(top_n, 0)]

In [11]:
# Show the three entries most similar to user 106 in the 'houston_df' matrix.
print(get_recommendations(dataframe_key='houston_df', user_id=106, top_n=3))

Customer ID
363    0.999852
387    0.998250
292    0.998062
Name: 106, dtype: float64


# Save recommendation matrices

In [14]:
# Output folder for the saved files.
save_dir = "data/processed/"

In [15]:
# Write every similarity matrix to `save_dir` as cosine_similarity_<name>.csv.
# Create the output folder first so to_csv() does not fail on a fresh checkout.
os.makedirs(save_dir, exist_ok=True)

for name, cosine_sim_df in cosine_sim_dfs.items():
    # Use a dedicated variable name: reusing `file_path` here would overwrite
    # the notebook-level glob pattern and break a later re-run of the loading cells.
    output_path = os.path.join(save_dir, f'cosine_similarity_{name}.csv')
    cosine_sim_df.to_csv(output_path)
    print(f"File saved successfully at {output_path}.")

File saved successfully at data/processed/houston_df_cosine_similarity.csv.
File saved successfully at data/processed/san_francisco_df_cosine_similarity.csv.
File saved successfully at data/processed/los_angeles_df_cosine_similarity.csv.
File saved successfully at data/processed/new_york_df_cosine_similarity.csv.
File saved successfully at data/processed/chicago_df_cosine_similarity.csv.
File saved successfully at data/processed/miami_df_cosine_similarity.csv.
