Age
Total Spend
Items Purchased
Average Rating
Discount Applied
Days Since Last Purchase
Satisfaction Level
City
Gender
Membership Type

In [16]:
import os
import glob
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Location and filename pattern of the per-cluster CSV exports.
main_dir, file_name = 'data/processed/', 'cluster_*.csv'
# Joined result is a glob pattern (note the '*'), not the path of a single file.
file_path = os.path.join(main_dir, file_name)

In [26]:
# Use glob to find files starting with 'cluster' and ending with '.csv'.
# glob returns matches in arbitrary order, and later cells key each DataFrame by
# its position in this list ('df_1', 'df_2', ...), so sort the matches to keep
# that mapping the same on every run and machine.
# NOTE: the sort is lexicographic, so 'cluster_10.csv' orders before 'cluster_2.csv'.
csv_files = sorted(glob.glob(file_path))
if not csv_files:
    # Fail here with a clear message instead of a later KeyError on 'df_1'.
    raise FileNotFoundError(f"No files matched pattern: {file_path!r}")

In [27]:
# Load every matched CSV file into its own DataFrame, in the same order as csv_files.
dataframes = list(map(pd.read_csv, csv_files))

# Recommendations

In [31]:
# Maps a key such as 'df_1' to the cosine-similarity DataFrame computed for the
# corresponding entry of the DataFrame list; starts empty and is filled later.
cosine_sim_dfs = dict()

In [46]:
# Columns that make up each row's feature vector for the cosine-similarity step.
df_features = [
    'Age',
    'Total Spend',
    'Items Purchased',
    'Average Rating',
    'Discount Applied',
    'Days Since Last Purchase',
    'Satisfaction Level',
]

In [49]:
# Build one row-by-row cosine-similarity table per loaded DataFrame and store it
# in cosine_sim_dfs under the 1-based key 'df_<position>'.
for i, df in enumerate(dataframes):
    # Refuse to continue if any expected feature column is absent from this DataFrame.
    missing_features = [name for name in df_features if name not in df.columns]
    if missing_features:
        raise ValueError(f"DataFrame {i+1} is missing the following features: {missing_features}")

    # Pairwise cosine similarity between rows, using only the feature columns.
    # Both axes of the result are labelled with the DataFrame's own index so a
    # score can be looked up by row label.
    cosine_sim_df = pd.DataFrame(
        cosine_similarity(df[df_features]),
        index=df.index,
        columns=df.index,
    )

    cosine_sim_dfs[f'df_{i+1}'] = cosine_sim_df

In [50]:
# Preview the first five rows of the similarity table stored under 'df_1'.
cosine_sim_dfs['df_1'].head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,1.0,0.989287,0.996221,0.970983,0.998062,0.997887,0.994077,0.99275,0.970983,0.998062,...,0.964883,0.995582,0.962225,0.996426,0.95945,0.987328,0.956079,0.983623,0.95269,0.983889
1,0.989287,1.0,0.979097,0.989243,0.989621,0.993188,0.993315,0.977898,0.989243,0.989621,...,0.980399,0.978502,0.975931,0.98374,0.973478,0.968388,0.968342,0.968988,0.965275,0.964541
2,0.996221,0.979097,1.0,0.952845,0.991694,0.99029,0.98998,0.991688,0.952845,0.991694,...,0.952597,0.9991,0.950818,0.995693,0.949299,0.995011,0.946773,0.98714,0.944592,0.992542
3,0.970983,0.989243,0.952845,1.0,0.968201,0.983619,0.975385,0.943864,1.0,0.968201,...,0.993361,0.949714,0.990982,0.969882,0.987774,0.94473,0.984653,0.959513,0.980821,0.942125
4,0.998062,0.989621,0.991694,0.968201,1.0,0.995346,0.991342,0.995921,0.968201,1.0,...,0.957082,0.992964,0.952565,0.990442,0.949505,0.979424,0.944349,0.972675,0.940715,0.974754


In [51]:
# Define a recommendation function to work for a specific DataFrame
def get_recommendations(dataframe_key, user_id, top_n=3):
    """Return the users most similar to ``user_id`` within one similarity table.

    Parameters
    ----------
    dataframe_key : str
        Key into ``cosine_sim_dfs`` selecting the similarity table (e.g. ``'df_1'``).
    user_id : hashable
        Label of the user; must be both a column and a row label of that table.
    top_n : int, default 3
        Maximum number of similar users to return. Values <= 0 yield an empty
        Series; values larger than the number of other users return all of them.

    Returns
    -------
    pandas.Series
        Similarity scores of the other users, indexed by their labels and sorted
        from highest to lowest. The queried user is never part of the result.

    Raises
    ------
    KeyError
        If ``dataframe_key`` is not in ``cosine_sim_dfs`` or ``user_id`` is not a
        label of the selected table.
    """
    # Retrieve the appropriate similarity matrix
    sim_df = cosine_sim_dfs[dataframe_key]

    # Get similarity scores for the given user
    sim_scores = sim_df[user_id]

    # Remove the user by label instead of assuming they sort to position 0:
    # another user with an identical score (similarity 1.0) can tie with the
    # user, in which case positional skipping may drop the wrong entry and
    # recommend the user to themselves.
    other_scores = sim_scores.drop(labels=user_id)

    # Stable sort keeps tied users in their original order, so results are
    # reproducible between runs.
    ranked = other_scores.sort_values(ascending=False, kind='mergesort')

    return ranked.head(max(top_n, 0))

In [57]:
# Example: the three users most similar to user 0 in the 'df_1' similarity table.
top_matches = get_recommendations('df_1', user_id=0, top_n=3)
print(top_matches)

43    0.999852
47    0.998250
31    0.998062
Name: 0, dtype: float64
