In [5]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [6]:
all_orders_cluster = pd.read_csv('Data/all_orders_cluster.csv')
all_orders_cluster

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,product_name,aisle_id,department_id,department,aisle,cluster
0,2,33120,1,1,202279,Organic Egg Whites,86,16,dairy eggs,eggs,8
1,2,28985,2,1,202279,Michigan Organic Kale,83,4,produce,fresh vegetables,8
2,2,9327,3,0,202279,Garlic Powder,104,13,pantry,spices seasonings,8
3,2,45918,4,1,202279,Coconut Butter,19,13,pantry,oils vinegars,8
4,2,30035,5,0,202279,Natural Sweetener,17,13,pantry,baking ingredients,8
...,...,...,...,...,...,...,...,...,...,...,...
33819101,3421063,14233,3,1,169679,Natural Artesian Water,115,7,beverages,water seltzer sparkling water,10
33819102,3421063,35548,4,1,169679,Twice Baked Potatoes,13,20,deli,prepared meals,10
33819103,3421070,35951,1,1,139822,Organic Unsweetened Almond Milk,91,16,dairy eggs,soy lactosefree,10
33819104,3421070,16953,2,1,139822,Creamy Peanut Butter,88,13,pantry,spreads,10


In [7]:
all_orders_cluster.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id',
       'product_name', 'aisle_id', 'department_id', 'department', 'aisle',
       'cluster'],
      dtype='object')

In [8]:
rows_per_cluster = all_orders_cluster.groupby('cluster').size().reset_index(name='row_count')
rows_per_cluster

Unnamed: 0,cluster,row_count
0,0,8312805
1,1,4040162
2,2,64730
3,3,54729
4,4,886243
5,5,235540
6,6,137795
7,7,33784
8,8,4585003
9,9,804433


In [9]:
selected_clusters = [2, 3, 7, 11, 13]
all_orders_subset = all_orders_cluster[all_orders_cluster['cluster'].isin(selected_clusters)]
print("Subset Shape:", all_orders_subset.shape)
print("Unique Clusters in Subset:", all_orders_subset['cluster'].unique())

Subset Shape: (312741, 11)
Unique Clusters in Subset: [13  3 11  7  2]


In [6]:
# Use it in api logic
all_orders_subset.to_csv('Data/all_orders_subset.csv', index=False)  

In [10]:
all_orders_subset = pd.read_csv('Data/all_orders_subset.csv')
all_orders_subset

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,product_name,aisle_id,department_id,department,aisle,cluster
0,25,9755,1,1,59897,Original Popcorn,23,19,snacks,popcorn jerky,13
1,25,31487,2,0,59897,Boomchickapop Sweet & Salty Kettle Corn,23,19,snacks,popcorn jerky,13
2,25,37510,3,1,59897,Steamfresh Lightly Sauced Broccoli With Cheese...,116,1,frozen,frozen produce,13
3,25,14576,4,1,59897,Delights Turkey Sausage Egg Whites & Cheese En...,52,1,frozen,frozen breakfast,13
4,25,22105,5,0,59897,Ultra Thin Sliced Provolone Cheese,21,16,dairy eggs,packaged cheese,13
...,...,...,...,...,...,...,...,...,...,...,...
312736,3415184,39657,2,0,127645,Milk Chocolate Almonds,45,19,snacks,candy chocolate,7
312737,3418551,18899,1,0,99764,Verry Special Cognac,124,5,alcohol,spirits,3
312738,3418551,13718,2,0,99764,Grape with Electrolytes & Vitamins Water Beverage,64,7,beverages,energy sports drinks,3
312739,3418551,29255,3,0,99764,Berry Workout Water,64,7,beverages,energy sports drinks,3


In [11]:
# Extract unique product_id and product_name mapping from all_orders_cluster to allow the system to display product names instead of IDs in the recommendations.
product_id_to_name = all_orders_cluster[['product_id', 'product_name']].drop_duplicates().set_index('product_id')['product_name']

In [12]:
# Create the user-item matrix (Rows represent users. Columns represent products.Values represent the number of times a user has purchased a product)
unique_users = all_orders_subset['user_id'].unique()
unique_products = all_orders_subset['product_id'].unique()

In [13]:
# Create mappings from user_id and product_id to integer indices
user_id_to_index = {user_id: idx for idx, user_id in enumerate(unique_users)}
product_id_to_index = {product_id: idx for idx, product_id in enumerate(unique_products)}

# Map user_id and product_id to their respective indices
row_indices = all_orders_subset['user_id'].map(user_id_to_index)
col_indices = all_orders_subset['product_id'].map(product_id_to_index)

In [14]:
# Create the sparse user-item matrix where Rows represent users,Columns represent products, Values represent whether a user has reordered a product (1 for reordered, 0 otherwise).
# This matrix is essential for collaborative filtering, as it captures user-product interactions.
user_item_matrix_sparse = csr_matrix(
    (all_orders_subset['reordered'], (row_indices, col_indices)),
    shape=(len(unique_users), len(unique_products))
)

# Convert to a DataFrame for easier manipulation 
user_item_matrix = pd.DataFrame.sparse.from_spmatrix(
    user_item_matrix_sparse,
    index=unique_users,  # User IDs as rows
    columns=unique_products  # Product IDs as columns
).fillna(0)

print("User-Item Matrix Shape:", user_item_matrix.shape)

User-Item Matrix Shape: (3804, 25570)


In [15]:
user_id = 127645  
user_rows = all_orders_subset[all_orders_subset['user_id'] == user_id]
user_rows

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,product_name,aisle_id,department_id,department,aisle,cluster
26248,303045,46149,1,1,127645,Zero Calorie Cola,77,7,beverages,soft drinks,7
26249,303045,196,2,1,127645,Soda,77,7,beverages,soft drinks,7
27216,312382,37710,1,0,127645,Trail Mix,125,19,snacks,trail mix snack mix,7
27217,312382,43721,2,0,127645,Wint-O-Green,46,19,snacks,mint gum,7
116644,1353057,31474,1,0,127645,Dried Tart Cherries,117,19,snacks,nuts seeds dried fruit,7
116645,1353057,21573,2,0,127645,Roasted Pine Nut Hummus,67,20,deli,fresh dips tapenades,7
116646,1353057,46149,3,0,127645,Zero Calorie Cola,77,7,beverages,soft drinks,7
116647,1353057,196,4,1,127645,Soda,77,7,beverages,soft drinks,7
144238,1679974,16974,1,0,127645,Sea Salt Brown Rice Crackers,107,19,snacks,chips pretzels,7
144239,1679974,46061,2,0,127645,Popcorn,23,19,snacks,popcorn jerky,7


In [18]:
def recommend_products_hybrid(user_id, product_names, all_orders_subset, user_item_matrix_sparse, product_id_to_name, user_id_to_index, top_n=5):
    """
    Recommends products for a given user using a hybrid approach.
    """
    # Step 1: Map product names to product IDs
    product_ids = []
    for name in product_names:
        product_id = all_orders_subset[all_orders_subset['product_name'] == name]['product_id'].values
        if len(product_id) > 0:
            product_ids.append(product_id[0])
        else:
            print(f"Product '{name}' not found in the dataset.")
    
    if not product_ids:
        print("No valid products found. Please check your input.")
        return []
    
    # Step 2: Check if the user exists in the subset
    if user_id not in user_id_to_index:
        print(f"User ID {user_id} not found in the subset.")
        return []
    
    # Step 3: Get the cluster of the target user
    target_user_cluster = all_orders_subset[all_orders_subset['user_id'] == user_id]['cluster'].values[0]
    print(f"Target user belongs to cluster: {target_user_cluster}")
    
    # Step 4: Filter the dataset to include only users in the same cluster
    cluster_users = all_orders_subset[all_orders_subset['cluster'] == target_user_cluster]['user_id'].unique()
    
    # Filter the user-item matrix to include only users in the same cluster
    cluster_user_indices = [user_id_to_index[user] for user in cluster_users if user in user_id_to_index]
    cluster_user_item_matrix = user_item_matrix_sparse[cluster_user_indices]
    
    # Step 5: Find similar users within the cluster using KNN
    knn = NearestNeighbors(n_neighbors=50, metric='cosine', algorithm='brute')  # Increased n_neighbors
    knn.fit(cluster_user_item_matrix)
    
    # Find similar users for the target user
    user_index = user_id_to_index[user_id]
    distances, indices = knn.kneighbors(user_item_matrix_sparse[user_index])
    similar_users = [cluster_users[idx] for idx in indices[0][1:]]  # Exclude the user itself
    
    # Step 6: Get products purchased by similar users but not by the target user
    similar_user_indices = [user_id_to_index[user] for user in similar_users]
    similar_user_products = user_item_matrix_sparse[similar_user_indices].sum(axis=0)  # Sum across similar users
    target_user_products = user_item_matrix_sparse[user_index].nonzero()[1]  # Products purchased by the target user
    
    # Find new products
    new_products = set(similar_user_products.nonzero()[1]) - set(target_user_products)
    print(f"Number of new products: {len(new_products)}")  # Debugging
    
    # Step 7: Get top N recommended products
    recommended_products = list(new_products)[:top_n]
    
    # Filter out invalid product IDs , Products purchased by similar users but not by the target user are identified as recommendations,
    #  If insufficient new products are found, the system falls back to recommending popular products within the cluster.
    valid_recommended_products = [product_id for product_id in recommended_products if product_id in product_id_to_name]
    print(f"Number of valid recommended products: {len(valid_recommended_products)}")  # Debugging
    
    # Fallback: Recommend popular products in the cluster if new products are insufficient
    if len(valid_recommended_products) < top_n:
        print("Insufficient new products. Falling back to popular products in the cluster.")
        cluster_products = all_orders_subset[all_orders_subset['cluster'] == target_user_cluster]['product_id'].value_counts().index.tolist()
        popular_products = [product_id for product_id in cluster_products if product_id in product_id_to_name]
        valid_recommended_products.extend(popular_products[:top_n - len(valid_recommended_products)])
    
    # Map product IDs to product names
    recommended_product_names = [product_id_to_name[product_id] for product_id in valid_recommended_products]
    
    return recommended_product_names

In [19]:
user_id = 127645  
user_products = [
    "Wint-O-Green",
    "Sea Salt Brown Rice Crackers",
    "Soda",
    "Popcorn"
]

# Clean up product names (remove leading/trailing spaces)
user_products = [name.strip() for name in user_products]
top_n = 7 

recommended_products = recommend_products_hybrid(
    user_id=user_id,
    product_names=user_products,
    all_orders_subset=all_orders_subset,
    user_item_matrix_sparse=user_item_matrix_sparse,
    product_id_to_name=product_id_to_name,
    user_id_to_index=user_id_to_index,
    top_n=top_n  
)

# Display recommendations
if recommended_products:
    print(f"Recommended products (Top {top_n}):")
    for product in recommended_products:
        print(f"- {product}")
else:
    print("No recommendations available.")

Target user belongs to cluster: 7
Number of new products: 159
Number of valid recommended products: 6
Insufficient new products. Falling back to popular products in the cluster.
Recommended products (Top 7):
- Fresh Scent Dishwasher Cleaner
- Overnight Diapers Size 6
- Vegetarian Soup Mix
- Red Velvet Creme Cake
- Grape Soda
- Medium Square Containers & Lids
- Soda
