In [4]:
# Milestone 2: Model Building
# AI-Enabled Recommendation System

# Import required libraries
import pandas as pd
import numpy as np

# Libraries for similarity computation and normalization
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

In [5]:
# Step 1: Load the Cleaned Dataset

# Load preprocessed e-commerce interaction data
df = pd.read_csv("clean_ecommerce_data.csv")

# Inspect dataset structure and verify columns
print(df.head())
print(df.info())

    User_ID  Product_ID  Category  Final_Price(Rs.)  interaction
0  337c166f  f414122f-e    sports             31.05            1
1  d38a19bf  fde50f9c-5  clothing            186.23            1
2  d7f5f0b0  0d96fc90-3    sports            237.76            1
3  395d4994  964fc44b-d      toys            129.89            1
4  a83c145c  d70e2fc6-e    beauty            195.84            1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3658 entries, 0 to 3657
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   User_ID           3658 non-null   object 
 1   Product_ID        3658 non-null   object 
 2   Category          3658 non-null   object 
 3   Final_Price(Rs.)  3658 non-null   float64
 4   interaction       3658 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 143.0+ KB
None


In [6]:
# Step 2: Create Implicit Feedback Interactions

# Since explicit ratings are unavailable,
# each user interaction is treated as implicit feedback
df["interaction"] = 1

In [7]:
# Step 3: Construct User–Item (User–Category) Matrix

# Create a user–item matrix at the category level because of unique products(no common products)
user_item_matrix = pd.pivot_table(
    df,
    index="User_ID",          # Users as rows
    columns="Category",       # Product categories as items
    values="interaction",
    aggfunc="sum",
    fill_value=0
)

# Display matrix dimensions and sample rows
print("User–Item Matrix Shape:", user_item_matrix.shape)
print(user_item_matrix.head())

User–Item Matrix Shape: (3658, 7)
Category  beauty  books  clothing  electronics  home & kitchen  sports  toys
User_ID                                                                     
001ae40d       0      0         0            0               0       1     0
0021e458       0      0         0            0               0       1     0
00333c7a       0      0         0            0               0       0     1
00390cf5       0      0         0            0               0       1     0
0041710c       0      1         0            0               0       0     0


In [8]:
# Step 4: Normalize User–Item Matrix

# Normalize interaction vectors to prepare for cosine similarity
user_item_normalized = normalize(user_item_matrix)

In [9]:
# Step 5: Train Collaborative Filtering Model

# Compute user–user similarity using cosine similarity
user_similarity = cosine_similarity(user_item_normalized)

# Convert similarity matrix to DataFrame for readability
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index
)

# Display sample of similarity matrix
print(user_similarity_df.head())

User_ID   001ae40d  0021e458  00333c7a  00390cf5  0041710c  0049cde2  \
User_ID                                                                
001ae40d       1.0       1.0       0.0       1.0       0.0       0.0   
0021e458       1.0       1.0       0.0       1.0       0.0       0.0   
00333c7a       0.0       0.0       1.0       0.0       0.0       0.0   
00390cf5       1.0       1.0       0.0       1.0       0.0       0.0   
0041710c       0.0       0.0       0.0       0.0       1.0       0.0   

User_ID   005258a0  007d82ed  00812bf1  008384ad  ...  ff612d9d  ff91160c  \
User_ID                                           ...                       
001ae40d       0.0       0.0       0.0       0.0  ...       0.0       1.0   
0021e458       0.0       0.0       0.0       0.0  ...       0.0       1.0   
00333c7a       0.0       0.0       0.0       1.0  ...       1.0       0.0   
00390cf5       0.0       0.0       0.0       0.0  ...       0.0       1.0   
0041710c       0.0       0.0     

In [10]:
# Step 6: Recommendation Function

def recommend_categories(user_id, top_n=3):
    # Check if user exists in the dataset
    if user_id not in user_item_matrix.index:
        return "User not found"

    # Get similarity scores
    sim_scores = user_similarity_df[user_id].sort_values(ascending=False)

    # Remove self-similarity
    sim_scores = sim_scores.drop(user_id)

    # Get top similar users
    top_users = sim_scores.head(5).index

    # Aggregate category interactions of similar users
    recommended_scores = user_item_matrix.loc[top_users].sum()

    # Exclude categories already interacted with by the user
    user_categories = user_item_matrix.loc[user_id]
    recommended_scores = recommended_scores[user_categories == 0]

    # Fallback strategy: recommend globally popular categories
    # if no neighbor-based recommendations are found
    if recommended_scores.sum() == 0:
        popular_categories = user_item_matrix.sum().sort_values(ascending=False)
        recommended_scores = popular_categories[user_categories == 0]

    # Return top-N recommended categories
    return recommended_scores.sort_values(ascending=False).head(top_n)

In [13]:
# Step 7: Test Recommendations

# Select a sample user
sample_user = user_item_matrix.index[25]

# Generate recommendations for the sample user
print("Recommendations for User:", sample_user)
print(recommend_categories(sample_user))

Recommendations for User: 0195ea56
Category
home & kitchen    549
books             533
toys              522
dtype: int64


In [14]:
# Step 8: Evaluate Model – Matrix Sparsity

# Calculate sparsity to measure proportion of missing interactions

sparsity = 1 - (np.count_nonzero(user_item_matrix) / user_item_matrix.size)
print("Matrix Sparsity:", round(sparsity, 4))

Matrix Sparsity: 0.8571


In [15]:
# Step 9: Evaluate Model – Recommendation Coverage

def recommendation_coverage():
    recommended_categories = set()

    for user in user_item_matrix.index[:100]:
        recs = recommend_categories(user)
        if isinstance(recs, pd.Series):
            recommended_categories.update(recs.index)

    total_categories = set(user_item_matrix.columns)
    return len(recommended_categories) / len(total_categories)

print("Recommendation Coverage:", round(recommendation_coverage(), 4))

Recommendation Coverage: 0.5714


In [16]:
# Step 10: Evaluate Model – Average User Similarity

# Compute average similarity across all user pairs

avg_similarity = user_similarity_df.values.mean()
print("Average User Similarity:", round(avg_similarity, 4))

Average User Similarity: 0.143
