## Task 2: Lookalike Model

In [1]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import os
from pathlib import Path

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Load the three input tables from a single, configurable data directory.
# Set the LOOKALIKE_DATA_DIR environment variable to run the notebook on
# another machine; when it is unset the original local folder is used.
DATA_DIR = Path(
    os.environ.get(
        "LOOKALIKE_DATA_DIR",
        r"C:\Users\mrunmai k\OneDrive\Desktop\DataScienceInternAssign\files",
    )
)

customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")


In [6]:
# Build one wide table: every transaction row enriched with the attributes of
# its customer and its product (left joins keep every transaction row).
merged_data = (
    transactions
    .merge(customers, how="left", on="CustomerID")
    .merge(products, how="left", on="ProductID")
)


In [7]:
# Feature 1: total spending per customer (sum of TotalValue over their transactions)
customer_spending = (
    merged_data.groupby("CustomerID")["TotalValue"]
    .sum()
    .rename("TotalSpending")
    .reset_index()
)

In [8]:
# Feature 2: most frequent product category for each customer
def _most_frequent(values):
    """Return the most common non-null entry of ``values``, or None if there is none.

    ``Series.mode()`` skips NaN and returns tied modes in sorted order, so a tie
    resolves to the first value in that order. The result is empty when every
    entry is NaN (e.g. a transaction whose ProductID found no match in the
    product table); indexing it with ``[0]`` would raise, so that case yields
    None instead.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


most_frequent_category = (
    merged_data.groupby("CustomerID")["Category"]
    .agg(_most_frequent)
    .reset_index()
    .rename(columns={"Category": "FrequentCategory"})
)


In [9]:
# Feature 3: each customer's region, taken directly from the customer table
customer_region = customers.loc[:, ["CustomerID", "Region"]]

In [10]:
# Assemble the per-customer profile by joining the three feature tables on CustomerID
customer_profile = (
    customer_spending
    .merge(most_frequent_category, on="CustomerID")
    .merge(customer_region, on="CustomerID")
)


In [11]:
# One-hot encode the categorical features (Region, FrequentCategory).
# drop_first is deliberately left off: dropping the baseline level encodes it
# as all zeros, so customers in the baseline region and baseline category would
# be described by TotalSpending alone and score a cosine similarity of 1 with
# each other regardless of how much they spend.
# dtype=float keeps every feature column numeric for the similarity step.
customer_profile = pd.get_dummies(
    customer_profile,
    columns=["Region", "FrequentCategory"],
    dtype=float,
)


In [12]:
# Min-max scale TotalSpending; the remaining columns are carried over unchanged
scaler = MinMaxScaler()
scaled_spending = scaler.fit_transform(customer_profile[["TotalSpending"]])
customer_profile_scaled = customer_profile.assign(TotalSpending=scaled_spending[:, 0])


In [13]:
# Pairwise cosine similarity between customers.
# The ID column is split off first so that only feature columns enter the
# computation; row i of the matrix corresponds to the i-th entry of customer_ids.
feature_matrix = customer_profile_scaled.drop(columns="CustomerID")
customer_ids = customer_profile_scaled["CustomerID"]

similarity_matrix = cosine_similarity(feature_matrix)


In [15]:
# Recommend the top 3 lookalikes for each customer.
# lookalike_map: CustomerID -> list of (lookalike CustomerID, similarity score).
TOP_N = 3

# Plain list of IDs so that positions in the similarity matrix are resolved
# positionally rather than through the Series' index labels.
id_list = customer_ids.tolist()

lookalike_map = {}
for idx, customer_id in enumerate(id_list):
    # Drop the customer's own entry by position. Relying on it sorting first is
    # unsafe: another customer can tie at the maximum score and, because the
    # sort is stable, come before it.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Highest similarity first; ties keep their original (row) order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # float() turns the NumPy scalar into a plain Python float before rounding.
    lookalike_map[customer_id] = [
        (id_list[other_idx], round(float(score), 3))
        for other_idx, score in candidates[:TOP_N]
    ]


In [16]:
# Keep only the lookalikes of the first 20 customers in the customer table
# (C0001 to C0020, assuming the table is ordered by CustomerID — TODO confirm).
# The ID set is built once instead of being re-sliced for every dictionary key.
target_ids = set(customers["CustomerID"].iloc[:20])
filtered_lookalikes = {
    customer_id: matches
    for customer_id, matches in lookalike_map.items()
    if customer_id in target_ids
}


In [18]:
# Save results: one row per customer with that customer's list of lookalikes
OUTPUT_PATH = "Mrunmai_Kashaype_Lookalike.csv"

lookalike_df = pd.DataFrame({
    "CustomerID": list(filtered_lookalikes.keys()),
    "Lookalikes": list(filtered_lookalikes.values()),
})
lookalike_df.to_csv(OUTPUT_PATH, index=False)

# Build the message from OUTPUT_PATH so it always names the file actually written.
print(f"Lookalike model results saved to '{OUTPUT_PATH}'.")

Lookalike model results saved to 'Lookalike.csv'.
