<a href="https://colab.research.google.com/github/Mayank7382/Zeotap_Assignment/blob/main/Mayank_Gupta_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd


In [12]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions (1).csv')

In [15]:
customers_preview = customers.head()
products_preview = products.head()
transactions_preview = transactions.head()

customers_preview, products_preview, transactions_preview

(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55    

In [18]:
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

In [20]:
transactions_with_products = pd.merge(transactions, products, on="ProductID", how="left")
full_data = pd.merge(transactions_with_products, customers, on="CustomerID", how="left")

In [21]:
full_data_preview = full_data.head()

full_data_preview

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [22]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [23]:
customer_transactions = full_data.groupby("CustomerID").agg({
    "Quantity": "sum",
    "TotalValue": "sum",
    "TransactionID": "count",
    "Category": lambda x: list(x),
}).rename(columns={"TransactionID": "TransactionCount", "Category": "CategoriesPurchased"})



In [25]:
customer_profiles = customers.set_index("CustomerID")[["Region", "SignupDate"]]



In [26]:
customer_data = customer_transactions.join(customer_profiles, how="inner")

In [27]:
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(customer_data[["Region"]]).toarray()
category_encoded = encoder.fit_transform(customer_data["CategoriesPurchased"].apply(lambda x: [",".join(x)]).tolist()).toarray()

In [28]:
numeric_features = customer_data[["Quantity", "TotalValue", "TransactionCount"]].values
final_features = np.hstack([numeric_features, region_encoded, category_encoded])

In [29]:
similarity_matrix = cosine_similarity(final_features)



In [31]:
top_customers = customers["CustomerID"][:20]
lookalikes = {}

In [33]:
for idx, customer_id in enumerate(top_customers):
    similar_indices = np.argsort(-similarity_matrix[idx])[:4]  # Top 3 (excluding self)
    similar_customers = [(customers["CustomerID"][i], similarity_matrix[idx][i]) for i in similar_indices if i != idx]
    lookalikes[customer_id] = similar_customers[:3]

In [34]:
lookalike_data = [{"cust_id": cust_id, "lookalikes": lookalikes[cust_id]} for cust_id in top_customers]
lookalike_df = pd.DataFrame(lookalike_data)

lookalike_df.head()

Unnamed: 0,cust_id,lookalikes
0,C0001,"[(C0012, 0.9999999184641456), (C0102, 0.999999..."
1,C0002,"[(C0142, 0.9999995831971038), (C0034, 0.999999..."
2,C0003,"[(C0136, 0.9999997769150122), (C0075, 0.999999..."
3,C0004,"[(C0039, 0.9999999492167607), (C0194, 0.999999..."
4,C0005,"[(C0145, 0.9999998092830721), (C0173, 0.999999..."


In [37]:
lookalike_output_file = "Mayank_Gupta_Lookalike.csv"
lookalike_df.to_csv(lookalike_output_file, index=False)

lookalike_output_file

'Mayank_Gupta_Lookalike.csv'