In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json

In [2]:
# Load the three input tables from CSV files in the working directory,
# in the order customers -> products -> transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default join type (inner).
merged_df = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)


In [4]:
def _most_frequent(values):
    """Return the first value reported by ``values.mode()``, or None if there is none."""
    modes = values.mode()
    return None if modes.empty else modes[0]


# Collapse the merged transactions into one profile row per customer.
customer_profiles = (
    merged_df
    .groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',         # total spending
        'Quantity': 'sum',           # total quantity purchased
        'ProductID': 'nunique',      # number of distinct products purchased
        'Category': _most_frequent,  # most purchased category
        'Region': 'first',           # first Region value seen for the customer
    })
    .reset_index()
)

In [5]:
# Profile columns to standardize. Note that 'ProductID' here is the
# per-customer count of distinct products, not a product identifier.
numeric_features = [
    'TotalValue',
    'Quantity',
    'ProductID',
]
# Profile columns to one-hot encode.
categorical_features = [
    'Region',
    'Category',
]

In [6]:
# Preprocessing pipeline: standardize the numeric columns and one-hot encode
# the categorical ones. Columns not listed in either step are not passed to
# these transformers.
feature_steps = [
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(), categorical_features),
]
transformer = ColumnTransformer(feature_steps)

In [7]:
# Fit the preprocessing on the customer profiles, then encode those same
# profiles into the feature matrix (one row per customer).
customer_features = transformer.fit(customer_profiles).transform(customer_profiles)

In [8]:
# Pairwise cosine similarity between every pair of customer feature rows;
# entry [i][j] compares customer row i with customer row j.
similarity_matrix = cosine_similarity(customer_features, customer_features)

In [9]:
# Build {customer_id: [(lookalike_id, similarity_score), ...]} holding each
# customer's three most similar *other* customers, best match first.
customer_ids = customer_profiles['CustomerID'].tolist()
lookalike_map = {}
for idx, customer_id in enumerate(customer_ids):
    scores = similarity_matrix[idx]
    # Rank every row by descending similarity. The stable sort keeps tied
    # scores in a deterministic order (lower row position first).
    ranked = np.argsort(-scores, kind='stable')
    # Remove the customer's own row by index rather than by rank: when another
    # customer ties with the self-similarity, the customer's own row is not
    # guaranteed to be ranked first, so slicing off position 0 could keep the
    # customer in their own list and drop a genuine match.
    similar_indices = ranked[ranked != idx][:3]
    lookalike_map[customer_id] = [
        (customer_ids[i], scores[i]) for i in similar_indices
    ]

In [10]:
def _encode_lookalikes(pairs):
    """Serialize (customer_id, score) pairs to a JSON array, scores rounded to 2 decimals."""
    return json.dumps([
        {'customer_id': other_id, 'score': round(score, 2)}
        for other_id, score in pairs
    ])


# One output row per customer: the customer's id plus a JSON string listing
# that customer's lookalikes and their similarity scores.
lookalike_df = pd.DataFrame([
    {'cust_id': cust_id, 'lookalikes': _encode_lookalikes(pairs)}
    for cust_id, pairs in lookalike_map.items()
])


In [13]:
# Persist the lookalike table to CSV without writing the row index.
lookalike_df.to_csv(path_or_buf='Mirdul_Agrawal_Looklike.csv', index=False)

In [12]:
# Preview up to the first 20 rows of the lookalike table.
lookalike_df.head(n=20)

Unnamed: 0,cust_id,lookalikes
0,C0001,"[{""customer_id"": ""C0048"", ""score"": 0.98}, {""cu..."
1,C0002,"[{""customer_id"": ""C0088"", ""score"": 0.96}, {""cu..."
2,C0003,"[{""customer_id"": ""C0031"", ""score"": 0.9}, {""cus..."
3,C0004,"[{""customer_id"": ""C0087"", ""score"": 0.96}, {""cu..."
4,C0005,"[{""customer_id"": ""C0186"", ""score"": 1.0}, {""cus..."
5,C0006,"[{""customer_id"": ""C0168"", ""score"": 0.97}, {""cu..."
6,C0007,"[{""customer_id"": ""C0005"", ""score"": 0.99}, {""cu..."
7,C0008,"[{""customer_id"": ""C0065"", ""score"": 0.88}, {""cu..."
8,C0009,"[{""customer_id"": ""C0198"", ""score"": 0.99}, {""cu..."
9,C0010,"[{""customer_id"": ""C0111"", ""score"": 0.89}, {""cu..."
