<h1 style="text-align:center; color:#daa520">Lookalike Model</h1>

## 1. Data Preparation

In [69]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import os

### Create data directory if it doesn't exist

In [70]:
if not os.path.exists('data'):
    os.makedirs('data')

### Load the datasets

In [71]:
try:
    customers_df = pd.read_csv('data/Customers.csv')
    products_df = pd.read_csv('data/Products.csv')
    transactions_df = pd.read_csv('data/Transactions.csv')
    print("Data loaded successfully!")
except FileNotFoundError:
    print("Error: Ensure that the CSV files are in the 'data' folder.")
    exit()

Data loaded successfully!


### Convert date columns to datetime objects

In [72]:
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

### Merge transactions with products

In [73]:
transactions_df = pd.merge(transactions_df, products_df, on='ProductID', how='left')

### Merge transactions with customers

In [74]:
transactions_df = pd.merge(transactions_df, customers_df, on='CustomerID', how='left')

In [75]:
print("Merged Dataframe Head:")
transactions_df.head()

Merged Dataframe Head:


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


## 2. Feature Engineering

### One-Hot Encode customer region

In [76]:
transactions_df = pd.get_dummies(transactions_df, columns=['Region'], prefix='Region')
transactions_df

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,SignupDate,Region_Asia,Region_Europe,Region_North America,Region_South America
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,2022-12-03,False,True,False,False
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,2024-09-04,True,False,False,False
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,2024-04-04,False,True,False,False
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,2024-04-11,False,False,False,True
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,2022-03-15,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,SoundWave Smartwatch,Electronics,459.86,Jacob Holt,2022-01-22,False,False,False,True
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,SoundWave Smartwatch,Electronics,459.86,Mrs. Kimberly Wright,2024-04-07,False,False,True,False
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,SoundWave Smartwatch,Electronics,459.86,Tyler Haynes,2024-09-21,False,False,True,False
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,SoundWave Smartwatch,Electronics,459.86,Joshua Hamilton,2024-11-11,True,False,False,False


### Aggregate product categories for each customer

In [77]:
customer_categories = transactions_df.groupby('CustomerID')['Category'].apply(lambda x: ','.join(set(x))).reset_index()
customer_categories = pd.concat([customer_categories['CustomerID'], customer_categories['Category'].str.get_dummies(sep=',')], axis=1)


### Aggregate total transaction value for each customer

In [78]:
customer_total_value = transactions_df.groupby('CustomerID')['TotalValue'].sum().reset_index()

### Calculate days since signup

In [79]:
today = pd.to_datetime('today')
customers_df['DaysSinceSignup'] = (today - customers_df['SignupDate']).dt.days

### Merge features into a single dataframe

In [80]:
customer_features = pd.merge(customers_df[['CustomerID', 'DaysSinceSignup']], customer_categories, on='CustomerID', how='left')
customer_features = pd.merge(customer_features, customer_total_value, on='CustomerID', how='left')
customer_features = customer_features.fillna(0)

print("\nCustomer Features Head:")
customer_features.head()


Customer Features Head:


Unnamed: 0,CustomerID,DaysSinceSignup,Books,Clothing,Electronics,Home Decor,TotalValue
0,C0001,930,1.0,0.0,1.0,1.0,3354.52
1,C0002,1077,0.0,1.0,0.0,1.0,1862.74
2,C0003,324,0.0,1.0,1.0,1.0,2725.38
3,C0004,839,1.0,0.0,1.0,1.0,5354.88
4,C0005,894,0.0,0.0,1.0,1.0,2034.24


## 3. Feature Scaling

### Scale numerical features

In [81]:
numerical_features = ['DaysSinceSignup', 'TotalValue'] + list(customer_categories.columns[1:])
scaler = StandardScaler()
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])

print("\nScaled Customer Features Head:")
customer_features.head()


Scaled Customer Features Head:


Unnamed: 0,CustomerID,DaysSinceSignup,Books,Clothing,Electronics,Home Decor,TotalValue
0,C0001,1.152884,0.546536,-1.441153,0.654654,0.67028,-0.051884
1,C0002,1.605593,-1.829707,0.693889,-1.527525,0.67028,-0.862714
2,C0003,-0.713387,-1.829707,0.693889,0.654654,0.67028,-0.393842
3,C0004,0.872636,0.546536,-1.441153,0.654654,0.67028,1.035375
4,C0005,1.042017,-1.829707,-1.441153,0.654654,0.67028,-0.769499


## 4. Similariry Calculations

### Calculate cosine similarity

In [82]:
customer_ids = customer_features['CustomerID']
customer_features = customer_features.drop('CustomerID', axis=1)
similarity_matrix = cosine_similarity(customer_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

print("\nSimilarity Matrix Head:")
similarity_df.head()


Similarity Matrix Head:


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,-0.097035,-0.387726,0.870833,0.528335,-0.438626,0.558585,-0.151792,-0.417001,-0.340722,...,0.172324,-0.030118,0.16969,-0.254799,-0.484389,0.029214,0.424299,-0.099346,0.480714,0.087283
C0002,-0.097035,1.0,0.338544,-0.287372,0.464292,0.166254,0.466726,-0.504002,0.317143,0.331323,...,-0.673356,-0.050633,0.042707,-0.54078,0.111643,0.467697,0.369169,0.532846,0.420701,-0.321026
C0003,-0.387726,0.338544,1.0,-0.411784,0.425434,-0.184755,0.387268,0.168566,0.574233,-0.438229,...,-0.341624,-0.28688,-0.68423,0.246949,0.931738,-0.409619,0.531374,0.3522,0.480832,-0.001482
C0004,0.870833,-0.287372,-0.411784,1.0,0.324466,-0.282074,0.396231,0.048765,-0.623146,-0.531303,...,0.156126,-0.24495,0.094601,-0.102501,-0.336001,0.129515,0.236232,-0.354187,0.283039,0.305262
C0005,0.528335,0.464292,0.425434,0.324466,1.0,-0.604937,0.992511,-0.490719,0.337026,-0.353779,...,-0.021902,-0.161516,-0.026931,-0.498012,0.223705,-0.276279,0.971505,0.480197,0.993118,-0.370925


## 5. Generate Lookalike Recommendations

In [83]:
def get_top_lookalikes(customer_id, similarity_df, n=3):
    """
    Returns the top n similar customers for a given customer ID.
    """
    if customer_id not in similarity_df.columns:
        return []
    similar_customers = similarity_df[customer_id].sort_values(ascending=False)[1:n+1]
    return [(cust, score) for cust, score in similar_customers.items()]

# Generate lookalikes for the first 20 customers
lookalike_map = defaultdict(list)
for i in range(1, 21):
    customer_id = f'C{i:04d}'
    lookalikes = get_top_lookalikes(customer_id, similarity_df)
    lookalike_map[customer_id] = lookalikes

print("\nLookalike Map:")
lookalike_map


Lookalike Map:


defaultdict(list,
            {'C0001': [('C0152', 0.9956223606729112),
              ('C0174', 0.9801799111622593),
              ('C0004', 0.8708332712750257)],
             'C0002': [('C0134', 0.9902721372198077),
              ('C0159', 0.9874395608358454),
              ('C0062', 0.9557793921588279)],
             'C0003': [('C0129', 0.9954307962116467),
              ('C0031', 0.9870087229912105),
              ('C0026', 0.9787984036384145)],
             'C0004': [('C0148', 0.9660708716294614),
              ('C0001', 0.8708332712750257),
              ('C0152', 0.8615483904340757)],
             'C0005': [('C0199', 0.9931175568971136),
              ('C0007', 0.9925108867751716),
              ('C0197', 0.971504925376399)],
             'C0006': [('C0185', 0.9679928795677428),
              ('C0187', 0.9608845776686528),
              ('C0124', 0.9542906155503288)],
             'C0007': [('C0005', 0.9925108867751716),
              ('C0199', 0.9764700803727494),
              

## 6. Create Lookalike CSV

In [84]:
lookalike_df = pd.DataFrame(lookalike_map.items(), columns=['cust_id', 'lookalikes'])
lookalike_df['lookalikes'] = lookalike_df['lookalikes'].apply(lambda x: [(cust, str(score)) for cust, score in x])
lookalike_df.to_csv('Gaurav_Wankhede_Lookalike.csv', index=False)

print("\nLookalike.csv created successfully!")


Lookalike.csv created successfully!
