In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

customers = pd.read_csv('/content/drive/My Drive/Zeotap/Customers.csv')
products = pd.read_csv('/content/drive/My Drive/Zeotap/Products.csv')
transactions = pd.read_csv('/content/drive/My Drive/Zeotap/Transactions.csv')

Mounted at /content/drive


In [4]:
df = transactions.merge(customers, on="CustomerID", how="left")
df = df.merge(products, on="ProductID", how="left")

df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [5]:
df = df.drop(columns=['Price_x'])
df = df.rename(columns={'Price_y': 'Price'})

df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,CustomerName,Region,SignupDate,ProductName,Category,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


# Normalizing Total Vale, Quantity and Price

In [7]:
from sklearn.preprocessing import MinMaxScaler

num_cols = ["Quantity", "Price", "TotalValue"]
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [10]:
df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,CustomerName,Region,SignupDate,ProductName,Category,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,0.0,0.144104,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,0.590849
1,T00112,C0146,P067,2024-05-27 22:23:54,0.0,0.144104,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,0.590849
2,T00166,C0127,P067,2024-04-25 07:38:55,0.0,0.144104,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,0.590849
3,T00272,C0087,P067,2024-03-26 22:55:37,0.333333,0.29635,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,0.590849
4,T00363,C0070,P067,2024-03-21 15:10:10,0.666667,0.448596,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,0.590849


# Encoding "Region" and "Category"

In [11]:
ndf = pd.get_dummies(df, columns=['Region', 'Category'], drop_first=False)

ndf.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,CustomerName,SignupDate,ProductName,Price,Region_Asia,Region_Europe,Region_North America,Region_South America,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,T00001,C0199,P067,2024-08-25 12:38:23,0.0,0.144104,Andrea Jenkins,2022-12-03,ComfortLiving Bluetooth Speaker,0.590849,False,True,False,False,False,False,True,False
1,T00112,C0146,P067,2024-05-27 22:23:54,0.0,0.144104,Brittany Harvey,2024-09-04,ComfortLiving Bluetooth Speaker,0.590849,True,False,False,False,False,False,True,False
2,T00166,C0127,P067,2024-04-25 07:38:55,0.0,0.144104,Kathryn Stevens,2024-04-04,ComfortLiving Bluetooth Speaker,0.590849,False,True,False,False,False,False,True,False
3,T00272,C0087,P067,2024-03-26 22:55:37,0.333333,0.29635,Travis Campbell,2024-04-11,ComfortLiving Bluetooth Speaker,0.590849,False,False,False,True,False,False,True,False
4,T00363,C0070,P067,2024-03-21 15:10:10,0.666667,0.448596,Timothy Perez,2022-03-15,ComfortLiving Bluetooth Speaker,0.590849,False,True,False,False,False,False,True,False


In [20]:
from sklearn.neighbors import NearestNeighbors

numeric_columns = ndf.select_dtypes(include=['float64', 'int64']).columns

features = ndf[numeric_columns]

knn = NearestNeighbors(n_neighbors=4, metric='cosine')
knn.fit(features)

lookalikes = {}

for cust_id in ndf['CustomerID'][:20]:
    customer_idx = ndf[ndf['CustomerID'] == cust_id].index[0]

    distances, indices = knn.kneighbors(features.iloc[customer_idx].values.reshape(1, -1))

    similar_customers = []
    similarity_scores = []

    for i, idx in enumerate(indices[0]):
        if ndf['CustomerID'].iloc[idx] != cust_id:
            similar_customers.append(ndf['CustomerID'].iloc[idx])
            similarity_scores.append(1 - distances[0][i])

    lookalikes[cust_id] = list(zip(similar_customers[:3], similarity_scores[:3]))

lookalike_df = pd.DataFrame(list(lookalikes.items()), columns=['CustomerID', 'Lookalikes'])

In [21]:
lookalike_df.to_csv('Lakshit_Upreti_Lookalike.csv', index=False)

lookalike_df

Unnamed: 0,CustomerID,Lookalikes
0,C0199,"[(C0179, 1.0), (C0100, 1.0), (C0177, 1.0)]"
1,C0146,"[(C0179, 1.0), (C0100, 1.0), (C0177, 1.0)]"
2,C0127,"[(C0179, 1.0), (C0100, 1.0), (C0177, 1.0)]"
3,C0087,"[(C0155, 1.0), (C0130, 1.0), (C0075, 1.0)]"
4,C0070,"[(C0051, 1.0), (C0157, 1.0), (C0195, 1.0)]"
5,C0188,"[(C0179, 1.0), (C0100, 1.0), (C0177, 1.0)]"
6,C0195,"[(C0070, 1.0), (C0051, 1.0), (C0157, 1.0)]"
7,C0008,"[(C0179, 1.0), (C0100, 1.0), (C0177, 1.0)]"
8,C0157,"[(C0070, 1.0), (C0051, 1.0), (C0195, 1.0)]"
9,C0130,"[(C0087, 1.0), (C0155, 1.0), (C0075, 1.0)]"
