In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
folder_path = "/content/drive/MyDrive/ZeoTap/"

In [4]:
customers = pd.read_csv(folder_path + "Customers.csv")
products = pd.read_csv(folder_path + "Products.csv")
transactions = pd.read_csv(folder_path + "Transactions.csv")

In [5]:
#merging the documents
transactions = pd.merge(transactions, products, on='ProductID')

In [6]:
customer_features = customers.copy()

In [7]:
customer_spending = transactions.groupby('CustomerID')['TotalValue'].agg(['sum', 'mean']).reset_index()

In [8]:
customer_spending.columns = ['CustomerID', 'TotalSpending', 'AvgTransactionValue']

In [9]:
#adding spending pattern in features
customer_features = pd.merge(customer_features, customer_spending, on='CustomerID', how='left')

In [10]:
customer_favorite = transactions.groupby(['CustomerID', 'Category']).size().reset_index(name='Count')
customer_favorite = customer_favorite.loc[customer_favorite.groupby('CustomerID')['Count'].idxmax()]
customer_favorite = customer_favorite[['CustomerID', 'Category']]

In [11]:
#adding customer favorite to features
customer_features = pd.merge(customer_features, customer_favorite, on='CustomerID' , how='left')

In [12]:
#hot-encoding
customer_features = pd.get_dummies(customer_features, columns=['Region', 'Category'], drop_first=True)

In [13]:
customer_features = customer_features.fillna(0)

In [14]:
scaler = StandardScaler()
customer_features_scaled = scaler.fit_transform(customer_features.drop(columns=['CustomerID', 'CustomerName', 'SignupDate']))

In [15]:
similarity_matrix = cosine_similarity(customer_features_scaled)

In [16]:
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

In [19]:
lookalike_map = {}

first20_cust = customer_features['CustomerID'].head(20)
for customer in first20_cust:
    similarities = similarity_df[customer].sort_values(ascending=False)
    top3_lookalikes = similarities.iloc[1:4].reset_index()
    top3_lookalikes.columns = ['LookalikeCustomerID', 'SimilarityScore']
    lookalike_map[customer] = list(zip(top3_lookalikes['LookalikeCustomerID'], top3_lookalikes['SimilarityScore']))

In [20]:
print(lookalike_map)

{'C0001': [('C0190', 0.9905906677450343), ('C0181', 0.9849202070578691), ('C0048', 0.9830851841741798)], 'C0002': [('C0088', 0.9964495430496728), ('C0134', 0.9803261391388983), ('C0106', 0.9556954898916127)], 'C0003': [('C0052', 0.995018475023706), ('C0152', 0.9913750638921187), ('C0195', 0.9660483848392644)], 'C0004': [('C0155', 0.9833884707323975), ('C0169', 0.9670081212021147), ('C0126', 0.966769697993738)], 'C0005': [('C0146', 0.9907410842986621), ('C0186', 0.9866439362832602), ('C0007', 0.9429406636150458)], 'C0006': [('C0171', 0.9804198931915868), ('C0187', 0.9747834362488274), ('C0168', 0.9717290724506122)], 'C0007': [('C0140', 0.9832238854207268), ('C0115', 0.9543462203866273), ('C0005', 0.9429406636150458)], 'C0008': [('C0038', 0.9502785460622898), ('C0189', 0.9497103534085418), ('C0160', 0.949480350493803)], 'C0009': [('C0010', 0.9838602711356388), ('C0111', 0.9831018944328871), ('C0198', 0.9818764357780986)], 'C0010': [('C0111', 0.9979590079882943), ('C0198', 0.9903846203673

In [34]:
lookalike_df = pd.DataFrame.from_dict(lookalike_map, orient='index')
lookalike_df = lookalike_df.stack().reset_index()
lookalike_df.columns = ['CustomerID', 'LookalikeIndex', 'LookalikeInfo']


In [35]:
print(lookalike_df)

   CustomerID  LookalikeIndex                LookalikeInfo
0       C0001               0  (C0190, 0.9905906677450343)
1       C0001               1  (C0181, 0.9849202070578691)
2       C0001               2  (C0048, 0.9830851841741798)
3       C0002               0  (C0088, 0.9964495430496728)
4       C0002               1  (C0134, 0.9803261391388983)
5       C0002               2  (C0106, 0.9556954898916127)
6       C0003               0   (C0052, 0.995018475023706)
7       C0003               1  (C0152, 0.9913750638921187)
8       C0003               2  (C0195, 0.9660483848392644)
9       C0004               0  (C0155, 0.9833884707323975)
10      C0004               1  (C0169, 0.9670081212021147)
11      C0004               2   (C0126, 0.966769697993738)
12      C0005               0  (C0146, 0.9907410842986621)
13      C0005               1  (C0186, 0.9866439362832602)
14      C0005               2  (C0007, 0.9429406636150458)
15      C0006               0  (C0171, 0.980419893191586

In [36]:
lookalike_df[['LookalikeCustomerID', 'SimilarityScore']] = pd.DataFrame(lookalike_df['LookalikeInfo'].tolist(), index=lookalike_df.index)
lookalike_df = lookalike_df.drop(columns=['LookalikeIndex', 'LookalikeInfo'])

In [37]:
print(lookalike_df)

   CustomerID LookalikeCustomerID  SimilarityScore
0       C0001               C0190         0.990591
1       C0001               C0181         0.984920
2       C0001               C0048         0.983085
3       C0002               C0088         0.996450
4       C0002               C0134         0.980326
5       C0002               C0106         0.955695
6       C0003               C0052         0.995018
7       C0003               C0152         0.991375
8       C0003               C0195         0.966048
9       C0004               C0155         0.983388
10      C0004               C0169         0.967008
11      C0004               C0126         0.966770
12      C0005               C0146         0.990741
13      C0005               C0186         0.986644
14      C0005               C0007         0.942941
15      C0006               C0171         0.980420
16      C0006               C0187         0.974783
17      C0006               C0168         0.971729
18      C0007               C01

In [38]:
lookalike_df.to_csv('Lookalike.csv', index=False)