In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [6]:
# Read the three source tables (customers, products, transactions) from CSV
# files located in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


In [32]:
# Both `transactions` and `products` carry a 'Price' column. Give each a
# distinct name (in place) before joining so the merged frame does not end up
# with ambiguous suffixed duplicates.
for frame, price_label in (
    (transactions, 'Price_Transactions'),
    (products, 'Price_Products'),
):
    frame.rename(columns={'Price': price_label}, inplace=True)

# Left joins keep every transaction row; customer and product attributes are
# attached by their respective keys.
transactions_with_customers = transactions.merge(customers, how='left', on='CustomerID')
data = transactions_with_customers.merge(products, how='left', on='ProductID')

In [34]:
# Preview the first five rows of the merged frame as a quick sanity check.
data.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_Transactions,CustomerName,Region,SignupDate,ProductName,Category,Price_Products
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [36]:
# Build one feature row per customer for the lookalike model:
#   TotalValue     - total amount spent across all transactions
#   Quantity       - total number of units bought
#   Price_Products - mean catalogue price of the purchased products
#   Region         - first region value seen for the customer
lookalike_data = (
    data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Price_Products=('Price_Products', 'mean'),
        Region=('Region', 'first'),
    )
    .reset_index()
)

In [38]:
# Encode the categorical Region feature.
#
# Region is nominal (no natural order). Integer category codes would impose an
# arbitrary ordering and distance between regions once the column is scaled
# and fed into cosine similarity, so one-hot encode it instead: each region
# becomes its own 0/1 indicator column named 'Region_<value>'.
# dtype=float keeps every feature column numeric for the scaler.
# The guard makes the cell safe to re-run after 'Region' has been replaced.
if 'Region' in lookalike_data.columns:
    lookalike_data = pd.get_dummies(
        lookalike_data,
        columns=['Region'],
        prefix='Region',
        dtype=float,
    )

In [40]:
# Standardise the model features. CustomerID is only an identifier, so it is
# removed before scaling; every remaining column is passed to the scaler.
feature_columns = lookalike_data.drop(columns='CustomerID')
scaler = StandardScaler()
sca_features = scaler.fit_transform(feature_columns)

In [42]:
# Pairwise cosine similarity between all customers' scaled feature vectors,
# wrapped in a DataFrame labelled by CustomerID on both rows and columns so
# scores can be looked up by customer id.
customer_ids = lookalike_data['CustomerID']
sim_matrix = cosine_similarity(sca_features)
sim_df = pd.DataFrame(data=sim_matrix, index=customer_ids, columns=customer_ids)

In [44]:
# Generate lookalike recommendations.
#
# For each of the first N_TARGETS customers, keep the TOP_N most similar
# *other* customers together with their similarity scores.
# Result shape: {customer_id: [(similar_customer_id, score), ...]}
N_TARGETS = 20  # number of customers (from the top of sim_df) to produce lookalikes for
TOP_N = 3       # number of lookalikes kept per customer

recommendations = {}
for customer_id in sim_df.index[:N_TARGETS]:
    scores_for_customer = sim_df.loc[customer_id]

    # Exclude the customer itself by label. Relying on "self sorts first" and
    # slicing [1:] is fragile: if another customer ties at the top score
    # (identical feature vector, or floating-point round-off), the customer's
    # own entry may not be in position 0 and would then be recommended while a
    # genuine match is dropped.
    other_customers = scores_for_customer[scores_for_customer.index != customer_id]

    top_matches = other_customers.sort_values(ascending=False).iloc[:TOP_N]

    # Store plain Python floats so that a later str()/repr() of the list shows
    # bare numbers rather than NumPy scalar reprs.
    recommendations[customer_id] = [
        (similar_id, float(score)) for similar_id, score in top_matches.items()
    ]

In [48]:
# Save the recommendations to CSV: one row per customer, with that customer's
# list of (similar customer id, score) pairs stored in its string form.
lookalike_df = pd.DataFrame({
    # Column header was previously misspelled as 'CistomerID'.
    'CustomerID': list(recommendations),
    'SimilarCustomers': [str(matches) for matches in recommendations.values()],
})
lookalike_df.to_csv('kalyani_botla_Lookalike.csv', index=False)