In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np
import seaborn as sns

In [2]:
# Load the three raw tables (customers, products, transactions) from CSV
# files located in the current working directory.
customer_df, product_df, transaction_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Enrich every transaction with its customer attributes and its product
# attributes. Left joins keep all transaction rows even when a key has no
# match in the lookup table.
merged_data = (
    transaction_df
    .merge(customer_df, how='left', on='CustomerID')
    .merge(product_df, how='left', on='ProductID')
)

In [4]:
# Preview the first rows of the merged transaction / customer / product table.
merged_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [5]:
# Collapse the transaction-level table to one row per customer:
#   - total and average TotalValue,
#   - total Quantity,
#   - the most frequently purchased product category.
# Because TotalValue has two aggregations, the result has two-level columns.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg({
        'TotalValue': ['sum', 'mean'],
        'Quantity': 'sum',
        # mode() returns the most frequent value(s); [0] takes the first one.
        'Category': lambda categories: categories.mode()[0],
    })
    .reset_index()
)

In [6]:
# Display the aggregated per-customer frame; at this point its columns are
# still two-level (source column, aggregation name).
customer_features

Unnamed: 0_level_0,CustomerID,TotalValue,TotalValue,Quantity,Category
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,<lambda>
0,C0001,3354.52,670.904000,12,Electronics
1,C0002,1862.74,465.685000,10,Clothing
2,C0003,2725.38,681.345000,14,Home Decor
3,C0004,5354.88,669.360000,23,Books
4,C0005,2034.24,678.080000,7,Electronics
...,...,...,...,...,...
194,C0196,4982.88,1245.720000,12,Home Decor
195,C0197,1928.65,642.883333,9,Electronics
196,C0198,931.83,465.915000,3,Clothing
197,C0199,1979.28,494.820000,9,Electronics


In [7]:
# Replace the two-level (column, aggregation) header produced by the
# aggregation with flat, descriptive names, in the same positional order.
customer_features.columns = [
    'CustomerID',
    'TotalSpending',
    'AvgSpending',
    'TotalQuantity',
    'FavoriteCategory',
]

In [8]:
# One-hot encode the favourite category so it can be used as a numeric
# feature. drop_first=True omits the first category level, so customers in
# that level are represented by all indicator columns being False.
customer_features = pd.get_dummies(
    data=customer_features,
    columns=['FavoriteCategory'],
    drop_first=True,
)

In [9]:
# Display the feature table after one-hot encoding FavoriteCategory.
customer_features

Unnamed: 0,CustomerID,TotalSpending,AvgSpending,TotalQuantity,FavoriteCategory_Clothing,FavoriteCategory_Electronics,FavoriteCategory_Home Decor
0,C0001,3354.52,670.904000,12,False,True,False
1,C0002,1862.74,465.685000,10,True,False,False
2,C0003,2725.38,681.345000,14,False,False,True
3,C0004,5354.88,669.360000,23,False,False,False
4,C0005,2034.24,678.080000,7,False,True,False
...,...,...,...,...,...,...,...
194,C0196,4982.88,1245.720000,12,False,False,True
195,C0197,1928.65,642.883333,9,False,True,False
196,C0198,931.83,465.915000,3,True,False,False
197,C0199,1979.28,494.820000,9,False,True,False


In [10]:
# Standardise every feature column so that spending totals, quantities and
# category indicators contribute on a comparable scale. CustomerID is the
# identifier (first column), not a feature, so it is left out.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    customer_features.drop(columns=['CustomerID'])
)

In [11]:
# Pairwise cosine similarity between the rows of scaled_features: entry
# [i, j] compares row i with row j, so row/column order follows the row
# order of customer_features.
similarity_matrix = cosine_similarity(scaled_features)

In [12]:
def get_top_3_similar(customers_list, similarity_matrix, top_n=3):
    """Return the most similar other customers for every customer.

    Parameters
    ----------
    customers_list : sequence
        Customer identifiers. Position ``i`` must correspond to row ``i`` and
        column ``i`` of ``similarity_matrix``.
    similarity_matrix : array-like, 2-D
        Pairwise similarity scores; entry ``[i, j]`` is the similarity between
        customer ``i`` and customer ``j``.
    top_n : int, default 3
        Maximum number of lookalikes returned per customer.

    Returns
    -------
    dict
        Maps each customer id to a list of ``(other_customer_id, score)``
        tuples ordered from most to least similar. A list is shorter than
        ``top_n`` only when fewer other customers exist.
    """
    similarity_matrix = np.asarray(similarity_matrix, dtype=float)
    lookalike_map = {}
    for i, cust_id in enumerate(customers_list):
        # Similarity scores of the current customer against everyone.
        scores = similarity_matrix[i]
        # Sort descending. Negating and using a stable sort breaks ties by
        # position and keeps NaN scores at the end instead of the front.
        ranked = np.argsort(-scores, kind='stable')
        # Exclude the customer by index rather than assuming their own score
        # is ranked first: a tie or rounding error can put someone else on top.
        top_indices = [j for j in ranked if j != i][:top_n]
        # Map customer to their lookalikes and the corresponding scores.
        lookalike_map[cust_id] = [(customers_list[j], scores[j]) for j in top_indices]
    return lookalike_map

In [13]:
# The rows/columns of similarity_matrix follow customer_features (one row per
# customer that has at least one transaction, in groupby order), NOT the row
# order of Customers.csv. Use the feature table's own ID order so that
# position i in the ID list is guaranteed to match row i of the matrix.
all_feature_customers = customer_features['CustomerID'].tolist()

# Rank against the full matrix so lookalikes are drawn from every customer,
# not only from the first 20.
full_lookalike_map = get_top_3_similar(all_feature_customers, similarity_matrix)

# Keep only the first 20 customers of Customers.csv, looked up by ID. A
# customer without transactions has no feature vector and therefore gets an
# empty lookalike list.
first_20_customers = customer_df['CustomerID'][:20].tolist()
lookalike_map = {
    cust_id: full_lookalike_map.get(cust_id, [])
    for cust_id in first_20_customers
}

In [14]:
# Build the export table: one row per customer, with the lookalike list
# serialised as text. Scores are cast to plain Python floats first, because
# str() on a list uses each element's repr and NumPy >= 2 renders NumPy
# scalars as "np.float64(...)", which would end up verbatim in the CSV.
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_map),
    'Lookalikes': [
        str([(other_id, float(score)) for other_id, score in matches])
        for matches in lookalike_map.values()
    ],
})
lookalike_df.to_csv('Lookalike.csv', index=False)