In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/ecommerce-transactions/Products.csv
/kaggle/input/ecommerce-transactions/Customers.csv
/kaggle/input/ecommerce-transactions/Transactions.csv


In [2]:
# Load the three raw tables from the Kaggle dataset.
customers_df = pd.read_csv('/kaggle/input/ecommerce-transactions/Customers.csv')
product_df = pd.read_csv('/kaggle/input/ecommerce-transactions/Products.csv')
transactions_df = pd.read_csv('/kaggle/input/ecommerce-transactions/Transactions.csv')

# Build one wide frame: customers -> their transactions -> the purchased product.
# Both joins are inner joins, so rows without a match on the key are dropped.
df = (
    customers_df
    .merge(transactions_df, on='CustomerID', how='inner')
    .merge(product_df, on='ProductID', how='inner')
)

In [3]:
# Restrict the analysis to the first 20 customer IDs (C0001 - C0020).
target_ids = [f'C{n:04}' for n in range(1, 21)]
in_target = df['CustomerID'].isin(target_ids)
customers_subset = df[in_target]
customers_subset.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2,114.6,57.3,SoundWave Cookbook,Books,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3,412.62,137.54,HomeSense Wall Art,Home Decor,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 00:01:00,2,614.94,307.47,SoundWave Headphones,Electronics,307.47
3,C0001,Lawrence Carroll,South America,2022-07-10,T00445,P083,2024-05-07 03:11:44,2,911.44,455.72,ActiveWear Smartwatch,Electronics,455.72
4,C0001,Lawrence Carroll,South America,2022-07-10,T00436,P029,2024-11-02 17:04:16,3,1300.92,433.64,TechPro Headphones,Electronics,433.64


In [4]:
# Per-customer aggregates: total spend (sum of TotalValue) and
# transaction count (number of non-null TransactionID values).
per_customer = customers_subset.groupby('CustomerID')
customer_spend = per_customer['TotalValue'].sum().reset_index()
customer_frequency = per_customer['TransactionID'].count().reset_index()

In [5]:
# Join total spend and transaction count into one row per customer.
customer_features = customer_spend.merge(customer_frequency, on='CustomerID')

# Standardise both feature columns so neither dominates purely because of its scale.
scaler = StandardScaler()
feature_frame = customer_features[['TotalValue', 'TransactionID']]
scaled_features = scaler.fit_transform(feature_frame)

# Pairwise cosine similarity between the rows of the scaled feature matrix;
# row/column k corresponds to row k of customer_features.
similarity_matrix = cosine_similarity(scaled_features)

In [6]:
# Get the top 3 most similar customers for every customer in customer_features.
#
# Result: lookalikes maps CustomerID -> list of (other CustomerID, similarity score),
# ordered from most to least similar. The list is shorter than TOP_N only when
# fewer than TOP_N other customers exist.
TOP_N = 3
lookalikes = {}
customer_ids = customer_features['CustomerID'].to_numpy()
# Iterate over the rows that actually exist instead of a hard-coded range(20):
# customers without any transaction are dropped by the inner merges, so
# customer_features may hold fewer than 20 rows.
for i, customer_id in enumerate(customer_ids):
    scores = similarity_matrix[i]
    # Rank all customers from most to least similar (stable, so ties keep row order).
    ranked = np.argsort(-scores, kind='stable')
    # Drop the customer itself by index rather than assuming it sorts last:
    # ties at 1.0, rounding, or an all-zero scaled row can put it elsewhere.
    ranked = ranked[ranked != i][:TOP_N]
    # Store plain Python floats so the stringified list written to CSV stays
    # "0.99..." rather than "np.float64(0.99...)" under NumPy >= 2.
    lookalikes[customer_id] = [(customer_ids[j], float(scores[j])) for j in ranked]

In [7]:
# Flatten the lookalikes mapping into one record per customer.
lookalike_map = [
    {'CustomerID': cust_id, 'Lookalikes': similars}
    for cust_id, similars in lookalikes.items()
]

# Build the result table, write it to Lookalike.csv (without the index column),
# and display it as the cell output.
lookalike_df = pd.DataFrame(lookalike_map)
lookalike_df.to_csv('Lookalike.csv', index=False)
lookalike_df

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0012, 0.9993606011582863), (C0013, 0.994605..."
1,C0002,"[(C0010, 0.9992480644153787), (C0009, 0.994294..."
2,C0003,"[(C0020, 0.9973033114146811), (C0005, 0.995331..."
3,C0004,"[(C0017, 0.9884753148884934), (C0012, 0.988210..."
4,C0005,"[(C0020, 0.9997306439937342), (C0015, 0.997259..."
5,C0006,"[(C0018, 0.8548018682152774), (C0016, 0.786606..."
6,C0007,"[(C0003, 0.976733914448762), (C0020, 0.9583611..."
7,C0008,"[(C0017, 0.9500218674850054), (C0004, 0.891814..."
8,C0009,"[(C0002, 0.9942948290297401), (C0010, 0.989411..."
9,C0010,"[(C0002, 0.9992480644153787), (C0009, 0.989411..."
