In [20]:
# Import libraries
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#Load Datasets
customers = pd.read_csv(r"C:\Users\panch\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Users\panch\Downloads\Products.csv")
transactions = pd.read_csv(r"C:\Users\panch\Downloads\Transactions.csv")

# Merging datasets to get customer transaction history
combined = transactions.merge(customers, on='CustomerID', how='left').merge(products, on='ProductID', how='left')
display(combined.head())

customer_profile = combined.groupby('CustomerID').agg({
    'TotalValue': 'sum',       # Total spending
    'Quantity': 'sum',         # Total quantity purchased
    'ProductID': 'nunique'     # Number of unique products purchased
}).reset_index()
display(customer_profile)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


Unnamed: 0,CustomerID,TotalValue,Quantity,ProductID
0,C0001,3354.52,12,5
1,C0002,1862.74,10,4
2,C0003,2725.38,14,4
3,C0004,5354.88,23,8
4,C0005,2034.24,7,3
...,...,...,...,...
194,C0196,4982.88,12,3
195,C0197,1928.65,9,3
196,C0198,931.83,3,2
197,C0199,1979.28,9,4


In [3]:
# Merge additional customer features like Region and SignupDate
customer_profile = customer_profile.merge(customers[['CustomerID', 'Region','SignupDate']], on='CustomerID', how='left')
display(customer_profile)

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductID,Region_x,Region_y,SignupDate
0,C0001,3354.52,12,5,South America,South America,2022-07-10
1,C0002,1862.74,10,4,Asia,Asia,2022-02-13
2,C0003,2725.38,14,4,South America,South America,2024-03-07
3,C0004,5354.88,23,8,South America,South America,2022-10-09
4,C0005,2034.24,7,3,Asia,Asia,2022-08-15
...,...,...,...,...,...,...,...
194,C0196,4982.88,12,3,Europe,Europe,2022-06-07
195,C0197,1928.65,9,3,Europe,Europe,2023-03-21
196,C0198,931.83,3,2,Europe,Europe,2022-02-27
197,C0199,1979.28,9,4,Europe,Europe,2022-12-03


In [21]:
# Normalize the features for similarity calculation
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profile[['TotalValue', 'Quantity', 'ProductID']])
#display(scaled_features)

In [22]:
# Dimensionality reduction (optional) for improved performance and better visualization
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(scaled_features)
#display(reduced_features)

In [11]:
# Calculate cosine similarity between all customers
similarity_matrix = cosine_similarity(reduced_features)
print(similarity_matrix)

[[ 1.          0.8635945   0.57167192 ...  0.68450213  0.83982069
  -0.96618185]
 [ 0.8635945   1.          0.90736925 ...  0.95869008  0.99896945
  -0.96439983]
 [ 0.57167192  0.90736925  1.         ...  0.98945116  0.92551212
  -0.76390996]
 ...
 [ 0.68450213  0.95869008  0.98945116 ...  1.          0.97061272
  -0.84933748]
 [ 0.83982069  0.99896945  0.92551212 ...  0.97061272  1.
  -0.95140329]
 [-0.96618185 -0.96439983 -0.76390996 ... -0.84933748 -0.95140329
   1.        ]]


In [12]:
# Function to get top 3 lookalikes for a customer
def get_top_lookalikes(customer_id, top_n=3):
    customer_index = customer_profile[customer_profile['CustomerID'] == customer_id].index[0]
    similarity_scores = list(enumerate(similarity_matrix[customer_index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]  # Exclude the customer itself
    lookalikes = [(customer_profile.iloc[i[0]]['CustomerID'], round(i[1], 4)) for i in similarity_scores]
    return lookalikes

In [13]:
# Generate lookalikes for first 20 customers (C0001 - C0020)
lookalike_results = {}
for customer_id in customer_profile['CustomerID'][:20]:
    lookalike_results[customer_id] = get_top_lookalikes(customer_id)

In [24]:
# Create DataFrame to store results
lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': lookalikes} 
    for cust_id, lookalikes in lookalike_results.items()
])
print(lookalike_df)
# Save results to CSV
lookalike_df.to_csv('Lookalike.csv', index=False)
# Specify the full path where you want to save the CSV
lookalike_df.to_csv('C:/Users/panch/Downloads/Lookalike.csv', index=False)


# Display the first few rows
lookalike_df

   CustomerID                                         Lookalikes
0       C0001  [(C0056, 0.9998), (C0174, 0.9995), (C0055, 0.9...
1       C0002  [(C0029, 0.9998), (C0025, 0.9997), (C0031, 0.9...
2       C0003         [(C0167, 1.0), (C0042, 1.0), (C0133, 1.0)]
3       C0004   [(C0075, 1.0), (C0091, 0.9997), (C0108, 0.9996)]
4       C0005      [(C0095, 1.0), (C0197, 1.0), (C0112, 0.9999)]
5       C0006  [(C0079, 0.9999), (C0196, 0.9992), (C0117, 0.9...
6       C0007   [(C0193, 1.0), (C0120, 0.9999), (C0176, 0.9999)]
7       C0008  [(C0179, 0.9997), (C0194, 0.9977), (C0081, 0.9...
8       C0009  [(C0077, 0.9997), (C0043, 0.9996), (C0142, 0.9...
9       C0010  [(C0094, 0.9995), (C0121, 0.9993), (C0152, 0.9...
10      C0011  [(C0018, 0.9988), (C0028, 0.9982), (C0143, 0.9...
11      C0012   [(C0100, 1.0), (C0054, 0.9999), (C0087, 0.9999)]
12      C0013      [(C0105, 1.0), (C0045, 1.0), (C0041, 0.9999)]
13      C0014         [(C0015, 1.0), (C0131, 1.0), (C0151, 1.0)]
14      C0015         [(C

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0056, 0.9998), (C0174, 0.9995), (C0055, 0.9..."
1,C0002,"[(C0029, 0.9998), (C0025, 0.9997), (C0031, 0.9..."
2,C0003,"[(C0167, 1.0), (C0042, 1.0), (C0133, 1.0)]"
3,C0004,"[(C0075, 1.0), (C0091, 0.9997), (C0108, 0.9996)]"
4,C0005,"[(C0095, 1.0), (C0197, 1.0), (C0112, 0.9999)]"
5,C0006,"[(C0079, 0.9999), (C0196, 0.9992), (C0117, 0.9..."
6,C0007,"[(C0193, 1.0), (C0120, 0.9999), (C0176, 0.9999)]"
7,C0008,"[(C0179, 0.9997), (C0194, 0.9977), (C0081, 0.9..."
8,C0009,"[(C0077, 0.9997), (C0043, 0.9996), (C0142, 0.9..."
9,C0010,"[(C0094, 0.9995), (C0121, 0.9993), (C0152, 0.9..."
