In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
# Load the datasets
customers_df = pd.read_csv('Customers.csv')  # Customer data (including profiles)
products_df = pd.read_csv('Products.csv')  # Product data
transactions_df = pd.read_csv('Transactions.csv')  # Transaction data

In [2]:
# Merge customer and product data with transactions
customer_transactions = pd.merge(transactions_df, customers_df, on='CustomerID', how='inner')
customer_transactions = pd.merge(customer_transactions, products_df, on='ProductID', how='inner')
# Aggregate transaction data by customer
transaction_summary = customer_transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spending
    'ProductID': 'nunique',  # Number of unique products purchased
    'TransactionID': 'count',  # Frequency of transactions
    'TransactionDate': 'max'  # Recency of the last purchase
}).rename(columns={'TotalValue': 'TotalSpending', 'ProductID': 'UniqueProducts', 'TransactionID': 'PurchaseFrequency', 'TransactionDate': 'LastPurchaseDate'})

In [3]:
# Convert 'LastPurchaseDate' to datetime and calculate recency
transaction_summary['LastPurchaseDate'] = pd.to_datetime(transaction_summary['LastPurchaseDate'], errors='coerce')
transaction_summary['Recency'] = (pd.to_datetime('today') - pd.to_datetime(transaction_summary['LastPurchaseDate'])).dt.days
transaction_summary.drop(columns=['LastPurchaseDate'], inplace=True)
# Merge with customer region information
profile_data = pd.merge(customers_df[['CustomerID', 'Region']], transaction_summary, on='CustomerID', how='inner')
# One-hot encode 'Region'
profile_data = pd.get_dummies(profile_data, columns=['Region'], drop_first=True)
# Normalize the data (excluding 'CustomerID')
scaler = StandardScaler()
scaled_data = scaler.fit_transform(profile_data.drop(columns=['CustomerID']))
# Display processed data
print(profile_data.head())

  CustomerID  TotalSpending  UniqueProducts  PurchaseFrequency  Recency  \
0      C0001        3354.52               5                  5       82   
1      C0002        1862.74               4                  4       52   
2      C0003        2725.38               4                  4      152   
3      C0004        5354.88               8                  8       32   
4      C0005        2034.24               3                  3       81   

   Region_Europe  Region_North America  Region_South America  
0          False                 False                  True  
1          False                 False                 False  
2          False                 False                  True  
3          False                 False                  True  
4          False                 False                 False  


In [4]:
# Calculate Cosine Similarity Matrix
similarity_matrix = cosine_similarity(scaled_data)
# Check similarity between the first customer and others
print(similarity_matrix[0])

[ 1.         -0.04803792  0.79464575  0.61829043 -0.08425857  0.88179824
 -0.17165257 -0.19250443 -0.34825802 -0.38064664  0.92443266  0.74958455
  0.661982   -0.2892695  -0.31813323 -0.43250839 -0.29027758 -0.47454917
 -0.40167397 -0.29297868 -0.08740635 -0.1916711  -0.40827851 -0.39706972
  0.70198557 -0.45804939 -0.15716055 -0.04059295 -0.30439205 -0.35303734
  0.86939843  0.67738084 -0.28357356 -0.42250142 -0.44474777 -0.35016179
 -0.51001388 -0.36057784  0.90365394 -0.189138   -0.29672717 -0.3568701
 -0.11233312 -0.41646702 -0.1481727  -0.3718409  -0.40134387  0.96082202
 -0.33211486 -0.36078064 -0.30474226  0.74362343 -0.41596018 -0.08221804
 -0.46847238 -0.03025463 -0.36228523 -0.28647564 -0.27820847 -0.27365252
 -0.32749314 -0.38492169 -0.33397651 -0.41244072 -0.18372754 -0.45313449
 -0.43609028 -0.24420299 -0.39747789 -0.48447068  0.63609048 -0.43570913
 -0.36856643 -0.44573902 -0.23340728  0.9366327   0.68376746 -0.02526999
 -0.47084319 -0.15337962 -0.43500822  0.54748355  0.

In [5]:
def get_top_similar_customers(customer_id, similarity_matrix, top_n=3):
    customer_idx = profile_data[profile_data['CustomerID'] == customer_id].index[0]
    similarity_scores = similarity_matrix[customer_idx]
    # Get indices of top similar customers (exclude self)
    similar_customer_idx = similarity_scores.argsort()[-top_n-1:-1]
    similar_customers = profile_data.iloc[similar_customer_idx][['CustomerID']]
    scores = similarity_scores[similar_customer_idx]
    recommendations = list(zip(similar_customers['CustomerID'], scores))
    return recommendations
# Example: Get top 3 similar customers for CustomerID 'C0001'
recommendations = get_top_similar_customers('C0001', similarity_matrix)
print(recommendations)

[('C0048', 0.9608220170182876), ('C0107', 0.9917930589676538), ('C0152', 0.9944852815216094)]


In [6]:
# Generate Lookalike CSV for the first 20 customers
lookalike_list = []
for customer_id in profile_data['CustomerID'][:20]:  # For first 20 customers (C0001 - C0020)
    recommendations = get_top_similar_customers(customer_id, similarity_matrix)
    for rec in recommendations:
        lookalike_list.append([customer_id, rec[0], rec[1]])
# Create DataFrame for the Lookalike Recommendations
lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])
# Save the Lookalike recommendations to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike recommendations saved to Lookalike.csv")

Lookalike recommendations saved to Lookalike.csv
