In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

In [7]:
# Read the three input tables (customers, products, transactions) from CSV
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [8]:
# Data Preprocessing: parse the date columns into pandas datetimes
customers_df = customers_df.assign(
    SignupDate=pd.to_datetime(customers_df['SignupDate'])
)
transactions_df = transactions_df.assign(
    TransactionDate=pd.to_datetime(transactions_df['TransactionDate'])
)

In [9]:
# Build one row per transaction, enriched with the matching customer and
# product attributes. Left joins keep every transaction even when the
# customer or product lookup has no match.
df = (
    transactions_df
    .merge(customers_df, on='CustomerID', how='left')
    .merge(products_df, on='ProductID', how='left')
)

In [10]:
# Feature Engineering
# Monetary feature: sum of TotalValue over all of a customer's transactions
customer_spending = (
    df.groupby('CustomerID')['TotalValue']
    .sum()
    .rename('TotalSpent')
    .reset_index()
)

In [11]:
# Frequency feature: number of transaction rows recorded for each customer
customer_transactions = (
    df.groupby('CustomerID')
    .size()
    .rename('TransactionCount')
    .reset_index()
)

In [12]:
# Recency of Last Purchase: whole days between each customer's latest purchase
# and a fixed snapshot date (the most recent transaction in the data).
# Using the data's own latest date instead of datetime.now() keeps the feature
# identical on every run; a wall-clock reference shifts the day boundaries
# depending on when the notebook is executed.
snapshot_date = df['TransactionDate'].max()
recent_purchase = df.groupby('CustomerID')['TransactionDate'].max().reset_index()
# Vectorised timedelta arithmetic replaces the per-row apply/lambda.
recent_purchase['Recency'] = (snapshot_date - recent_purchase['TransactionDate']).dt.days

In [13]:
# Assemble the per-customer feature table: spending, transaction count, recency.
# Only the Recency column is taken from recent_purchase; the raw date is dropped.
recency_only = recent_purchase[['CustomerID', 'Recency']]
customer_features = (
    customer_spending
    .merge(customer_transactions, on='CustomerID')
    .merge(recency_only, on='CustomerID')
)

In [14]:
# Standardise the three numeric features so each contributes on the same scale
feature_columns = ['TotalSpent', 'TransactionCount', 'Recency']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[feature_columns])
customer_features_scaled = pd.DataFrame(scaled_values, columns=feature_columns)

In [15]:
# Pairwise cosine similarity between the scaled feature vectors of all customers;
# row i / column j holds the similarity between customer rows i and j.
scaled_feature_matrix = customer_features_scaled.to_numpy()
similarity_matrix = cosine_similarity(scaled_feature_matrix)

In [19]:
# Find the top 3 most similar customers for the first 20 rows of customer_features.
# NOTE(review): these rows are C0001 to C0020 only if every one of those
# customers has at least one transaction — customers without transactions
# never enter customer_features. Confirm against the data.
lookalike_dict = {}
customer_ids = customer_features['CustomerID'].tolist()

# Bound the loop so a table with fewer than 20 customers does not raise IndexError.
for i in range(min(20, len(customer_ids))):
    # Exclude the customer by index, not by rank: another customer with an
    # identical feature vector can tie at the top score, in which case
    # slicing off the first-ranked entry would drop that real match and
    # keep the customer as their own lookalike.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[i])
        if other_idx != i
    ]
    # Stable descending sort: ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_dict[customer_ids[i]] = [
        (customer_ids[other_idx], round(score, 4))
        for other_idx, score in candidates[:3]
    ]

In [21]:
# Flatten each customer's (lookalike, score) pairs into a single row:
# [customer_id, lookalike_1, score_1, lookalike_2, score_2, ...]
lookalike_data = [
    [customer_id, *(value for pair in lookalikes for value in pair)]
    for customer_id, lookalikes in lookalike_dict.items()
]

In [22]:
# Tabulate the flattened rows: one customer per row, three lookalike/score pairs
lookalike_columns = [
    'CustomerID',
    'Lookalike1', 'Score1',
    'Lookalike2', 'Score2',
    'Lookalike3', 'Score3',
]
lookalike_df = pd.DataFrame(lookalike_data, columns=lookalike_columns)

In [29]:
# Persist the lookalike table to disk, without writing the row index
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

In [30]:
# Preview up to the first 20 rows of the in-memory lookalike table
lookalike_df.head(n=20)

Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0190,0.9978,C0191,0.9972,C0056,0.9963
1,C0002,C0031,0.9977,C0083,0.9878,C0029,0.9856
2,C0003,C0112,0.9994,C0097,0.9988,C0144,0.9983
3,C0004,C0101,0.999,C0068,0.9922,C0136,0.9918
4,C0005,C0123,0.9998,C0036,0.9982,C0078,0.9931
5,C0006,C0168,0.8991,C0026,0.8124,C0066,0.7889
6,C0007,C0120,0.9912,C0150,0.9795,C0130,0.9766
7,C0008,C0084,0.994,C0194,0.9659,C0017,0.9639
8,C0009,C0077,0.985,C0027,0.9832,C0166,0.9824
9,C0010,C0083,0.9955,C0032,0.9937,C0031,0.9871
