In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
# Read the three raw tables (customers, products, transactions) in that order
customers, products, transactions = map(
    pd.read_csv,
    ("Customers.csv", "Products.csv", "Transactions.csv"),
)

In [3]:
# Enrich every transaction with its product row, then with its customer row.
# Left joins keep all transactions even when a lookup key has no match.
merged_data = (
    transactions
    .merge(products, on='ProductID', how='left')
    .merge(customers, on='CustomerID', how='left')
)


In [4]:
# Behavioural summary per customer: total spend, number of transactions,
# and number of distinct products bought
customer_behavior = (
    merged_data
    .groupby('CustomerID')
    .agg(
        total_spending=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        purchase_frequency=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        unique_products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    )
    .reset_index()
)

# Profile columns taken straight from the customers table
customer_profile = customers.loc[:, ['CustomerID', 'Region', 'SignupDate']]

# Attach the profile to each customer's behavioural summary
customer_data = customer_behavior.merge(customer_profile, how='left', on='CustomerID')

In [5]:
# One-hot encode the categorical 'Region' feature (first level dropped as baseline)
customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)

# Standardise the numerical behaviour features so no single one dominates
# the similarity purely because of its unit/magnitude
numeric_features = ['total_spending', 'purchase_frequency', 'unique_products']
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(customer_data[numeric_features])

# Region indicator columns produced by get_dummies, cast to float so the final
# matrix has a single numeric dtype (get_dummies may return bool columns)
region_features = [col for col in customer_data.columns if col.startswith('Region_')]
region_dummies = customer_data[region_features].to_numpy(dtype=float)

# Combine the scaled numeric features with the encoded region features.
# Only the dummy columns are appended: stacking the whole frame would
# re-introduce the raw, unscaled numeric columns next to their scaled copies
# and let raw spending swamp the cosine similarity.
customer_data_scaled = np.hstack([scaled_numeric, region_dummies])

In [6]:
# Compute cosine similarity between all customers.
# Called with a single matrix, cosine_similarity compares every row of
# customer_data_scaled with every row of the same matrix, so cosine_sim[i, j]
# is the similarity between the rows at positions i and j.
cosine_sim = cosine_similarity(customer_data_scaled)

In [7]:
# Function to get top 3 lookalikes
def get_top_lookalikes(customer_id, num_recommendations=3):
    """Return the customers most similar to ``customer_id``.

    Reads the module-level ``customer_data`` frame (for the CustomerID of each
    row) and the module-level ``cosine_sim`` matrix (row/column positions are
    taken to match the row order of ``customer_data``).

    Args:
        customer_id: Value to look up in ``customer_data['CustomerID']``.
        num_recommendations: Maximum number of lookalikes to return.
            Values <= 0 yield an empty list.

    Returns:
        A list of ``(CustomerID, similarity_score)`` tuples sorted by
        descending score; ties keep the row order of ``customer_data``.
        The queried customer is never part of the result.

    Raises:
        IndexError: If ``customer_id`` is not present in ``customer_data``.
    """
    customer_ids = customer_data['CustomerID'].to_numpy()

    # Locate the customer by row POSITION, not index label, because the
    # similarity matrix is addressed positionally.
    positions = np.flatnonzero((customer_data['CustomerID'] == customer_id).to_numpy())
    if positions.size == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer_data")
    customer_pos = positions[0]

    if num_recommendations <= 0:
        return []

    similarity_scores = np.asarray(cosine_sim[customer_pos])

    # Remove the customer itself explicitly: relying on "self sorts first"
    # fails when another customer ties at the top score or when rounding makes
    # the self-similarity marginally smaller than a neighbour's.
    candidates = np.delete(np.arange(similarity_scores.shape[0]), customer_pos)

    # Stable sort on the negated scores gives descending order with ties kept
    # in original row order.
    ranking = np.argsort(-similarity_scores[candidates], kind='stable')
    top_positions = candidates[ranking[:num_recommendations]]

    # Return a list of tuples (CustomerID, Similarity Score)
    return [(customer_ids[pos], similarity_scores[pos]) for pos in top_positions]

# Look up the lookalikes for each of the first 20 rows of the customers table
lookalikes = {
    customer_id: get_top_lookalikes(customer_id)
    for customer_id in customers['CustomerID'][:20]
}

# Flatten the mapping into one row per (customer, lookalike, score) triple
lookalike_data = [
    [customer_id, match_id, score]
    for customer_id, matches in lookalikes.items()
    for match_id, score in matches
]

# Tabulate the rows under the output column names
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'Lookalike CustomerID', 'Similarity Score'],
)

# Write the result to CSV without the row index
lookalike_df.to_csv('Neel_Shah_Lookalike.csv', index=False)
