In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Read the three raw tables from CSV files in the working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Attach customer and product attributes to every transaction row.
# Both joins are left joins, so a transaction is kept even when its
# CustomerID or ProductID has no match in the lookup table.
merged_data = (
    transactions
    .merge(customers, how="left", on="CustomerID")
    .merge(products, how="left", on="ProductID")
)

# Feature Engineering
# 1. Aggregate Transaction Data
def _most_common(values):
    """Return the first entry of ``values.mode()``, or None when it is empty."""
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


# One row per CustomerID: spend totals, the number of distinct transactions
# and the category that occurs most often in that customer's rows.
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
        avg_transaction_value=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
        total_transactions=pd.NamedAgg(column="TransactionID", aggfunc="nunique"),
        most_common_category=pd.NamedAgg(column="Category", aggfunc=_most_common),
    )
    .reset_index()
)

# 2. Merge Profile Data
# Bring each customer's Region and SignupDate onto the aggregated feature rows.
customer_profile = customers[["CustomerID", "Region", "SignupDate"]]
customer_features = customer_features.merge(customer_profile, on="CustomerID", how="left")

# Handle Missing 'SignupDate' (if any)
if customer_features["SignupDate"].isnull().any():
    print("Found missing values in 'SignupDate'. Filling them with a default date.")
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the in-place form is chained assignment, which is
    # deprecated in pandas 2.x and leaves the frame unchanged under
    # Copy-on-Write.
    customer_features["SignupDate"] = customer_features["SignupDate"].fillna("2000-01-01")

# Convert 'SignupDate' to datetime and calculate 'days_since_signup'
# (whole days elapsed between the signup date and the moment this runs).
customer_features["SignupDate"] = pd.to_datetime(customer_features["SignupDate"])
customer_features["days_since_signup"] = (pd.Timestamp.now() - customer_features["SignupDate"]).dt.days

# Drop 'SignupDate' after extracting useful information
customer_features = customer_features.drop(columns=["SignupDate"])

# One-hot encode the two categorical features.
# 'most_common_category' goes through scikit-learn's OneHotEncoder (every
# level kept); the encoded block is appended after the remaining columns.
category_column = ["most_common_category"]
encoder = OneHotEncoder()
encoded_categories = encoder.fit_transform(customer_features[category_column]).toarray()
encoded_category_df = pd.DataFrame(
    encoded_categories,
    columns=encoder.get_feature_names_out(category_column),
)
customer_features = pd.concat(
    [
        # Reset the index so rows line up positionally with the encoded frame.
        customer_features.drop(columns=category_column).reset_index(drop=True),
        encoded_category_df,
    ],
    axis=1,
)

# 'Region' is encoded with pandas dummies, dropping the first level.
# NOTE(review): Region drops one level while the category encoding above keeps
# all of them — confirm this asymmetry is intended for the similarity step.
customer_features = pd.get_dummies(customer_features, columns=["Region"], drop_first=True)

# Standardize the numeric features so each has zero mean and unit variance.
numerical_columns = [
    "total_spent",
    "avg_transaction_value",
    "total_transactions",
    "days_since_signup",
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_columns])
customer_features[numerical_columns] = scaled_values

# Pairwise cosine similarity between customers. CustomerID is kept aside as
# the row label and excluded from the feature matrix.
customer_ids = customer_features["CustomerID"]
features = customer_features.drop(columns="CustomerID")
similarity_matrix = cosine_similarity(features)

# Generate Lookalike Recommendations
# For every customer, keep the 3 other customers with the highest cosine
# similarity, as (CustomerID, score rounded to 4 decimals) pairs.
lookalikes = {}
# Plain list so positions from the similarity matrix are looked up by
# position, independent of the Series' index labels.
customer_id_list = customer_ids.tolist()
for idx, customer_id in enumerate(customer_id_list):
    # Exclude the customer itself by position. Slicing off the first sorted
    # entry is not reliable: when another customer ties at the top score, the
    # self entry is not guaranteed to sort first.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:3]
    lookalikes[customer_id] = [
        (customer_id_list[other_idx], round(score, 4)) for other_idx, score in top_matches
    ]

# Create Lookalike.csv
# One row per customer: its id followed by up to three (lookalike id, score)
# pairs flattened into columns.
lookalike_columns = ["CustomerID", "Lookalike1", "Score1", "Lookalike2", "Score2", "Lookalike3", "Score3"]
lookalike_data = []
for customer_id, similar_customers in lookalikes.items():
    row = [customer_id]
    for similar_customer, score in similar_customers:
        row.extend([similar_customer, score])
    # Pad rows that have fewer than three lookalikes so every row matches the
    # column list and the DataFrame can still be built.
    row.extend([None] * (len(lookalike_columns) - len(row)))
    lookalike_data.append(row)

lookalike_df = pd.DataFrame(lookalike_data, columns=lookalike_columns)

# Save the Lookalike.csv file
# Write the computed frame to disk; reading the file here instead would
# discard the results above and fail when no file exists yet.
lookalike_df.to_csv("Lookalike.csv", index=False)
print(lookalike_df.head(20))


   CustomerID Lookalike1  Score1 Lookalike2  Score2 Lookalike3  Score3
0       C0001      C0192  0.8521      C0184  0.8512      C0112  0.8428
1       C0002      C0134  0.9573      C0106  0.9243      C0029  0.9107
2       C0003      C0052  0.9864      C0031  0.9092      C0076  0.8411
3       C0004      C0165  0.9724      C0155  0.9463      C0173  0.8984
4       C0005      C0007  0.9097      C0112  0.8898      C0095  0.8242
5       C0006      C0187  0.9007      C0168  0.8717      C0171  0.8195
6       C0007      C0005  0.9097      C0120  0.8184      C0140  0.7979
7       C0008      C0065  0.7617      C0084  0.7602      C0098  0.7584
8       C0009      C0010  0.9205      C0062  0.8758      C0077  0.8681
9       C0010      C0062  0.9519      C0009  0.9205      C0198  0.9139
10      C0011      C0169  0.8969      C0153  0.8916      C0174  0.8702
11      C0012      C0195  0.9264      C0136  0.8578      C0013  0.7861
12      C0013      C0143  0.9146      C0022  0.8478      C0087  0.8404
13    