# In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Load the three input tables (customer profiles, product details and
# transaction history) from CSV files in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Data preprocessing
# Discard rows that lack a field the later steps depend on:
#   - customers are joined on CustomerID and one-hot encoded by Region;
#   - transactions are grouped by CustomerID, averaged over TotalValue and
#     joined to products on ProductID.
customers = customers.dropna(subset=["CustomerID", "Region"])
transactions = transactions.dropna(
    subset=["CustomerID", "TotalValue", "ProductID"]
)

# Step 1: Merge transactions with product details
# The left join keeps every transaction; a ProductID with no match in
# `products` leaves ProductName, Category and Price as NaN for that row.
transactions = pd.merge(
    transactions,
    products[["ProductID", "ProductName", "Category", "Price"]],
    on="ProductID",
    how="left",
)


def _dominant_category(categories):
    """Return the most frequent non-null category, or NaN if there is none.

    ``value_counts()`` skips NaN, so a customer whose transactions all have a
    missing Category produces an empty count table, on which ``idxmax()``
    raises ``ValueError``. Returning NaN keeps the aggregation running and
    leaves the gap to the script's missing-value handling.
    """
    counts = categories.value_counts()
    if counts.empty:
        return np.nan
    return counts.idxmax()


# Aggregate transaction data per customer (result is indexed by CustomerID):
#   AvgTransactionValue     - mean TotalValue over the customer's transactions
#   PurchaseFrequency       - number of non-null TransactionIDs
#   DominantProductCategory - most frequent product category
#   DistinctProductCount    - number of distinct products purchased
transaction_features = transactions.groupby("CustomerID").agg(
    AvgTransactionValue=("TotalValue", "mean"),
    PurchaseFrequency=("TransactionID", "count"),
    DominantProductCategory=("Category", _dominant_category),
    DistinctProductCount=("ProductID", "nunique"),
)

# Attach the per-customer transaction features to the customer profiles.
# The left join keeps customers that have no aggregated transactions; their
# feature columns come out as NaN and are filled in just below.
data = customers.merge(transaction_features, how="left", on="CustomerID")

# Defaults for customers without transaction features: the mean of the known
# average transaction values, zero purchases, zero distinct products and an
# explicit "Unknown" category.
missing_value_defaults = {
    "AvgTransactionValue": data["AvgTransactionValue"].mean(),
    "PurchaseFrequency": 0,
    "DominantProductCategory": "Unknown",
    "DistinctProductCount": 0,
}
data = data.fillna(value=missing_value_defaults)

# One-hot encode the categorical features (region, dominant product category).
# NOTE(review): drop_first=True omits the first level of each column, so a
# customer in that level has all-zero indicators for it - confirm this is
# intended for the similarity computation.
data = pd.get_dummies(
    data,
    columns=["Region", "DominantProductCategory"],
    drop_first=True,
)

# Rescale the numerical features with MinMaxScaler (default range [0, 1]) so
# they are on a comparable scale to the 0/1 indicator columns.
numerical_features = ["AvgTransactionValue", "PurchaseFrequency", "DistinctProductCount"]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data[numerical_features])
data[numerical_features] = scaled_values

# Compute similarity
# Step 2: Assemble the feature set: the three scaled numerical features
# followed by every column whose name contains one of the one-hot prefixes,
# in the order the columns appear in `data`.
encoded_prefixes = ("Region_", "DominantProductCategory_")
feature_columns = ["AvgTransactionValue", "PurchaseFrequency", "DistinctProductCount"]
feature_columns.extend(
    column
    for column in data.columns
    if any(prefix in column for prefix in encoded_prefixes)
)

# Pairwise cosine similarity between all customers; entry [i, j] compares the
# customers at row positions i and j of `data`.
similarity_matrix = cosine_similarity(data[feature_columns])

# Step 3: Generate Lookalike Recommendations
def _top_lookalikes(similarity_row, own_position, customer_ids, top_n=3):
    """Return the ``top_n`` most similar other customers for one customer.

    Args:
        similarity_row: Similarity scores of one customer against every
            customer, indexed by row position.
        own_position: Row position of the customer itself; always excluded.
        customer_ids: Customer IDs indexed by the same row positions.
        top_n: Maximum number of lookalikes to return.

    Returns:
        A list of ``(customer_id, score)`` tuples ordered by descending
        score, with each score rounded to 3 decimals as a plain ``float``.
    """
    scores = np.asarray(similarity_row, dtype=float)
    # Stable sort: equal scores keep their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the customer by position rather than assuming it ranks first.
    # A tie with an earlier row, or an all-zero feature vector (whose
    # self-similarity is 0), would otherwise list the customer as its own
    # lookalike and drop a real neighbour.
    ranked = ranked[ranked != own_position][:top_n]
    # float() keeps str() of the tuples independent of NumPy's scalar repr.
    return [(customer_ids[pos], round(float(scores[pos]), 3)) for pos in ranked]


customer_ids = data["CustomerID"].tolist()

# Store top 3 lookalikes for each of the first 20 customers (or for all
# customers when there are fewer than 20).
lookalike_map = {
    customer_id: _top_lookalikes(similarity_matrix[position], position, customer_ids)
    for position, customer_id in enumerate(customer_ids[:20])
}

# Write the recommendations to Lookalike.csv: one row per customer, with the
# list of (lookalike ID, score) tuples stored as its string representation.
lookalike_df = pd.DataFrame(
    {
        "CustomerID": list(lookalike_map),
        "Lookalikes": list(map(str, lookalike_map.values())),
    }
)
lookalike_df.to_csv("Lookalike.csv", index=False)

# Print the full mapping for inspection in the notebook.
print(lookalike_map)


{'C0001': [('C0107', 1.0), ('C0190', 0.999), ('C0048', 0.999)], 'C0002': [('C0178', 0.991), ('C0159', 0.982), ('C0136', 0.978)], 'C0003': [('C0133', 1.0), ('C0052', 0.998), ('C0152', 0.997)], 'C0004': [('C0165', 0.989), ('C0169', 0.987), ('C0126', 0.987)], 'C0005': [('C0186', 0.998), ('C0007', 0.994), ('C0146', 0.994)], 'C0006': [('C0171', 0.996), ('C0187', 0.992), ('C0011', 0.979)], 'C0007': [('C0115', 0.995), ('C0005', 0.994), ('C0140', 0.993)], 'C0008': [('C0065', 0.991), ('C0024', 0.984), ('C0194', 0.983)], 'C0009': [('C0010', 0.994), ('C0062', 0.989), ('C0111', 0.986)], 'C0010': [('C0111', 0.997), ('C0103', 0.997), ('C0009', 0.994)], 'C0011': [('C0137', 0.999), ('C0153', 0.997), ('C0126', 0.997)], 'C0012': [('C0195', 0.998), ('C0181', 0.996), ('C0113', 0.995)], 'C0013': [('C0099', 0.998), ('C0108', 0.998), ('C0129', 0.958)], 'C0014': [('C0060', 0.991), ('C0166', 0.956), ('C0119', 0.939)], 'C0015': [('C0131', 0.993), ('C0125', 0.975), ('C0094', 0.951)], 'C0016': [('C0183', 1.0), ('