In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Read the three raw input tables (customers, products, transactions) in one
# pass; the order of the paths matches the order of the names on the left.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/sample_data/Customers.csv",
        "/content/sample_data/Products.csv",
        "/content/sample_data/Transactions.csv",
    )
)

In [3]:
# Enrich every transaction row with its product attributes (joined on
# ProductID) and then with its customer attributes (joined on CustomerID).
# Both joins use pandas' default inner-join behaviour.
merged_data = (
    transactions
    .merge(products, on='ProductID')
    .merge(customers, on='CustomerID')
)

In [4]:
# Aggregate transaction data into one row per customer.
customer_profile = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',                  # Total spending
    'Quantity': 'sum',                    # Total quantity purchased
    # Readable list of purchased categories, kept for inspection only (it is
    # not used as a model feature below). Missing values are dropped so the
    # join cannot fail on NaN.
    'Category': lambda x: ','.join(x.dropna().astype(str)),
    'Region': 'first',                    # Customer region
}).reset_index()

# One-hot encode Region: each customer has exactly one region value here.
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(customer_profile[['Region']]).toarray()

# Multi-hot encode Category: one column per distinct product category, set to
# 1.0 when the customer bought at least one item from it. One-hot encoding the
# comma-joined string instead would create a separate column for every
# distinct purchase history, which behaves like a customer ID and carries no
# shared signal between customers.
category_presence = pd.crosstab(
    merged_data['CustomerID'], merged_data['Category']
) > 0
category_encoded = (
    category_presence
    # Align rows with customer_profile; customers with no recorded category
    # get an all-zero row.
    .reindex(customer_profile['CustomerID'], fill_value=False)
    .to_numpy(dtype=float)
)

# Combine numerical and encoded features. ignore_index=True relabels the
# columns 0..n-1 so the three pieces do not end up with duplicate labels.
numerical_features = customer_profile[['TotalValue', 'Quantity']].values
all_features = pd.concat([
    pd.DataFrame(numerical_features),
    pd.DataFrame(region_encoded),
    pd.DataFrame(category_encoded)
], axis=1, ignore_index=True)

In [5]:
# Standardise every feature column (StandardScaler defaults) so that columns
# measured on large scales do not dominate the similarity computation.
scaler = StandardScaler().fit(all_features)
scaled_features = scaler.transform(all_features)

# Pairwise cosine similarity between all customer feature vectors; entry
# [i, j] compares row i with row j of scaled_features.
similarity_matrix = cosine_similarity(scaled_features)

In [6]:
# Create Lookalike Map: customer ID -> list of (lookalike ID, similarity score)
TOP_N_LOOKALIKES = 3      # number of lookalikes kept per customer
N_CUSTOMERS_TO_EXPORT = 20  # how many customers go into Lookalike.csv

# Plain list so that row position i in similarity_matrix maps to an ID by
# position, independent of customer_profile's index labels.
customer_ids = customer_profile['CustomerID'].tolist()

lookalike_map = {}

for idx, customer_id in enumerate(customer_ids):
    # Pair every OTHER customer's row position with its similarity score.
    # The customer itself is removed by position rather than by assuming it
    # sorts first: with tied scores (e.g. duplicate profiles) the self-match
    # is not guaranteed to be at rank 0.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]

    # Highest similarity first; sorted() is stable, so ties keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Cast scores to built-in float so the text written to the CSV does not
    # depend on NumPy's scalar repr.
    lookalike_map[customer_id] = [
        (customer_ids[other_idx], float(score))
        for other_idx, score in candidates[:TOP_N_LOOKALIKES]
    ]

# Prepare Lookalike.csv for the first N_CUSTOMERS_TO_EXPORT customers
# (in customer_profile row order).
lookalike_subset = {
    cust_id: lookalike_map[cust_id]
    for cust_id in customer_ids[:N_CUSTOMERS_TO_EXPORT]
}

# Convert to DataFrame and save as CSV: one row per customer, with the
# lookalike list stored as a single cell.
lookalike_df = pd.DataFrame({
    "cust_id": list(lookalike_subset.keys()),
    "lookalikes": list(lookalike_subset.values())
})

lookalike_df.to_csv("Lookalike.csv", index=False)