In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

In [9]:

class CustomerLookalikeModel:
    """Rank customers by similarity of their profile and purchase behaviour.

    Features are built from three frames (customers, products, transactions),
    standardized, and compared pairwise with cosine similarity.

    Columns read from the inputs:
      * customers_df: ``CustomerID``, ``Region``, ``SignupDate``
      * products_df: ``ProductID``, ``Category``
      * transactions_df: ``CustomerID``, ``ProductID``, ``TransactionDate``,
        ``TotalValue``, ``Quantity``

    Side effects: the frames are stored by reference and ``preprocess_data``
    modifies them in place (date columns are overwritten with parsed values,
    rows with unparseable dates are dropped, and an ``account_age`` column is
    added to ``customers_df``).
    """

    def __init__(self, customers_df, products_df, transactions_df):
        """Store the input frames; no computation happens until preprocess_data()."""
        self.customers_df = customers_df
        self.products_df = products_df
        self.transactions_df = transactions_df
        # Scaled feature array, one row per customer (set by preprocess_data).
        self.feature_matrix = None
        # Square array of pairwise cosine similarities (set by preprocess_data).
        self.similarity_matrix = None
        # Maps CustomerID -> row position in the two matrices above.
        self.customer_indices = None

    def preprocess_data(self):
        """Build the scaled feature matrix and the pairwise similarity matrix.

        Returns:
            pandas.Series of ``CustomerID`` values, in the same row order as
            ``feature_matrix`` and ``similarity_matrix``.
        """
        # Parse dates; anything unparseable becomes NaT instead of raising.
        # NOTE(review): SignupDate uses dayfirst=True while TransactionDate uses
        # dayfirst=False. Because errors='coerce' is combined with the dropna
        # below, a format mismatch silently removes rows (and therefore
        # customers) -- confirm both settings against the actual CSV formats.
        self.customers_df['SignupDate'] = pd.to_datetime(
            self.customers_df['SignupDate'], errors='coerce', dayfirst=True)
        self.transactions_df['TransactionDate'] = pd.to_datetime(
            self.transactions_df['TransactionDate'], errors='coerce', dayfirst=False)

        # Drop rows whose date could not be parsed (in place, on the caller's frames).
        self.customers_df.dropna(subset=['SignupDate'], inplace=True)
        self.transactions_df.dropna(subset=['TransactionDate'], inplace=True)

        customer_features = self._create_customer_features()
        transaction_features = self._create_transaction_features()

        # Left merge keeps every remaining customer; customers without any
        # transactions get 0 for all transaction-derived columns.
        # NOTE(review): that 0 also lands in the epoch-second first/last
        # purchase columns -- verify this is the intended encoding.
        self.feature_matrix = pd.merge(
            customer_features, transaction_features, on='CustomerID', how='left'
        ).fillna(0)

        # Remember which matrix row belongs to which customer.
        self.customer_indices = {
            cust: idx for idx, cust in enumerate(self.feature_matrix['CustomerID'])
        }

        # The ID column is an identifier, not a feature: pull it out before scaling.
        customer_ids = self.feature_matrix['CustomerID']
        self.feature_matrix = self.feature_matrix.drop('CustomerID', axis=1)

        # Standardize each feature column so no single one dominates the similarity.
        scaler = StandardScaler()
        self.feature_matrix = scaler.fit_transform(self.feature_matrix)

        self.similarity_matrix = cosine_similarity(self.feature_matrix)
        return customer_ids

    def _create_customer_features(self):
        """Return a frame of CustomerID, account age in days and one-hot region columns.

        Also writes the ``account_age`` column onto ``self.customers_df``.
        """
        # One indicator column per region, named 'region_<value>'.
        region_features = pd.get_dummies(self.customers_df['Region'], prefix='region')

        # Account age is measured against the moment this method runs.
        current_date = pd.Timestamp.now()
        self.customers_df['account_age'] = (current_date - self.customers_df['SignupDate']).dt.days

        # All three pieces share the customers_df index, so a column-wise concat aligns them.
        customer_features = pd.concat([
            self.customers_df[['CustomerID']],
            self.customers_df[['account_age']],
            region_features
        ], axis=1)

        return customer_features

    def _create_transaction_features(self):
        """Return per-customer spend/quantity/timing metrics plus category spend shares."""
        # Inner merge attaches product attributes (notably Category) and drops
        # transactions whose ProductID is absent from products_df.
        enriched_transactions = pd.merge(self.transactions_df, self.products_df, on='ProductID', how='inner')

        customer_metrics = enriched_transactions.groupby('CustomerID').agg({
            'TotalValue': ['sum', 'mean', 'count'],
            'Quantity': ['sum', 'mean'],
            'TransactionDate': ['min', 'max']
        }).reset_index()

        # Flatten the two-level column index produced by the multi-function agg;
        # the order here must match the agg specification above.
        customer_metrics.columns = ['CustomerID', 'total_spend', 'avg_transaction_value', 'transaction_count',
                                     'total_quantity', 'avg_quantity', 'first_purchase', 'last_purchase']

        # Days between a customer's first and last purchase.
        customer_metrics['purchase_timespan'] = (customer_metrics['last_purchase'] -
                                                 customer_metrics['first_purchase']).dt.days

        # Turn the timestamps into integers so they can be scaled like other features.
        # NOTE(review): the // 10**9 yields epoch seconds only if the column is
        # nanosecond-resolution datetime -- confirm for the pandas version in use.
        customer_metrics['first_purchase'] = customer_metrics['first_purchase'].astype('int64') // 10**9
        customer_metrics['last_purchase'] = customer_metrics['last_purchase'].astype('int64') // 10**9

        # Total spend per (customer, category); combinations never seen become 0.
        category_pivot = pd.crosstab(
            enriched_transactions['CustomerID'],
            enriched_transactions['Category'],
            values=enriched_transactions['TotalValue'],
            aggfunc='sum'
        ).fillna(0)

        # Convert absolute spend into each category's share of the customer's row total.
        category_preferences = category_pivot.div(category_pivot.sum(axis=1), axis=0)

        transaction_features = pd.merge(customer_metrics, category_preferences, on='CustomerID', how='left')

        return transaction_features

    def get_lookalikes(self, customer_id, n_recommendations=3):
        """Return the customers most similar to ``customer_id``.

        Args:
            customer_id: ID to look up; must be a key of ``customer_indices``.
            n_recommendations: maximum number of lookalikes to return. Values
                of 0 or below yield an empty list.

        Returns:
            List of ``(customer_id, similarity_score)`` tuples ordered from
            most to least similar. The queried customer is never included.
            Returns ``[]`` when ``customer_id`` is unknown.
        """
        if customer_id not in self.customer_indices:
            return []

        customer_idx = self.customer_indices[customer_id]
        similarity_scores = self.similarity_matrix[customer_idx]

        # Rank all rows by descending similarity, then remove the query row by
        # position. Simply skipping the first entry is not safe: the query row
        # is not guaranteed to sort first (ties with an identical customer, or a
        # self-similarity of 0 for an all-zero feature vector).
        ranked = np.argsort(similarity_scores)[::-1]
        ranked = ranked[ranked != customer_idx][:max(n_recommendations, 0)]

        # Build the reverse lookup once per call instead of scanning per result.
        index_to_customer = {idx: cust for cust, idx in self.customer_indices.items()}

        return [(index_to_customer[int(idx)], similarity_scores[idx]) for idx in ranked]

In [12]:
def create_lookalike_recommendations(n_customers=20, n_recommendations=3,
                                     output_path='Lookalike.csv'):
    """Build lookalike recommendations and write them to a CSV file.

    Reads ``Customers.csv``, ``Products.csv`` and ``Transactions.csv`` from the
    current working directory, fits a ``CustomerLookalikeModel`` on them, and
    collects lookalikes for the first ``n_customers`` customers the model kept.

    Args:
        n_customers: how many customers (in model row order) to produce
            recommendations for.
        n_recommendations: number of lookalikes requested per customer.
        output_path: destination of the CSV output.

    Returns:
        pandas.DataFrame with columns ``CustomerID`` and ``Lookalikes``, where
        ``Lookalikes`` is a ``;``-separated list of ``<id>,<score>`` pairs with
        scores formatted to four decimals.
    """
    # Read data
    customers_df = pd.read_csv('Customers.csv')
    products_df = pd.read_csv('Products.csv')
    transactions_df = pd.read_csv('Transactions.csv')

    # Initialize model
    model = CustomerLookalikeModel(customers_df, products_df, transactions_df)

    # preprocess_data returns the IDs that actually made it into the model, in
    # matrix row order. Iterating those (rather than customers_df) avoids
    # depending on the model having filtered the caller's frame in place.
    customer_ids = model.preprocess_data()

    # A dict keeps one entry per customer ID, in first-seen order.
    recommendations = {
        cust_id: model.get_lookalikes(cust_id, n_recommendations)
        for cust_id in customer_ids.iloc[:n_customers]
    }

    # One output row per customer, lookalikes flattened into a single string.
    output_rows = [
        {
            'CustomerID': cust_id,
            'Lookalikes': ';'.join(f"{cust},{score:.4f}" for cust, score in lookalikes),
        }
        for cust_id, lookalikes in recommendations.items()
    ]

    # Pin the columns so an empty result still produces a CSV with a header.
    output_df = pd.DataFrame(output_rows, columns=['CustomerID', 'Lookalikes'])

    # Save to CSV
    output_df.to_csv(output_path, index=False)

    return output_df


In [11]:
# Run the full pipeline, then show the first rows under a heading.
recommendations_df = create_lookalike_recommendations()
print("\nSample of recommendations:", recommendations_df.head(), sep="\n")


Sample of recommendations:
  CustomerID                              Lookalikes
0      C0001  C0069,0.5724;C0163,0.4954;C0048,0.4790
1      C0003  C0163,0.6663;C0031,0.5740;C0087,0.4778
2      C0004  C0165,0.8785;C0104,0.7493;C0075,0.6884
3      C0006  C0187,0.6468;C0158,0.6429;C0048,0.5591
4      C0011  C0174,0.7040;C0107,0.6882;C0165,0.6047
