In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

In [12]:
def load_and_preprocess_data(customers_df, products_df, transactions_df):
    """Build a per-customer feature table from the raw customer, product and transaction frames.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Must contain ``CustomerID`` and ``Region`` columns.
    products_df : pandas.DataFrame
        Must contain ``ProductID`` and ``Category`` columns.
    transactions_df : pandas.DataFrame
        Must contain ``CustomerID``, ``ProductID``, ``Quantity`` and ``TotalValue`` columns.

    Returns
    -------
    pandas.DataFrame
        ``customers_df`` left-joined with:

        * ``total_spend``, ``avg_transaction_value``, ``total_quantity``,
          ``avg_quantity`` and ``unique_products`` aggregated per customer;
        * one column per product category holding that category's share of the
          customer's purchased quantity (each customer's shares are row-normalized);
        * one ``region_<value>`` indicator column per distinct ``Region`` value.

        Customers without transactions get 0 in every numeric column.
    """
    # Per-customer spend / quantity / product-variety metrics
    customer_metrics = transactions_df.groupby('CustomerID').agg({
        'TotalValue': ['sum', 'mean'],
        'Quantity': ['sum', 'mean'],
        'ProductID': 'nunique'
    }).reset_index()

    # Flatten the two-level column names produced by the multi-aggregation
    customer_metrics.columns = ['CustomerID', 'total_spend', 'avg_transaction_value',
                                'total_quantity', 'avg_quantity', 'unique_products']

    # Share of purchased quantity per product category, normalized per customer
    product_categories = transactions_df.merge(products_df[['ProductID', 'Category']], on='ProductID')
    category_preferences = pd.crosstab(product_categories['CustomerID'],
                                       product_categories['Category'],
                                       values=product_categories['Quantity'],
                                       aggfunc='sum',
                                       normalize='index').fillna(0)

    # Left joins keep every customer, including those with no transactions
    customer_features = customers_df.merge(customer_metrics, on='CustomerID', how='left')
    customer_features = customer_features.merge(category_preferences,
                                                left_on='CustomerID',
                                                right_index=True,
                                                how='left')

    # Zero-fill numeric columns only. A blanket fillna(0) would also overwrite missing
    # text values (e.g. a missing Region would become 0 and yield a bogus 'region_0' dummy).
    numeric_columns = customer_features.select_dtypes(include='number').columns
    customer_features = customer_features.fillna({column: 0 for column in numeric_columns})

    # One-hot encode Region; a missing Region yields all-zero indicators for that row
    region_dummies = pd.get_dummies(customer_features['Region'], prefix='region')
    customer_features = pd.concat([customer_features, region_dummies], axis=1)

    return customer_features


In [13]:
def calculate_similarity_scores(customer_features, target_customer_id):
    """Rank every other customer by cosine similarity to ``target_customer_id``.

    Parameters
    ----------
    customer_features : pandas.DataFrame
        Feature table containing ``CustomerID`` plus the spend, category and
        region columns listed in ``feature_columns`` below.
    target_customer_id
        Value to look up in the ``CustomerID`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID`` and ``similarity_score``, with the target customer
        removed, sorted by descending score.

    Raises
    ------
    ValueError
        If ``target_customer_id`` does not match exactly one row.
    KeyError
        If any of the expected feature columns is missing.
    """
    # Select features for similarity calculation
    feature_columns = ['total_spend', 'avg_transaction_value', 'total_quantity',
                       'avg_quantity', 'unique_products', 'Books', 'Clothing',
                       'Electronics', 'Home Decor', 'region_Asia', 'region_Europe',
                       'region_North America', 'region_South America']

    # Standardize so that large-magnitude columns (e.g. spend) do not dominate
    scaler = StandardScaler()
    features_normalized = scaler.fit_transform(customer_features[feature_columns])

    # Locate the target by *position*: the scaled array is positional, so an index
    # label is only correct when the frame happens to carry a default 0..n-1 index.
    is_target = (customer_features['CustomerID'] == target_customer_id).to_numpy()
    match_count = int(is_target.sum())
    if match_count != 1:
        raise ValueError(
            f"Expected exactly one row for CustomerID {target_customer_id!r}, "
            f"found {match_count}"
        )
    target_pos = int(is_target.argmax())

    # Only the target's row is needed; avoid building the full n x n matrix per call
    similarity_scores = cosine_similarity(
        features_normalized[target_pos:target_pos + 1], features_normalized
    )[0]

    # Pair each customer ID with its score (scores are positional, IDs keep their index)
    similarities = pd.DataFrame({
        'CustomerID': customer_features['CustomerID'],
        'similarity_score': similarity_scores
    })

    # Remove self-similarity and sort by score
    similarities = similarities[similarities['CustomerID'] != target_customer_id]
    similarities = similarities.sort_values('similarity_score', ascending=False)

    return similarities

In [14]:
def get_top_lookalikes(customers_df, products_df, transactions_df, target_customers, n_recommendations=3):
    """Find the most similar customers for each requested target customer.

    The feature table is built once from the three input frames and then reused
    for every target.

    Parameters
    ----------
    customers_df, products_df, transactions_df : pandas.DataFrame
        Raw inputs forwarded to ``load_and_preprocess_data``.
    target_customers : iterable
        Customer IDs to find lookalikes for.
    n_recommendations : int, default 3
        Maximum number of lookalikes kept per target.

    Returns
    -------
    dict
        Maps each target customer ID to a list of ``(customer_id, score)`` tuples,
        best match first, with the score rounded to 3 decimals.
    """
    feature_table = load_and_preprocess_data(customers_df, products_df, transactions_df)

    def _best_matches(target_id):
        # Ranked frame is already sorted best-first; keep only the leading rows
        ranked = calculate_similarity_scores(feature_table, target_id)
        pairs = []
        for _, match in ranked.head(n_recommendations).iterrows():
            pairs.append((match['CustomerID'], round(match['similarity_score'], 3)))
        return pairs

    return {target_id: _best_matches(target_id) for target_id in target_customers}

In [15]:
# Read data
customers_df = pd.read_csv('../data/Customers.csv')
products_df = pd.read_csv('../data/Products.csv')
# The transactions table must be loaded in this cell: get_top_lookalikes() below needs it,
# and relying on a variable left over in the kernel fails with NameError on a fresh run.
transactions_df = pd.read_csv('../data/Transactions.csv')

# Get target customers (C0001-C0020)
target_customers = [f'C{str(i).zfill(4)}' for i in range(1, 21)]

# Get recommendations
recommendations = get_top_lookalikes(customers_df, products_df, transactions_df, target_customers)

# Format results for CSV: one line per target, followed by its (id, score) pairs flattened
results = []
for customer_id, lookalikes in recommendations.items():
    lookalike_str = ','.join([f"{cid},{score}" for cid, score in lookalikes])
    results.append(f"{customer_id},{lookalike_str}")

# Write to CSV (header assumes the default of three lookalikes per customer)
with open('Lookalike.csv', 'w') as f:
    f.write("CustomerID,Lookalike1_ID,Score1,Lookalike2_ID,Score2,Lookalike3_ID,Score3\n")
    for result in results:
        f.write(f"{result}\n")