In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import json

# Load the customer, transaction and product tables from the working directory
customers_df, transactions_df, products_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

def create_customer_profile(customers=None, transactions=None, products=None,
                            reference_date=None):
    """Build one feature row per customer from customer and transaction data.

    Args:
        customers: DataFrame with at least ``CustomerID`` and ``SignupDate``.
            Defaults to the module-level ``customers_df``.
        transactions: DataFrame with ``TransactionID``, ``CustomerID``,
            ``ProductID``, ``TransactionDate``, ``Quantity`` and
            ``TotalValue``. Defaults to the module-level ``transactions_df``.
        products: DataFrame with ``ProductID`` and ``Category``. Defaults to
            the module-level ``products_df``.
        reference_date: Date that ``account_age_days`` is measured against
            (anything ``pd.Timestamp`` accepts). Defaults to the current
            time, which makes the result depend on when it is run.

    Returns:
        A copy of ``customers`` with these columns appended:
        ``account_age_days``; per-customer transaction statistics; one
        column per product category holding that category's share of the
        customer's purchased quantity; and averages over the customer's five
        most recent transactions. Missing numeric features (e.g. customers
        without transactions) are 0. Non-numeric columns are left untouched.
        The input frames are not modified.
    """
    # Fall back to the notebook-level frames so existing zero-argument calls
    # keep working unchanged.
    customers = customers_df if customers is None else customers
    transactions = transactions_df if transactions is None else transactions
    products = products_df if products is None else products
    as_of = pd.Timestamp.now() if reference_date is None else pd.Timestamp(reference_date)

    # 1. Basic customer features from customer data
    customer_profile = customers.copy()
    customer_profile['SignupDate'] = pd.to_datetime(customer_profile['SignupDate'])
    customer_profile['account_age_days'] = (as_of - customer_profile['SignupDate']).dt.days

    # 2. Transaction-based features. Named aggregation ties every output
    #    column to its source explicitly instead of relying on column order.
    transaction_features = transactions.groupby('CustomerID').agg(
        transaction_count=('TransactionID', 'count'),
        total_spend=('TotalValue', 'sum'),
        avg_transaction_value=('TotalValue', 'mean'),
        std_transaction_value=('TotalValue', 'std'),
        total_items=('Quantity', 'sum'),
        avg_items_per_transaction=('Quantity', 'mean'),
        std_items=('Quantity', 'std'),
    ).reset_index()

    # 3. Product category preferences: share of purchased quantity per category
    trans_with_categories = transactions.merge(
        products[['ProductID', 'Category']],
        on='ProductID'
    )
    category_pivot = pd.pivot_table(
        trans_with_categories,
        index='CustomerID',
        columns='Category',
        values='Quantity',
        aggfunc='sum',
        fill_value=0
    )
    # A customer whose quantities sum to 0 produces 0/0 = NaN; treat as "no preference".
    category_pivot = category_pivot.div(category_pivot.sum(axis=1), axis=0).fillna(0)

    # 4. Recent behaviour, taken from ALL transactions: the category join
    #    above is an inner join and would drop purchases of products that are
    #    missing from the product table.
    dated_transactions = transactions.assign(
        TransactionDate=pd.to_datetime(transactions['TransactionDate'])
    )
    recent_transactions = (
        dated_transactions
        .sort_values('TransactionDate', kind='stable')
        .groupby('CustomerID')
        .tail(5)
    )
    recent_features = recent_transactions.groupby('CustomerID').agg(
        recent_avg_value=('TotalValue', 'mean'),
        recent_avg_quantity=('Quantity', 'mean'),
    ).reset_index()

    # Mask zero quantities so the division yields NaN (filled with 0 below)
    # rather than inf, which fillna would not catch.
    recent_quantity = recent_features['recent_avg_quantity']
    recent_features['recent_avg_price_per_item'] = (
        recent_features['recent_avg_value'] / recent_quantity.where(recent_quantity != 0)
    )

    # 5. Merge all features; left joins keep customers without transactions
    for features in (transaction_features, category_pivot.reset_index(), recent_features):
        customer_profile = customer_profile.merge(features, on='CustomerID', how='left')

    # Zero-fill only numeric feature columns so that missing names, regions
    # or dates are not overwritten with the integer 0.
    numeric_cols = customer_profile.select_dtypes(include='number').columns
    customer_profile[numeric_cols] = customer_profile[numeric_cols].fillna(0)

    return customer_profile

def find_lookalikes(customer_profile, target_customers, n_recommendations=3):
    """Find the most similar customers for each target using cosine similarity.

    Every numeric column of ``customer_profile`` (other than ``CustomerID``)
    is standardised with ``StandardScaler`` and the pairwise cosine
    similarity between all rows is computed on the result.

    Args:
        customer_profile: DataFrame with a ``CustomerID`` column and numeric
            feature columns, one row per customer. Any index is accepted.
        target_customers: Iterable of ``CustomerID`` values to produce
            recommendations for.
        n_recommendations: Maximum number of lookalikes per target.

    Returns:
        Dict mapping ``str(target_id)`` to a list of
        ``{'customer_id': str, 'similarity_score': float}`` dicts, ordered
        from most to least similar. The target itself is never included.

    Raises:
        IndexError: If a target ID does not occur in ``customer_profile``
            (same exception type as before, now with a descriptive message).
    """
    # Use every numeric dtype; a float64/int64 whitelist silently drops
    # columns stored as int32, float32, etc.
    feature_cols = customer_profile.select_dtypes(include='number').columns
    feature_cols = feature_cols.drop('CustomerID', errors='ignore')

    # Normalize features so that no single column dominates the similarity
    normalized_features = StandardScaler().fit_transform(customer_profile[feature_cols])

    # Rows/columns of this matrix follow the ROW ORDER of customer_profile
    similarity_matrix = cosine_similarity(normalized_features)

    customer_ids = customer_profile['CustomerID'].to_numpy()

    # Map each ID to its row position (first occurrence wins). Positions, not
    # index labels, are what address the similarity matrix, so this stays
    # correct when the frame carries a filtered or custom index.
    position_of = {}
    for position, customer_id in enumerate(customer_ids):
        position_of.setdefault(customer_id, position)

    lookalike_results = {}
    for target_id in target_customers:
        if target_id not in position_of:
            raise IndexError(
                f"CustomerID {target_id!r} not found in customer_profile"
            )
        target_pos = position_of[target_id]
        similarities = similarity_matrix[target_pos]

        # Rank by descending similarity and drop the target explicitly.
        # Skipping rank 0 is not safe: an identical profile can tie with the
        # target, and an all-zero feature vector has self-similarity 0.
        ranked = np.argsort(similarities)[::-1]
        similar_indices = ranked[ranked != target_pos][:n_recommendations]

        lookalike_results[str(target_id)] = [
            {
                'customer_id': str(customer_ids[idx]),
                'similarity_score': float(similarities[idx])
            }
            for idx in similar_indices
        ]

    return lookalike_results

# Create customer profiles
print("Creating customer profiles...")
customer_profile = create_customer_profile()

# Get first 20 customers
target_customers = customers_df['CustomerID'].iloc[:20].tolist()

# Generate lookalike recommendations
print("Generating lookalike recommendations...")
lookalike_results = find_lookalikes(customer_profile, target_customers)

# Save results to CSV: one row per (target, lookalike) pair. The file has a
# .csv name, so it must contain real CSV rather than a JSON dump, otherwise
# CSV readers cannot parse it.
print("Saving results...")
lookalike_columns = [
    'target_customer_id', 'rank', 'lookalike_customer_id', 'similarity_score'
]
lookalike_rows = [
    {
        'target_customer_id': target_id,
        'rank': rank,
        'lookalike_customer_id': rec['customer_id'],
        'similarity_score': rec['similarity_score'],
    }
    for target_id, recommendations in lookalike_results.items()
    for rank, rec in enumerate(recommendations, start=1)
]
pd.DataFrame(lookalike_rows, columns=lookalike_columns).to_csv(
    'FirstName_LastName_Lookalike.csv', index=False
)

# Print sample results
print("\nSample lookalike recommendations:")
for target_id in list(lookalike_results)[:3]:
    print(f"\nTarget Customer: {target_id}")
    for rec in lookalike_results[target_id]:
        print(f"Similar Customer: {rec['customer_id']}, Similarity Score: {rec['similarity_score']:.4f}")

Creating customer profiles...
Generating lookalike recommendations...
Saving results...

Sample lookalike recommendations:

Target Customer: C0001
Similar Customer: C0005, Similarity Score: 0.7979
Similar Customer: C0069, Similarity Score: 0.7179
Similar Customer: C0130, Similarity Score: 0.6831

Target Customer: C0002
Similar Customer: C0060, Similarity Score: 0.7916
Similar Customer: C0062, Similarity Score: 0.7798
Similar Customer: C0025, Similarity Score: 0.7519

Target Customer: C0003
Similar Customer: C0144, Similarity Score: 0.9027
Similar Customer: C0091, Similarity Score: 0.7028
Similar Customer: C0151, Similarity Score: 0.6788
