In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import datetime

# Read the three raw input tables (customers, products, transactions) from
# CSV files in the current working directory, in that order.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

def create_customer_features(customers_df, transactions_df, products_df):
    """Build one feature row per customer from profile and purchase data.

    Args:
        customers_df: Customer table; must contain ``CustomerID``,
            ``SignupDate`` and ``Region``. Not modified.
        transactions_df: Transaction table; must contain ``CustomerID``,
            ``ProductID``, ``TransactionID``, ``Quantity`` and ``TotalValue``.
            Not modified.
        products_df: Product table; must contain ``ProductID`` and
            ``Category``.

    Returns:
        DataFrame indexed by ``CustomerID`` holding the remaining customer
        columns, ``days_since_signup``, one-hot ``region_*`` flags,
        spend/quantity aggregates, each category's share of the customer's
        purchased quantity, and two per-day rates. Missing values (e.g. for
        customers without transactions) are filled with 0.
    """
    # Tenure is measured against the most recent signup in the data, not today.
    profile = customers_df.assign(SignupDate=pd.to_datetime(customers_df['SignupDate']))
    latest_signup = profile['SignupDate'].max()
    profile['days_since_signup'] = (latest_signup - profile['SignupDate']).dt.days

    # One-hot encode the region
    profile = pd.get_dummies(profile, columns=['Region'], prefix='region')

    # Per-customer spend and quantity statistics; std is NaN for customers
    # with a single transaction, so those gaps become 0.
    customer_stats = transactions_df.groupby('CustomerID').agg(
        total_spend=('TotalValue', 'sum'),
        avg_transaction_value=('TotalValue', 'mean'),
        std_transaction_value=('TotalValue', 'std'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
        std_quantity=('Quantity', 'std'),
        transaction_count=('TransactionID', 'count'),
    ).fillna(0)

    # Quantity bought per (customer, category)
    purchases = transactions_df.merge(products_df[['ProductID', 'Category']], on='ProductID')
    category_qty = pd.crosstab(
        index=purchases['CustomerID'],
        columns=purchases['Category'],
        values=purchases['Quantity'],
        aggfunc='sum',
    ).fillna(0)

    # Convert quantities to row-wise shares; a zero row total yields NaN -> 0
    category_share = category_qty.div(category_qty.sum(axis=1), axis=0).fillna(0)

    # Left-join everything onto the customer table and zero-fill the gaps
    feature_df = (
        profile.set_index('CustomerID')
        .join(customer_stats)
        .join(category_share)
        .fillna(0)
    )

    # Per-day rates; tenure is floored at one day to avoid dividing by zero
    tenure_days = feature_df['days_since_signup'].clip(lower=1)
    feature_df['avg_spend_per_day'] = feature_df['total_spend'] / tenure_days
    feature_df['transaction_frequency'] = feature_df['transaction_count'] / tenure_days

    return feature_df

def find_lookalikes(feature_df, customer_id, n_recommendations=3):
    """Return the customers most similar to ``customer_id``.

    Every column of ``feature_df`` except ``CustomerName`` and ``SignupDate``
    is standardized with ``StandardScaler`` and customers are compared by
    cosine similarity of the standardized rows.

    Args:
        feature_df: Feature matrix indexed by customer ID (one row per
            customer).
        customer_id: Index label of the customer to find lookalikes for.
        n_recommendations: Maximum number of lookalikes to return.

    Returns:
        List of dicts with keys ``'customer_id'`` and ``'similarity_score'``
        (a built-in ``float``), ordered from most to least similar. The query
        customer is never included; the list is shorter than
        ``n_recommendations`` when fewer other customers exist.

    Raises:
        KeyError: If ``customer_id`` is not in ``feature_df.index``.
    """
    # Select features for similarity calculation
    exclude_cols = ['CustomerName', 'SignupDate']
    feature_cols = [col for col in feature_df.columns if col not in exclude_cols]

    # Standardize features
    features_scaled = StandardScaler().fit_transform(feature_df[feature_cols])

    # Calculate similarity of the query row against every row
    customer_index = feature_df.index.get_loc(customer_id)
    similarity_scores = cosine_similarity([features_scaled[customer_index]], features_scaled)[0]

    # Rank from most to least similar, then drop the query customer by
    # position. Skipping "the first entry" is not safe: a tie at the top
    # score (or an all-zero scaled row, whose similarities are all 0) can put
    # a different customer first and leave the query in the results.
    ranked = np.argsort(similarity_scores)[::-1]
    ranked = ranked[ranked != customer_index][:max(n_recommendations, 0)]

    # Built-in floats keep the scores' repr stable across NumPy versions
    return [
        {
            'customer_id': feature_df.index[idx],
            'similarity_score': float(similarity_scores[idx]),
        }
        for idx in ranked
    ]

def generate_all_recommendations(customers_df, transactions_df, products_df, target_customers,
                                 n_recommendations=3):
    """Compute lookalike recommendations for a set of customers.

    Args:
        customers_df: Customer table passed to ``create_customer_features``.
        transactions_df: Transaction table passed to
            ``create_customer_features``.
        products_df: Product table passed to ``create_customer_features``.
        target_customers: Iterable of customer IDs to generate lookalikes for.
        n_recommendations: Number of lookalikes requested per customer
            (defaults to 3, the previous fixed behavior).

    Returns:
        Dict mapping each target customer ID to a list of
        ``(lookalike_customer_id, similarity_score)`` tuples, with the score
        rounded to 4 decimals and stored as a built-in ``float``.
    """
    # Create feature matrix once; it is shared by every lookup below
    feature_df = create_customer_features(customers_df, transactions_df, products_df)

    # float() strips the NumPy scalar type so that str()/repr() of the result
    # prints "0.7011" rather than "np.float64(0.7011)" on NumPy >= 2.
    return {
        customer_id: [
            (rec['customer_id'], round(float(rec['similarity_score']), 4))
            for rec in find_lookalikes(feature_df, customer_id, n_recommendations)
        ]
        for customer_id in target_customers
    }

# Generate recommendations for the first 20 customers (C0001 .. C0020)
target_customers = [f'C{i:04d}' for i in range(1, 21)]
recommendations = generate_all_recommendations(customers_df, transactions_df, products_df, target_customers)

# Create output DataFrame: one row per target customer, with the lookalike
# list stored as its string representation.
# Scores are cast to built-in float before str(): NumPy >= 2 renders float64
# scalars as "np.float64(...)", which would otherwise leak into the CSV.
output_df = pd.DataFrame(
    [
        (target_id, str([(match_id, float(score)) for match_id, score in matches]))
        for target_id, matches in recommendations.items()
    ],
    columns=['CustomerID', 'Recommendations'],
)
print("\nFirst few recommendations:")
display(output_df.head())

# Save results
output_df.to_csv('Jayaprakash_Dirisala_Lookalike.csv', index=False)


First few recommendations:


Unnamed: 0,CustomerID,Recommendations
0,C0001,"[('C0120', 0.7011), ('C0130', 0.6982), ('C0184..."
1,C0002,"[('C0134', 0.8141), ('C0106', 0.8083), ('C0159..."
2,C0003,"[('C0129', 0.7562), ('C0195', 0.7356), ('C0091..."
3,C0004,"[('C0113', 0.9017), ('C0102', 0.8505), ('C0104..."
4,C0005,"[('C0007', 0.7771), ('C0128', 0.6402), ('C0080..."
