In [7]:
# Task 2: Lookalike Model
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

def load_and_prepare_data(customers_path='Customers.csv'):
    """Load the customer table and build the sample transactions table.

    Args:
        customers_path: Location of the customers CSV, passed straight to
            ``pd.read_csv``. Defaults to ``'Customers.csv'`` in the current
            working directory, so existing zero-argument calls are unchanged.

    Returns:
        A ``(customers_df, transactions_df)`` tuple. ``customers_df`` is the
        CSV exactly as read. ``transactions_df`` contains the 12 hard-coded
        sample transactions defined below (all for product ``P067``);
        ``TransactionDate`` is left as day-first ``DD-MM-YYYY HH:MM`` strings
        and is not parsed here.
    """
    customers_df = pd.read_csv(customers_path)

    # Sample transactions are embedded in the code rather than read from disk.
    transactions_data = {
        'TransactionID': ['T00001', 'T00112', 'T00166', 'T00272', 'T00363', 'T00442', 'T00490',
                          'T00536', 'T00564', 'T00631', 'T00727', 'T00729'],
        'CustomerID': ['C0199', 'C0146', 'C0127', 'C0087', 'C0070', 'C0188', 'C0195',
                       'C0008', 'C0157', 'C0130', 'C0051', 'C0075'],
        'ProductID': ['P067'] * 12,
        'TransactionDate': ['25-08-2024 12:38', '27-05-2024 22:23', '25-04-2024 07:38',
                            '26-03-2024 22:55', '21-03-2024 15:10', '26-12-2024 14:40',
                            '24-11-2024 11:49', '22-09-2024 06:13', '07-12-2024 17:57',
                            '14-05-2024 23:14', '20-01-2024 04:52', '07-10-2024 06:38'],
        'Quantity': [1, 1, 1, 2, 3, 1, 3, 1, 3, 2, 3, 2],
        'TotalValue': [300.68, 300.68, 300.68, 601.36, 902.04, 300.68, 902.04,
                       300.68, 902.04, 601.36, 902.04, 601.36],
        'Price': [300.68] * 12,
    }
    transactions_df = pd.DataFrame(transactions_data)

    return customers_df, transactions_df

def create_customer_features(customers_df, transactions_df, reference_date='2024-01-27'):
    """Build one feature row per customer from profile and transaction data.

    The caller's ``customers_df`` is not modified; all derived columns are
    added to a copy.

    Args:
        customers_df: Customer table with at least ``CustomerID``,
            ``SignupDate`` and ``Region`` columns.
        transactions_df: Transaction table with at least ``CustomerID``,
            ``TransactionID``, ``Quantity`` and ``TotalValue`` columns.
        reference_date: Date that ``days_since_signup`` is measured from
            (anything ``pd.Timestamp`` accepts). Signups after this date
            produce negative values.
            NOTE(review): the default predates most of the sample
            transactions; confirm it is the intended reference point.

    Returns:
        DataFrame indexed by ``CustomerID`` containing the remaining customer
        columns, ``days_since_signup``, the five per-customer transaction
        aggregates, and one dummy column per ``Region`` value.
    """
    # Named aggregation ties each output column name to its source column and
    # function, so the names cannot drift out of order.
    transaction_features = transactions_df.groupby('CustomerID').agg(
        transaction_count=('TransactionID', 'count'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
        total_spend=('TotalValue', 'sum'),
        avg_spend=('TotalValue', 'mean'),
    ).fillna(0)

    # Work on a copy so the caller's frame keeps its original columns/dtypes.
    customers = customers_df.copy()
    customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
    customers['days_since_signup'] = (
        pd.Timestamp(reference_date) - customers['SignupDate']
    ).dt.days

    # Left join on the CustomerID index: customers without transactions get
    # NaN aggregates, which the fillna below turns into 0.
    customer_features = customers.set_index('CustomerID').join(transaction_features)

    # Applied to the whole frame, so a missing value in any column becomes 0.
    customer_features = customer_features.fillna(0)

    # One indicator column per region value.
    return pd.get_dummies(customer_features, columns=['Region'])

def calculate_similarity(customer_features):
    """Compute pairwise cosine similarity between customers.

    All numeric and boolean columns of ``customer_features`` are standardized
    with ``StandardScaler`` and compared with ``cosine_similarity``.

    Args:
        customer_features: DataFrame with one row per customer; its index
            labels both axes of the result.

    Returns:
        Square DataFrame whose rows and columns are both
        ``customer_features.index``; entry ``[a, b]`` is the cosine similarity
        of the standardized feature vectors of ``a`` and ``b``.
    """
    # Select every numeric column plus booleans. Filtering on exactly
    # 'float64'/'int64' drops indicator columns stored as bool or uint8
    # (what pd.get_dummies emits), so those features never reached the model.
    numerical_features = customer_features.select_dtypes(include=['number', 'bool']).columns

    # Cast to a single float dtype so mixed bool/int/float columns scale cleanly.
    feature_matrix = customer_features[numerical_features].astype('float64')

    # Standardize so large-magnitude columns do not dominate the similarity.
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(feature_matrix)

    similarity_matrix = cosine_similarity(scaled_features)

    # Label both axes with customer IDs for direct lookup.
    similarity_df = pd.DataFrame(
        similarity_matrix,
        index=customer_features.index,
        columns=customer_features.index
    )

    return similarity_df

def get_lookalikes(customer_id, similarity_df, n=3):
    """Get the top ``n`` most similar customers for a given customer ID.

    Args:
        customer_id: Column label in ``similarity_df`` to look up.
        similarity_df: Similarity table with customer IDs on both axes.
        n: Maximum number of lookalikes to return.

    Returns:
        List of ``(customer_id, score)`` tuples in descending score order.
        The queried customer is never included; fewer than ``n`` entries are
        returned when there are not enough other customers.

    Raises:
        KeyError: If ``customer_id`` is not a column of ``similarity_df``.
    """
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: a tie with an identical customer, or a self-similarity of 0 for
    # an all-zero feature vector, can put another row in position 0.
    scores = similarity_df[customer_id].drop(labels=customer_id, errors='ignore')

    # Stable sort keeps tied candidates in their original index order.
    top_matches = scores.sort_values(ascending=False, kind='stable').head(n)
    return list(top_matches.items())

def main():
    """Run the lookalike pipeline end to end and write the results CSV.

    Loads the data, builds customer features, computes the similarity table,
    then records the three closest lookalikes (ID and score) for each of the
    first 20 customers in ``Jaffer_Ali_Lookalike.csv``.
    """
    print("Loading data...")
    customers_df, transactions_df = load_and_prepare_data()

    print("Creating customer features...")
    features = create_customer_features(customers_df, transactions_df)

    print("Calculating customer similarities...")
    similarity_df = calculate_similarity(features)

    print("Generating lookalikes...")
    records = []
    for cust_id in customers_df['CustomerID'][:20]:
        matches = get_lookalikes(cust_id, similarity_df)
        record = {'customer_id': cust_id}
        # Ranks 1-3 become the lookalikeN_id / lookalikeN_score column pairs.
        for rank in (1, 2, 3):
            record[f'lookalike{rank}_id'] = matches[rank - 1][0]
            record[f'lookalike{rank}_score'] = matches[rank - 1][1]
        records.append(record)

    print("Saving results...")
    results_df = pd.DataFrame(records)
    results_df.to_csv('Jaffer_Ali_Lookalike.csv', index=False)
    print("Done! Results saved to Jaffer_Ali_Lookalike.csv")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Loading data...
Creating customer features...
Calculating customer similarities...
Generating lookalikes...
Saving results...
Done! Results saved to Jaffer_Ali_Lookalike.csv
