In [3]:
# Lookalike Model Implementation Using Merged Dataset with Categorical Feature

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

def most_frequent_category(categories):
    """Return the most frequent value in *categories*, or None if there is none.

    ``Series.mode()`` skips missing values, so it is empty when every entry is
    missing; indexing it with ``[0]`` would raise in that case. When several
    values tie, the first mode returned by pandas is used.
    """
    modes = categories.mode()
    return None if modes.empty else modes.iloc[0]


def top_lookalikes(similarity_matrix, customer_ids, n_customers=20, top_n=3):
    """Map each of the first *n_customers* customers to their closest matches.

    Args:
        similarity_matrix: Square NumPy array where entry ``[i, j]`` is the
            similarity between customer ``i`` and customer ``j``.
        customer_ids: Customer identifiers, in the same order as the matrix rows.
        n_customers: Number of leading customers to produce lookalikes for;
            clipped to the number of customers available.
        top_n: Maximum number of lookalikes per customer.

    Returns:
        A dict ``{customer_id: [(lookalike_id, score), ...]}`` with the
        lookalikes ordered from most to least similar and scores as plain
        Python floats.
    """
    ids = list(customer_ids)
    lookalikes = {}
    for i in range(min(n_customers, len(ids))):
        row = similarity_matrix[i]
        # Stable sort on the negated scores: best match first, ties broken by
        # the lower row index so the result is deterministic.
        ranked = (-row).argsort(kind='stable')
        # Drop the customer's own row explicitly. Self-similarity is not
        # guaranteed to be the single largest value (rounding, or an identical
        # customer), so slicing off "the last element" is not reliable.
        ranked = ranked[ranked != i][:top_n]
        # float() keeps NumPy scalar reprs out of the saved CSV.
        lookalikes[ids[i]] = [(ids[j], float(row[j])) for j in ranked.tolist()]
    return lookalikes


# __name__ is "__main__" both for a script run and inside a notebook cell, so
# the pipeline still executes there and its variables remain global; importing
# the helpers above from elsewhere does not trigger any file I/O.
if __name__ == "__main__":
    # Step 1: Load Data
    merged_df = pd.read_csv('merged_ecommerce_data.csv')

    # Step 2: Data Preparation
    # Aggregate transaction data for each customer
    customer_features = merged_df.groupby('CustomerID').agg(
        TotalSpend=('TotalValue', 'sum'),
        TransactionCount=('TransactionID', 'count'),
        AverageTransactionValue=('TotalValue', 'mean'),
        MostFrequentCategory=('Category', most_frequent_category),
    ).reset_index()

    # Step 3: One-Hot Encode MostFrequentCategory
    # A customer without any known category gets an all-zero one-hot row.
    category_dummies = pd.get_dummies(customer_features['MostFrequentCategory'], prefix='Category')
    customer_features = pd.concat(
        [customer_features.drop('MostFrequentCategory', axis=1), category_dummies],
        axis=1,
    )

    # Step 4: Normalize Numerical Features
    # Only the numerical columns are scaled; the one-hot columns stay 0/1.
    numerical_columns = ['TotalSpend', 'TransactionCount', 'AverageTransactionValue']
    numerical_features = customer_features[numerical_columns]
    scaler = StandardScaler()
    scaled_numerical_features = scaler.fit_transform(numerical_features)

    # Combine scaled numerical features with one-hot encoded categorical features
    final_features = pd.DataFrame(scaled_numerical_features, columns=numerical_features.columns)
    final_features = pd.concat(
        [customer_features.drop(numerical_columns, axis=1), final_features],
        axis=1,
    )

    # Step 5: Calculate Similarity Scores
    similarity_matrix = cosine_similarity(final_features.drop('CustomerID', axis=1))

    # Step 6: Generate Recommendations for First 20 Customers
    # Rows follow the groupby output order, so these are the first 20 CustomerIDs.
    lookalike_map = top_lookalikes(
        similarity_matrix,
        customer_features['CustomerID'],
        n_customers=20,
        top_n=3,
    )

    # Step 7: Create DataFrame for Output
    # Each lookalike is stored as a [customer_id, score] pair.
    lookalike_df = pd.DataFrame(
        [(cust_id, [list(pair) for pair in lookalikes]) for cust_id, lookalikes in lookalike_map.items()],
        columns=['CustomerID', 'Lookalikes'],
    )

    # Save results to CSV
    lookalike_df.to_csv('Lookalike.csv', index=False)

    print("Lookalike model completed and results saved to Lookalike.csv.")


Lookalike model completed and results saved to Lookalike.csv.
