In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three raw tables from the working directory, parsing each table's
# date column into datetime values as part of the load.
customers_df = pd.read_csv('Customers.csv').assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate'])
)
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv').assign(
    TransactionDate=lambda frame: pd.to_datetime(frame['TransactionDate'])
)

# Build one wide table with a row per transaction: attach the customer record
# first, then the product record. Both joins are left joins, so a transaction
# row is kept even when its CustomerID or ProductID has no match.
merged_df = pd.merge(
    pd.merge(transactions_df, customers_df, on='CustomerID', how='left'),
    products_df,
    on='ProductID',
    how='left',
)

# Preprocessing: aggregate transaction data per customer (total spend, total
# quantity, mean unit price).
# NOTE(review): 'Price_x' is the name pandas' default merge suffixes give the
# left-hand 'Price' column when both merged tables carry one - presumably the
# transaction-side price; confirm against the upstream merge.
customer_transactions = merged_df.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Price_x': 'mean'
}).reset_index()

# Merge with customer demographic data. The left join keeps customers that
# have no transactions; their aggregate columns come back as NaN.
customer_profile = customers_df.merge(customer_transactions, on='CustomerID', how='left')

# Zero-fill numeric columns only. A blanket fillna(0) would also write 0 into
# missing text/date fields, e.g. turning a missing Region into a bogus
# category that get_dummies would then encode as if it were a real region.
numeric_columns = customer_profile.select_dtypes(include='number').columns
customer_profile[numeric_columns] = customer_profile[numeric_columns].fillna(0)

# Encode categorical data (Region) as indicator columns.
# NOTE(review): drop_first=True omits the first Region level, so that level is
# represented only by zeros in the remaining indicators; this makes the
# encoding asymmetric between regions for a similarity measure - confirm that
# is intended.
customer_profile = pd.get_dummies(customer_profile, columns=['Region'], drop_first=True)

# Standardise every remaining column; identifier, name and signup date are
# dropped so they do not enter the feature matrix.
scaler = StandardScaler()
features = customer_profile.drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])
scaled_features = scaler.fit_transform(features)

# Pairwise cosine similarity between customer feature rows: entry [i][j]
# scores row i of scaled_features against row j.
similarity_matrix = cosine_similarity(scaled_features)

# Row position -> CustomerID, in the same order as the similarity matrix rows.
customer_ids = customer_profile['CustomerID'].values

# For each target customer, keep the three best-scoring other customers as
# (CustomerID, score) pairs, highest score first.
# NOTE(review): the targets are simply the first 20 rows of customer_profile;
# they are C0001 - C0020 only if the customer table is ordered by CustomerID -
# TODO confirm.
lookalike_data = {}
for idx, cust_id in enumerate(customer_ids[:20]):
    similar_customers = [
        (other_id, score)
        for other_id, score in zip(customer_ids, similarity_matrix[idx])
        if other_id != cust_id  # never recommend a customer to themselves
    ]
    # Stable descending sort: equal scores keep their original row order.
    similar_customers.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_data[cust_id] = similar_customers[:3]

# Flatten lookalike_data into one row per customer for CSV export. Each
# (CustomerID, score) pair becomes a Lookalike_<rank> / Score_<rank> column
# pair, ranked from 1 in the order the pairs are stored. Columns are built
# from however many pairs a customer actually has, so a customer with fewer
# than three lookalikes yields empty cells instead of raising IndexError.
lookalike_list = []
for cust_id, sims in lookalike_data.items():
    row = {'CustomerID': cust_id}
    for rank, (lookalike_id, score) in enumerate(sims, start=1):
        row[f'Lookalike_{rank}'] = lookalike_id
        row[f'Score_{rank}'] = score
    lookalike_list.append(row)

lookalike_df = pd.DataFrame(lookalike_list)

# Save to CSV (no index column, so the file starts with CustomerID)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the first rows of the Lookalike data
print(lookalike_df.head())


  CustomerID Lookalike_1   Score_1 Lookalike_2   Score_2 Lookalike_3   Score_3
0      C0001       C0137  0.992641       C0191  0.989305       C0011  0.982790
1      C0002       C0088  0.984706       C0142  0.978391       C0043  0.962011
2      C0003       C0190  0.986369       C0147  0.971738       C0174  0.960391
3      C0004       C0113  0.987303       C0165  0.968836       C0012  0.964305
4      C0005       C0140  0.989781       C0186  0.977285       C0123  0.977277
