In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the three input tables from the working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Purchase profile: one row per customer, one column per product category,
# each cell holding the number of non-null TransactionID values for that
# customer/category pair (0 where the pair never occurs).
product_categories = products[['ProductID', 'Category']]
categorised = transactions.merge(product_categories, on='ProductID')
category_counts = categorised.groupby(['CustomerID', 'Category'])['TransactionID'].count()
customer_categories = category_counts.unstack(fill_value=0).reset_index()

# Customer attributes: days between each signup and the latest signup in the
# table, plus one indicator column per Region value.
signup_dates = pd.to_datetime(customers['SignupDate'])
customers['SignupDate'] = signup_dates
customers['SignupRecency'] = (signup_dates.max() - signup_dates).dt.days
customers = pd.get_dummies(customers, columns=['Region'])

# Left merge keeps every customer; the NaNs it introduces for customers that
# have no row in the purchase profile are replaced with 0.
feature_df = customers.merge(customer_categories, on='CustomerID', how='left')
feature_df = feature_df.fillna(0)

# Model inputs: recency, region indicators, then the per-category counts
# (column 0 of customer_categories is CustomerID, so it is skipped).
region_cols = [col for col in customers.columns if col.startswith('Region_')]
category_cols = list(customer_categories.columns[1:])
feature_cols = ['SignupRecency'] + region_cols + category_cols

# Standardise each feature, then compare every customer with every other one.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_df[feature_cols])

# cosine_sim[i][j] is the similarity between rows i and j of feature_df.
cosine_sim = cosine_similarity(scaled_features)

# Generate recommendations
# For each target customer, keep the three most similar *other* customers as
# (CustomerID, score rounded to 4 decimals) pairs, best match first.
lookalike_mapping = {}
target_ids = [f'C{i:04d}' for i in range(1, 21)]

# Row i of feature_df corresponds to row/column i of cosine_sim, so work with
# row positions throughout. Build the CustomerID -> position lookup once; if
# an ID occurs more than once, the first occurrence wins.
all_customer_ids = feature_df['CustomerID'].tolist()
position_of = {}
for position, known_id in enumerate(all_customer_ids):
    position_of.setdefault(known_id, position)

for cust_id in target_ids:
    own_position = position_of.get(cust_id)
    if own_position is None:
        # Target customer is not present in the data: no recommendations.
        lookalike_mapping[cust_id] = []
        continue

    row_scores = cosine_sim[own_position]

    # Exclude the customer's own row explicitly. Dropping "whichever entry
    # ranks first" is not safe: a customer with an identical feature vector
    # (or a floating-point tie) can outrank the self-similarity, which would
    # discard a real match and report the customer as their own lookalike.
    candidates = [i for i in range(len(row_scores)) if i != own_position]

    # sorted() is stable even with reverse=True, so tied scores keep the
    # lower row position first.
    best = sorted(candidates, key=lambda i: row_scores[i], reverse=True)[:3]

    lookalike_mapping[cust_id] = [
        (all_customer_ids[i], round(row_scores[i], 4))
        for i in best
    ]

# Create output
# One entry per target customer, in target order, pairing the customer with
# their list of (lookalike CustomerID, score) tuples.
output = [
    {'CustomerID': cust_id, 'Lookalikes': lookalike_mapping.get(cust_id, [])}
    for cust_id in target_ids
]

# Save as Lookalike.csv
# Build one wide row per customer directly instead of emitting a sparse row
# per recommendation and collapsing them afterwards with groupby().first().
# A customer with fewer than three lookalikes (or none) keeps their row, with
# the unused Lookalike/Score cells left empty.
output_columns = [
    'CustomerID',
    'Lookalike1', 'Score1',
    'Lookalike2', 'Score2',
    'Lookalike3', 'Score3',
]

wide_rows = []
for entry in output:
    wide_row = {'CustomerID': entry['CustomerID']}
    # Only the first three recommendations have a column to land in.
    for rank, (lookalike_id, score) in enumerate(entry['Lookalikes'][:3], start=1):
        wide_row[f'Lookalike{rank}'] = lookalike_id
        wide_row[f'Score{rank}'] = score
    wide_rows.append(wide_row)

# Passing columns= fixes the column order and keeps the header even when
# there are no rows at all.
lookalike_df = pd.DataFrame(wide_rows, columns=output_columns)

# Write and display the same frame, computed once.
lookalike_df.to_csv('Jaswanth_Kumar_Malla_Lookalike.csv', index=False)
x = lookalike_df
print(x)

   CustomerID Lookalike1  Score1 Lookalike2  Score2 Lookalike3  Score3
0       C0001      C0112  0.8989      C0120  0.8961      C0096  0.8888
1       C0002      C0134  0.9696      C0159  0.9488      C0106  0.9444
2       C0003      C0031  0.9991      C0129  0.9286      C0158  0.9088
3       C0004      C0113  0.9107      C0104  0.8369      C0174  0.7818
4       C0005      C0007  0.9985      C0140  0.8908      C0186  0.7960
5       C0006      C0187  0.9067      C0137  0.8532      C0077  0.7956
6       C0007      C0005  0.9985      C0140  0.8719      C0045  0.7886
7       C0008      C0059  0.9372      C0098  0.7774      C0154  0.7729
8       C0009      C0061  0.8744      C0149  0.8596      C0062  0.8583
9       C0010      C0061  0.9590      C0062  0.8691      C0009  0.8363
10      C0011      C0153  0.9061      C0171  0.8941      C0174  0.8873
11      C0012      C0052  0.8851      C0076  0.8459      C0187  0.7793
12      C0013      C0107  0.8140      C0190  0.7415      C0048  0.7209
13    