In [57]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
customers = pd.read_csv("/kaggle/input/zeotap/Customers.csv")
products = pd.read_csv("/kaggle/input/zeotap/Products.csv")
transactions = pd.read_csv("/kaggle/input/zeotap/Transactions.csv")

# Merge datasets
data = pd.merge(transactions, products, on="ProductID", how="left")
data = pd.merge(data, customers, on="CustomerID", how="left")

# Aggregate customer data (profile and transaction history)
customer_profiles = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Price_x': 'mean',
    'Region': 'first',
    'Category': lambda x: ' '.join(x)
}).reset_index()

# Encode categorical data (Region and Category)
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region', 'Category'])

# Scale numerical features
scaler = StandardScaler()
numerical_features = ['TotalValue', 'Quantity', 'Price_x']
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])

# Compute similarity matrix
customer_ids = customer_profiles['CustomerID']
features = customer_profiles.drop(columns=['CustomerID'])
similarity_matrix = cosine_similarity(features)

# Generate lookalike recommendations for the first 20 customers
lookalike_results = {}
for idx, customer_id in enumerate(customer_ids[:20]):
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similar_customers = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:4]  # Top 3 similar customers
    lookalike_results[customer_id] = [(customer_ids[i], score) for i, score in similar_customers]

# Save results to Lookalike.csv
lookalike_df = pd.DataFrame([
    {'cust_id': key, 'lookalikes': value} for key, value in lookalike_results.items()
])
lookalike_df['lookalikes'] = lookalike_df['lookalikes'].apply(
    lambda x: [f"({cust_id}, {round(score, 2)})" for cust_id, score in x]
)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the first few recommendations
print(lookalike_df)


   cust_id                                     lookalikes
0    C0001   [(C0137, 0.51), (C0191, 0.51), (C0152, 0.5)]
1    C0002  [(C0043, 0.76), (C0097, 0.75), (C0142, 0.72)]
2    C0003  [(C0091, 0.69), (C0190, 0.65), (C0151, 0.64)]
3    C0004  [(C0165, 0.87), (C0113, 0.81), (C0087, 0.77)]
4    C0005  [(C0128, 0.76), (C0080, 0.75), (C0123, 0.75)]
5    C0006    [(C0168, 0.7), (C0187, 0.7), (C0048, 0.67)]
6    C0007  [(C0078, 0.79), (C0115, 0.76), (C0146, 0.74)]
7    C0008  [(C0109, 0.78), (C0068, 0.71), (C0018, 0.69)]
8    C0009   [(C0198, 0.8), (C0061, 0.77), (C0130, 0.72)]
9    C0010  [(C0121, 0.84), (C0060, 0.81), (C0111, 0.81)]
10   C0011  [(C0107, 0.53), (C0006, 0.53), (C0126, 0.52)]
11   C0012  [(C0087, 0.78), (C0102, 0.77), (C0004, 0.76)]
12   C0013  [(C0082, 0.84), (C0188, 0.83), (C0104, 0.83)]
13   C0014  [(C0060, 0.95), (C0097, 0.91), (C0151, 0.86)]
14   C0015  [(C0058, 0.88), (C0020, 0.82), (C0042, 0.78)]
15   C0016  [(C0050, 0.79), (C0185, 0.79), (C0042, 0.77)]
16   C0017  [(