# In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load customer and transaction data.
# NOTE(review): the columns used below ('CustomerID', 'Gender', 'Location',
# 'Age', 'Amount', 'ProductID') are assumed to exist in these files -- confirm
# against the actual CSV headers.
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Preprocess customer data: one-hot encode the categorical columns.
customers_encoded = pd.get_dummies(customers[['Gender', 'Location']])
# CustomerID must travel with the features: the merge below joins on it and
# raises KeyError if the column is absent from this frame.
customers_data = pd.concat(
    [customers[['CustomerID', 'Age']], customers_encoded],
    axis=1,
)

# Preprocess transaction data: one row per customer with total amount spent
# and the number of distinct products bought.
transaction_summary = transactions.groupby('CustomerID').agg({
    'Amount': 'sum',
    'ProductID': 'nunique',
}).reset_index()

# Merge customer data with transaction data.
# This is an inner join, so customers with no transactions are dropped and row
# positions in customer_profiles need not match row positions in customers.
customer_profiles = pd.merge(
    customers_data, transaction_summary, on='CustomerID', how='inner'
)

# Standardize the feature columns (CustomerID is an identifier, not a feature).
scaler = StandardScaler()
scaled_profiles = scaler.fit_transform(customer_profiles.drop('CustomerID', axis=1))

# Use PCA for dimensionality reduction (optional).
# PCA cannot extract more components than min(n_samples, n_features), so cap
# the requested 5 to keep small datasets from raising.
n_components = min(5, *scaled_profiles.shape)
pca = PCA(n_components=n_components)
reduced_profiles = pca.fit_transform(scaled_profiles)

# Calculate cosine similarity between customers.
# Row/column i of the matrix corresponds to row i of customer_profiles.
similarities = cosine_similarity(reduced_profiles)

# Map each CustomerID to its row position in the similarity matrix. IDs are
# taken from customer_profiles (not customers) because the matrix was built
# from the merged frame, whose rows may not line up with customers.
profile_ids = customer_profiles['CustomerID'].to_numpy()
position_of = {cust_id: pos for pos, cust_id in enumerate(profile_ids)}

# Get top 3 lookalikes for each of the first 20 customers.
# NOTE(review): assumes the first 20 rows of Customers.csv are C0001 - C0020.
top_lookalikes = {}
for cust_id in customers['CustomerID'].head(20):
    idx = position_of.get(cust_id)
    if idx is None:
        # No row in customer_profiles for this customer, so there is nothing
        # to compare against.
        continue
    similarities_idx = similarities[idx]
    # Exclude the customer itself, then rank the rest by similarity score.
    similar_customers = [
        (other, score)
        for other, score in enumerate(similarities_idx)
        if other != idx
    ]
    similar_customers.sort(key=lambda pair: pair[1], reverse=True)
    top_lookalikes[cust_id] = [
        {'CustomerID': profile_ids[other], 'Score': score}
        for other, score in similar_customers[:3]
    ]

# Create Lookalike.csv: flatten the per-customer lookalike lists into one row
# per (customer, lookalike) pair and write them out without the index column.
lookalike_data = [
    [source_id, match['CustomerID'], match['Score']]
    for source_id, matches in top_lookalikes.items()
    for match in matches
]

output_columns = ['CustomerID', 'LookalikeID', 'Score']
lookalike_df = pd.DataFrame(lookalike_data, columns=output_columns)
lookalike_df.to_csv('Lookalike.csv', index=False)
