In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three raw CSV exports (transactions, products, customers)
transactions = pd.read_csv(r"C:\Users\psrid\Downloads\Transactions.csv")
products = pd.read_csv(r"C:\Users\psrid\Downloads\Products.csv")
customers = pd.read_csv(r"C:\Users\psrid\Downloads\Customers.csv")

# Left-join product attributes, then customer attributes, onto every
# transaction row so that no transaction is dropped by a missing match.
data = (
    transactions
    .merge(products, on='ProductID', how='left')
    .merge(customers, on='CustomerID', how='left')
)

# Feature engineering: build one profile row per customer.
# Categories are joined with '|' rather than a space so that a multi-word
# category name (something like "Home Decor") is not split into separate
# bogus indicator columns when it is expanded below. Missing categories
# (possible after the left merge with products) are skipped instead of
# making str.join raise a TypeError.
# NOTE(review): assumes no category name itself contains '|' — confirm.
customer_profiles = data.groupby('CustomerID').agg({
    'Category': lambda x: '|'.join(x.dropna().astype(str)),  # Distinct categories are recovered below
    'TotalValue': 'sum',  # Total spending
    'Region': 'first'  # Customer region
}).reset_index()

# Encode categorical features (Category, Region) for similarity calculation.
# One 0/1 indicator column per category the customer has purchased from.
category_dummies = customer_profiles['Category'].str.get_dummies(sep='|')
# Keep an indicator for EVERY region (no drop_first): these features feed a
# cosine similarity, not a regression, and dropping a level would leave the
# baseline region with an all-zero region vector, i.e. indistinguishable from
# "region unknown" and never contributing to a match.
region_dummies = pd.get_dummies(customer_profiles['Region'], prefix='Region', dtype=float)

# Combine encoded features (rows stay aligned: all three frames share
# customer_profiles' index)
encoded_profiles = pd.concat([customer_profiles[['CustomerID', 'TotalValue']],
                              category_dummies,
                              region_dummies], axis=1)

# Scale numerical features
scaler = StandardScaler()
encoded_profiles[['TotalValue']] = scaler.fit_transform(encoded_profiles[['TotalValue']])

# Feature matrix: everything except the identifier, forced to a single float
# dtype so the similarity step does not depend on implicit coercion of mixed
# int/float columns.
features = encoded_profiles.drop(columns=['CustomerID']).astype(float)

# Calculate similarity matrix (row/column order follows encoded_profiles)
similarity_matrix = cosine_similarity(features)

# Extract top 3 similar customers for each target customer.
# Customer IDs are pulled into a plain list once; looking them up through
# encoded_profiles.iloc[i] inside the loop would build a full row Series for
# every (customer, other customer) pair.
customer_ids = encoded_profiles['CustomerID'].tolist()
lookalike_data = {}
for idx, customer_id in enumerate(customer_ids):
    row_scores = similarity_matrix[idx]
    # Descending by score; the stable sort keeps ties in ascending row order,
    # matching what a stable sorted(..., reverse=True) would produce.
    ranked = (-row_scores).argsort(kind='stable')

    # Top 3 similar customers (excluding self); scores stored as plain floats
    nearest = ranked[ranked != idx][:3]
    top_3 = [(customer_ids[i], float(row_scores[i])) for i in nearest]
    lookalike_data[customer_id] = top_3

# Filter for the first 20 customers listed in the customers file.
# NOTE(review): presumably these are C0001 to C0020 — that only holds if
# Customers.csv is sorted by CustomerID; confirm.
# The target IDs are collected once instead of re-slicing for every entry.
target_ids = set(customers['CustomerID'].iloc[:20])
filtered_data = {k: v for k, v in lookalike_data.items() if k in target_ids}

# Create the output format for Lookalike.csv: Map<cust_id, List<cust_id, score>>
# Each score is converted to a built-in float before rounding: str() of a list
# uses repr() of its items, and NumPy >= 2 renders NumPy scalars as
# "np.float64(0.1234)", which would otherwise end up verbatim in the CSV.
lookalike_map = [{
    'cust_id': cust_id,
    'lookalikes': str([(other_id, round(float(score), 4)) for other_id, score in lookalikes])
} for cust_id, lookalikes in filtered_data.items()]

# Save the updated Lookalike.csv
lookalike_map_df = pd.DataFrame(lookalike_map)
lookalike_map_csv_path = r"C:\look\looklike.csv"
lookalike_map_df.to_csv(lookalike_map_csv_path, index=False)

print(f"Lookalike CSV saved at: {lookalike_map_csv_path}")


Lookalike CSV saved at: C:\look\looklike.csv
