In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Read the three source tables; the CSV files are expected in the working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default inner join, so a transaction without a matching
# customer or product row is left out of `data`.
data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)


In [4]:
# Build one row per customer from the merged transaction table, then attach the
# customer's region from the profile table. Note that the two distinct-count
# columns keep their source names ('ProductID' and 'Category').
customer_features = (
    data.groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',         # total spending
        'Quantity': 'sum',           # total quantity purchased
        'ProductID': 'nunique',      # number of distinct products purchased
        'Category': 'nunique',       # number of distinct categories purchased
    })
    .reset_index()
    .merge(customers[['CustomerID', 'Region']], on='CustomerID')
)

# Turn Region into one indicator column per region value
encoder = OneHotEncoder()
region_indicators = encoder.fit_transform(customer_features[['Region']]).toarray()
region_encoded = pd.DataFrame(
    region_indicators,
    columns=encoder.get_feature_names_out(['Region']),
)

# Append the indicator columns side by side and retire the raw Region column.
# Both frames carry a default 0..n-1 index at this point, so rows line up one-to-one.
customer_features = pd.concat([customer_features, region_encoded], axis=1).drop(columns=['Region'])

# Standardize the aggregate columns so that no single one dominates by sheer magnitude
scaler = StandardScaler()
numerical_features = ['TotalValue', 'Quantity', 'ProductID', 'Category']
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])


In [5]:
# Pairwise cosine similarity over every feature column (the ID column is not a feature)
feature_matrix = customer_features.drop(columns=['CustomerID'])
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with the customer IDs so scores can be looked up by ID
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)


In [6]:
# Generate lookalike recommendations for the first 20 customers in customer_features.
# NOTE(review): the intent is customers C0001 to C0020; that holds only if the frame is
# ordered by CustomerID and none of those IDs is absent from it - confirm against the data.
TOP_N = 3  # number of lookalikes reported per customer
lookalike_results = {}

for customer in customer_features['CustomerID'][:20]:
    # Remove the customer's own entry by label. Assuming "self sorts first" is unsafe:
    # a customer with an identical feature vector, or floating-point rounding, can tie
    # with or outrank the self-similarity score, which would list the customer as their
    # own lookalike and silently drop a genuine match.
    customer_similarities = similarity_df[customer].drop(labels=customer)

    # Highest scores first; a stable sort keeps tied scores in a reproducible order
    top_similar_customers = customer_similarities.sort_values(ascending=False, kind='stable').iloc[:TOP_N]

    # Store (customer ID, score) pairs. Scores are converted to built-in floats so the
    # CSV shows plain numbers rather than a NumPy scalar repr such as "np.float64(...)".
    matches = [(other_id, float(score)) for other_id, score in top_similar_customers.items()]

    # Pad short rows (fewer than TOP_N other customers) so every row has TOP_N cells
    matches += [None] * (TOP_N - len(matches))
    lookalike_results[customer] = matches

# Create Lookalike.csv: one row per customer, one column per ranked lookalike
lookalike_df = pd.DataFrame.from_dict(
    lookalike_results,
    orient='index',
    columns=[f'Lookalike{rank}' for rank in range(1, TOP_N + 1)],
)
lookalike_df.to_csv('Lookalike.csv', index_label='CustomerID')
