In [2]:
# Import the required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the three input tables from CSV files in the working directory.
transactions, customers, products = (
    pd.read_csv(csv_name)
    for csv_name in ('Transactions.csv', 'Customers.csv', 'Products.csv')
)

In [4]:
# Step 1: Merge the datasets
# Left joins keep every transaction row, even when the customer or product
# lookup finds no match (the joined columns are then NaN for that row).
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

# Step 2: Feature Engineering
# Parse the transaction timestamp, then derive the weekday name from it.
parsed_dates = pd.to_datetime(merged_data['TransactionDate'])
merged_data['TransactionDate'] = parsed_dates
merged_data['TransactionDay'] = parsed_dates.dt.day_name()

In [5]:
# Feature engineering: collapse the transaction-level table into one row per customer.
def _most_common(values, default='None'):
    """Return the most frequent non-null entry of ``values``.

    Args:
        values: pandas Series holding one customer's values for a column.
        default: Value returned when ``values`` has no non-null entries.

    Returns:
        The first entry of ``values.mode()``, or ``default`` when the mode
        is empty (i.e. every entry is missing).
    """
    modes = values.mode()  # computed once; missing values are dropped by default
    return modes.iloc[0] if not modes.empty else default


customer_features = merged_data.groupby('CustomerID').agg(
    # Spending behaviour
    total_spending=('TotalValue', 'sum'),
    avg_transaction_value=('TotalValue', 'mean'),
    max_transaction_value=('TotalValue', 'max'),
    # Volume and variety
    total_quantity=('Quantity', 'sum'),
    num_transactions=('TransactionID', 'count'),
    num_categories=('Category', 'nunique'),
    # Most frequent categorical values; fall back to 'None' when all are missing
    most_frequent_category=('Category', _most_common),
    most_frequent_product=('ProductName', _most_common),
    weekday_activity=('TransactionDay', _most_common),
    # Guard on the mode itself: a group is never empty, but its dates can all
    # be NaT, in which case the month mode is empty and indexing it would fail.
    month_with_max_purchases=('TransactionDate', lambda dates: _most_common(dates.dt.month)),
).reset_index()


In [6]:
# Preview the first rows of the per-customer feature table (values are still unscaled here).
customer_features.head()

Unnamed: 0,CustomerID,total_spending,avg_transaction_value,max_transaction_value,total_quantity,num_transactions,num_categories,most_frequent_category,most_frequent_product,weekday_activity,month_with_max_purchases
0,C0001,3354.52,670.904,1300.92,12,5,3,Electronics,ActiveWear Smartwatch,Tuesday,1
1,C0002,1862.74,465.685,770.74,10,4,2,Clothing,BookWorld Cookware Set,Tuesday,2
2,C0003,2725.38,681.345,1385.2,14,4,3,Home Decor,ActiveWear Cookware Set,Sunday,6
3,C0004,5354.88,669.36,1099.76,23,8,3,Books,ActiveWear Cookware Set,Monday,12
4,C0005,2034.24,678.08,861.18,7,3,2,Electronics,ActiveWear Cookware Set,Friday,3


In [7]:
# Only the numeric aggregates are standardized; the categorical columns
# (most frequent category/product, weekday, month) are left untouched.
numerical_features = [
    'total_spending',
    'avg_transaction_value',
    'max_transaction_value',
    'total_quantity',
    'num_transactions',
    'num_categories',
]

# NOTE: this writes the scaled values back into customer_features, replacing
# the raw aggregates; the similarity step reads them from there.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_features])
customer_features[numerical_features] = scaled_values


In [8]:
# Pairwise cosine similarity between customers, computed on the numeric
# feature columns only. Row/column order follows customer_features.
features_for_similarity = customer_features.loc[:, numerical_features]
similarity_matrix = cosine_similarity(features_for_similarity)

In [9]:
# Build {customer_id: [(lookalike_id, score), ...]} holding the 3 most
# similar other customers for each of the first 20 customers.
customer_ids = customer_features['CustomerID'].values
lookalike_map = {}

for row_idx, customer_id in enumerate(customer_ids[:20]):
    row_scores = similarity_matrix[row_idx]

    # Pair every customer with its score against the current one; the current
    # customer is excluded by position rather than by score.
    candidates = [
        (other_id, score)
        for col_idx, (other_id, score) in enumerate(zip(customer_ids, row_scores))
        if col_idx != row_idx
    ]

    # Stable descending sort: ties keep their original customer order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_map[customer_id] = candidates[:3]

In [10]:
# Step 4: Create DataFrame of recommendations
# Each customer's lookalike list is stored as its text representation.
# Scores are converted to built-in floats first: NumPy >= 2.0 renders its
# scalars as "np.float64(...)" in repr(), which would otherwise leak into
# the 'Lookalikes' strings (and the CSV written from them).
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_map),
    'Lookalikes': [
        str([(other_id, float(score)) for other_id, score in matches])
        for matches in lookalike_map.values()
    ],
})

In [11]:
# Step 5: Write the lookalike table to disk without the index column.
# Note: the confirmation message below mentions "Lookalike.csv", while the
# file actually written is the prefixed name held in lookalike_output_file.
lookalike_output_file = "Mohit_Kumar_Lookalike.csv"
lookalike_df.to_csv(lookalike_output_file, index=False)
print("Lookalike.csv created successfully!")

Lookalike.csv created successfully!
