# Task 2: Lookalike Model

Importing necessary libraries

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from datetime import datetime


Loading the datasets and initial inspection

In [10]:
# Read the three source tables from CSV files in the working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Preview the first rows of every table; the resulting tuple is the cell's output
tuple(frame.head() for frame in (customers, products, transactions))

(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55    

# Creating Features for evaluating data

Data Merging and Preprocessing

In [11]:
# Parse the date columns into datetime64 values so they can be used for date-based analysis.
# Each column is overwritten in place on its own frame.
for frame, date_column in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [12]:
# Build one unified table: attach product attributes to every transaction, then
# customer attributes. Left joins keep every transaction row even without a match.
transactions = (
    transactions
    .merge(products, how='left', on='ProductID')
    .merge(customers, how='left', on='CustomerID')
)
# Column names shared by both sides of a merge get _x / _y suffixes;
# print the resulting columns to check for such overlaps
print(transactions.columns)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'ProductName', 'Category',
       'Price_y', 'CustomerName', 'Region', 'SignupDate'],
      dtype='object')


Fixing overlapping columns

In [13]:
# Resolve the duplicated price columns created by the merge:
# discard 'Price_y' and give 'Price_x' back its plain name 'Price'.
transactions = (
    transactions
    .drop(columns=['Price_y'])
    .rename(columns={'Price_x': 'Price'})
)

Feature Aggregation

In [14]:
# Step 1: Feature Aggregation
# One row per customer summarising their purchase behaviour.
aggregation_spec = {
    # Sum of TotalValue over all of the customer's transactions
    'total_spent': pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    # Number of transactions made by the customer
    'transaction_count': pd.NamedAgg(column='TransactionID', aggfunc='count'),
    # Mean Price across the customer's transactions
    'average_price': pd.NamedAgg(column='Price', aggfunc='mean'),
    # Number of distinct Category values (a missing category counts as its own value)
    'unique_categories': pd.NamedAgg(
        column='Category',
        aggfunc=lambda categories: categories.nunique(dropna=False),
    ),
}
customer_features = (
    transactions
    .groupby('CustomerID')
    .agg(**aggregation_spec)
    .reset_index()
)

# Attach the demographic attribute (Region) to each customer's behavioural features
region_lookup = customers[['CustomerID', 'Region']]
customer_features = customer_features.merge(region_lookup, how='left', on='CustomerID')


Step 2 : Encoding Categorical Data
The Region column is one-hot encoded using `pd.get_dummies`. This converts the categorical region information into numerical columns, making it compatible with the similarity computation.

Step 3 : Normalization
Customer behavior and demographic features vary significantly in scale (e.g., total spending vs. average price). To address this, the features are standardized using StandardScaler, which rescales each feature to zero mean and unit variance, ensuring that no single feature dominates the similarity computation.

Step 4 : Cosine Similarity
The cosine similarity metric is used to measure the similarity between customers. Why cosine similarity? It computes the cosine of the angle between two feature vectors, so it compares the direction of two customers' feature profiles rather than their overall magnitude. This makes it suitable for high-dimensional data where magnitudes (e.g., spending amounts) should not bias the similarity score.

In [15]:
# Step 2: Encoding Categorical Data
# Turn Region into numeric indicator columns so it can take part in the similarity computation.
# NOTE(review): drop_first=True removes the first region's indicator column, so that region
# is only represented implicitly (all remaining indicators are 0). That is a regression
# convention; confirm it is intended for a similarity model.
encoded_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

# Step 3: Normalization
# Standardize every feature column; CustomerID is an identifier, not a feature, so it is excluded.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(encoded_features.drop(columns=['CustomerID']))

# Step 4: Cosine Similarity
# Pairwise customer-to-customer similarity, labelled by CustomerID on both axes.
similarity_matrix = cosine_similarity(normalized_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

# Step 5: Find Top 3 Lookalikes for First 20 Customers
lookalike_results = {}
for customer_id in customer_features['CustomerID'][:20]:
    # Exclude the customer by label before ranking. Skipping the first position after
    # sorting is not reliable: a customer with an identical feature vector (or
    # floating-point rounding of the self-similarity) can put someone else first, which
    # would list the customer as their own lookalike.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .sort_values(ascending=False)
        .head(3)
    )
    # Cast to a built-in float so str() of the list renders plain numbers in the CSV
    # (NumPy >= 2 renders its scalars as "np.float64(...)").
    lookalike_results[customer_id] = [
        (similar_customer, round(float(score), 4)) for similar_customer, score in similar_customers.items()
    ]

# Step 6: Prepare Results for Output
# One row per customer; the lookalike list is stored as its string representation.
lookalike_df = pd.DataFrame([
    {"CustomerID": cust_id, "Lookalikes": str(lookalikes)}
    for cust_id, lookalikes in lookalike_results.items()
])
# Save to CSV
lookalike_df.to_csv('Ishan_Ayush_Lookalike.csv', index=False)
