## Lookalike Model

In [23]:
# Import necessary Libraries
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity #used to measure similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [24]:
# Read the three source CSV files (customers, products, transactions) into DataFrames
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [25]:
# Attach the customer record, then the product record, to every transaction.
# Both joins are inner joins (the pandas default), keyed on the shared ID columns.
merged_data = (
    transactions
    .merge(customers, how="inner", on="CustomerID")
    .merge(products, how="inner", on="ProductID")
)

In [26]:
# Display the merged frame to inspect the joined columns.
# Price_x / Price_y are the pandas merge suffixes for a column name present on
# both sides of a join — presumably the Price column of Transactions.csv and of
# Products.csv (verify against the raw files).
merged_data

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,Jacob Holt,South America,2022-01-22,SoundWave Smartwatch,Electronics,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,Mrs. Kimberly Wright,North America,2024-04-07,SoundWave Smartwatch,Electronics,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,Tyler Haynes,North America,2024-09-21,SoundWave Smartwatch,Electronics,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Joshua Hamilton,Asia,2024-11-11,SoundWave Smartwatch,Electronics,459.86


In [27]:
# Parse the date columns into datetime values so date arithmetic works later
for date_column in ("TransactionDate", "SignupDate"):
    merged_data[date_column] = pd.to_datetime(merged_data[date_column])

In [28]:
# Feature engineering for the lookalike model: one row of aggregates per customer
customer_features = (
    merged_data
    .groupby("CustomerID", as_index=False)  # keep CustomerID as a regular column
    .agg(
        total_spent=("TotalValue", "sum"),             # summed TotalValue
        total_transactions=("TransactionID", "count"), # number of transactions
        unique_categories=("Category", "nunique"),     # distinct categories bought
        signup_date=("SignupDate", "first"),           # first signup date seen in the group
    )
)

# Account age in whole days, measured from the moment this cell is run
time_since_signup = pd.Timestamp.now() - customer_features["signup_date"]
customer_features["days_since_signup"] = time_since_signup.dt.days

In [29]:
# Preprocessing: standardise the numeric customer features with StandardScaler
# so that they share a common scale before similarities are computed.
numerical_columns = [
    "total_spent",
    "total_transactions",
    "unique_categories",
    "days_since_signup",
]
X = customer_features[numerical_columns]
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [30]:
# Calculate pairwise cosine similarity between all customers' scaled feature
# vectors. Entry [i][j] is the similarity between row i and row j of X_scaled,
# i.e. between the i-th and j-th customers in customer_features.
similarity_matrix = cosine_similarity(X_scaled)

In [31]:
# Generate lookalike recommendations: for each of the first 20 customers,
# record the 3 most similar other customers together with their scores.
lookalike_results = {}
customer_ids = customer_features["CustomerID"].tolist()

for row_idx, cust_id in enumerate(customer_ids[:20]):
    # Pair every other customer's position with its similarity to this customer
    # (the customer itself is excluded up front).
    candidates = [
        (col_idx, score)
        for col_idx, score in enumerate(similarity_matrix[row_idx])
        if col_idx != row_idx
    ]
    # Highest similarity first; the sort is stable, so ties keep positional order
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_results[cust_id] = [
        (customer_ids[col_idx], float(round(score, 3)))
        for col_idx, score in candidates[:3]
    ]
    

In [32]:
# Save lookalike recommendations to CSV: one row per customer, with the list of
# (lookalike customer ID, similarity score) tuples stored as its string form.
# The output path is defined once so the file that is written and the file name
# reported in the message can never disagree (previously the CSV was written to
# 'Nares_Kharub_Lookalike.csv' while the message named 'Naresh_Kharub_Lookslike.csv').
output_path = "Naresh_Kharub_Lookalike.csv"

lookalike_df = pd.DataFrame({
    "cust_id": list(lookalike_results.keys()),
    # Each value is already a list of (customer_id, score) tuples
    "lookalikes": [str(matches) for matches in lookalike_results.values()],
})
lookalike_df.to_csv(output_path, index=False)
print(f"Lookalike recommendations saved to '{output_path}'.")

Lookalike recommendations saved to 'Naresh_Kharub_Lookslike.csv'.
