# Lookalike Model

In [15]:
#import libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#Load Data
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [3]:
#Merge the Data
merged = transactions.merge(customers, on="CustomerID", how="left")
merged = merged.merge(products, on="ProductID", how="left")
merged.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [4]:
# Feature Engineering
customer_features = merged.groupby("CustomerID").agg(
    total_spend=("Price_x", "sum"),
    avg_spend=("Price_x", "mean"),
    num_transactions=("TransactionID", "count")
).reset_index() # Aggregate transaction data for each customer

In [5]:
# Create one-hot encoding
product_categories = pd.get_dummies(merged[["CustomerID", "Category"]], columns=["Category"])
product_categories = product_categories.groupby("CustomerID").sum().reset_index()

In [6]:
# Merge
customer_profiles = customer_features.merge(product_categories, on="CustomerID", how="left")
customer_profiles = customer_profiles.merge(customers[["CustomerID", "Region"]], on="CustomerID", how="left")

In [7]:
# One-hot encode the 'Region' column
customer_profiles = pd.get_dummies(customer_profiles, columns=["Region"], drop_first=True)

In [8]:
# Normalize Features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profiles.drop(columns=["CustomerID"]))

In [9]:
# Compute Similarity
similarity_matrix = cosine_similarity(scaled_features)

In [10]:
# Function to Recommend Similar Customers
def recommend_similar(customers_df, similarity_matrix, target_customer_id, num_recommendations=3):
    # Get index of target customer
    target_idx = customers_df[customers_df["CustomerID"] == target_customer_id].index[0]
    
    # get similarity scores for the target customer
    similarity_scores = similarity_matrix[target_idx]
    
    # Rank customers by similarity score (excluding the target customer itself)
    similar_customers = sorted(
        [(idx, score) for idx, score in enumerate(similarity_scores) if idx != target_idx],
        key=lambda x: x[1],
        reverse=True
    )
    
        
    # Fetch top N similar customers
    recommendations = [
        (customers_df.iloc[idx]["CustomerID"], score) for idx, score in similar_customers[:num_recommendations]
    ]
    
    return recommendations



In [11]:
# Generate Lookalike Mapping for First 20 Customers
lookalike_map = {}

for target_customer_id in customers["CustomerID"][:20]:  # First 20 customers
    recommendations = recommend_similar(customer_profiles, similarity_matrix, target_customer_id, num_recommendations=3)
    lookalike_map[target_customer_id] = recommendations


In [12]:
# Convert Lookalike Map to DataFrame
lookalike_data = []
for cust_id, recs in lookalike_map.items():
    for rec_cust_id, score in recs:
        lookalike_data.append({"cust_id": cust_id, "rec_cust_id": rec_cust_id, "score": round(score, 4)})

lookalike_df = pd.DataFrame(lookalike_data)
display(lookalike_df)

Unnamed: 0,cust_id,rec_cust_id,score
0,C0001,C0148,0.8534
1,C0001,C0120,0.8461
2,C0001,C0190,0.7883
3,C0002,C0134,0.9366
4,C0002,C0159,0.897
5,C0002,C0106,0.8866
6,C0003,C0031,0.9766
7,C0003,C0025,0.8892
8,C0003,C0129,0.8591
9,C0004,C0113,0.9242


In [13]:
# Save Lookalike Data to CSV
lookalike_df.to_csv("Lookalike.csv", index=False)


In [14]:
# Print Summary
print("Lookalike mapping generated for first 20 customers and saved to 'Lookalike.csv'.")

Lookalike mapping generated for first 20 customers and saved to 'Lookalike.csv'.
