In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [2]:
# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [3]:
# Display the first few rows of each dataset
print(customers.head())
print(products.head())
print(transactions.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [4]:
# Merge Transactions with Products to include product details
transactions_enriched = transactions.merge(products, on="ProductID", how="left")

In [5]:
# Aggregate customer profiles based on transaction history
customer_profiles = transactions_enriched.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    total_quantity=("Quantity", "sum"),
    avg_transaction_value=("TotalValue", "mean"),
    unique_products=("ProductID", "nunique"),
    categories=("Category", lambda x: ",".join(x.unique())), # List of unique categories purchased
).reset_index()

In [6]:
# Add customer-level data from Customers.csv
customer_profiles = customer_profiles.merge(customers, on="CustomerID", how="left")

In [7]:
# Display aggregated customer profiles
print(customer_profiles.head())

  CustomerID  total_spent  total_quantity  avg_transaction_value  \
0      C0001      3354.52              12                670.904   
1      C0002      1862.74              10                465.685   
2      C0003      2725.38              14                681.345   
3      C0004      5354.88              23                669.360   
4      C0005      2034.24               7                678.080   

   unique_products                       categories        CustomerName  \
0                5     Books,Home Decor,Electronics    Lawrence Carroll   
1                4              Home Decor,Clothing      Elizabeth Lutz   
2                4  Home Decor,Clothing,Electronics      Michael Rivera   
3                8     Books,Home Decor,Electronics  Kathleen Rodriguez   
4                3           Home Decor,Electronics         Laura Weber   

          Region  SignupDate  
0  South America  2022-07-10  
1           Asia  2022-02-13  
2  South America  2024-03-07  
3  South America

In [8]:
# Vectorize categorical data (categories and region)
vectorizer = CountVectorizer()
categories_matrix = vectorizer.fit_transform(customer_profiles["categories"])
region_dummies = pd.get_dummies(customer_profiles["Region"])


In [9]:
# Normalize numerical data
numerical_features = customer_profiles[["total_spent", "total_quantity", "avg_transaction_value", "unique_products"]]
scaler = StandardScaler()
numerical_matrix = scaler.fit_transform(numerical_features)

In [10]:
# Combine all feature sets into a single matrix
combined_features = np.hstack([numerical_matrix, categories_matrix.toarray(), region_dummies.values])

In [11]:
# Compute pairwise cosine similarity
similarity_matrix = cosine_similarity(combined_features)

In [12]:
def get_top_lookalikes(customer_index, similarity_matrix, customer_profiles, top_n=3):
    similarity_scores = similarity_matrix[customer_index]
    top_indices = np.argsort(similarity_scores)[::-1][1 : top_n + 1]  # Exclude itself
    return [
        (customer_profiles.iloc[idx]["CustomerID"], similarity_scores[idx])
        for idx in top_indices
    ]

In [13]:
# For the first 20 customers (C0001 to C0020)
top_20_customers = customer_profiles[customer_profiles["CustomerID"].str.startswith("C00")].head(20)
lookalike_map = {}

for i, cust_id in enumerate(top_20_customers["CustomerID"]):
    customer_index = customer_profiles[customer_profiles["CustomerID"] == cust_id].index[0]
    lookalike_map[cust_id] = get_top_lookalikes(customer_index, similarity_matrix, customer_profiles)

In [14]:
# Convert lookalike map to a DataFrame
lookalike_df = pd.DataFrame(
    [
        {"CustomerID": cust_id, "Lookalikes": str(lookalikes)}
        for cust_id, lookalikes in lookalike_map.items()
    ]
)


In [17]:
# Save the lookalike recommendations to CSV
output_path = "Lookalike.csv"
lookalike_df.to_csv(output_path, index=False)

# Print lookalike recommendations
for cust_id, lookalikes in lookalike_map.items():
    print(f"CustomerID: {cust_id}\nTop Lookalikes: {lookalikes}\n")

print("Lookalike recommendations saved to:", output_path)

CustomerID: C0001
Top Lookalikes: [('C0152', 0.9896746353771764), ('C0174', 0.9795584938611778), ('C0190', 0.8950243618291727)]

CustomerID: C0002
Top Lookalikes: [('C0043', 0.9136068474099296), ('C0159', 0.896457003401917), ('C0134', 0.870177731355959)]

CustomerID: C0003
Top Lookalikes: [('C0129', 0.9141007494240401), ('C0031', 0.9088227454860375), ('C0190', 0.8846862683578113)]

CustomerID: C0004
Top Lookalikes: [('C0012', 0.9412614116801767), ('C0113', 0.9334987333389898), ('C0102', 0.9239062544382644)]

CustomerID: C0005
Top Lookalikes: [('C0007', 0.9436984783502785), ('C0140', 0.9164265528395711), ('C0146', 0.890125019586603)]

CustomerID: C0006
Top Lookalikes: [('C0187', 0.9609403801708463), ('C0079', 0.8723190164012803), ('C0196', 0.8674274176742649)]

CustomerID: C0007
Top Lookalikes: [('C0140', 0.9740444747856722), ('C0005', 0.9436984783502785), ('C0110', 0.8561889046975272)]

CustomerID: C0008
Top Lookalikes: [('C0098', 0.9268507278627724), ('C0109', 0.9110230679403942), ('C