In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Folder that holds the three input CSV files. It is defined once so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = Path(r'C:\Users\ASUS\Desktop\PythonProjects\Assignment_Zeotap\Datasets')

# Load the raw customer, transaction and product tables.
customers_df = pd.read_csv(DATA_DIR / 'Customers.csv')
transactions_df = pd.read_csv(DATA_DIR / 'Transactions.csv')
products_df = pd.read_csv(DATA_DIR / 'Products.csv')

In [3]:
# Preview the first rows of the customers table to check its columns.
customers_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
# Preview the first rows of the products table to check its columns.
products_df.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [5]:
# Preview the first rows of the transactions table to check its columns.
transactions_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [6]:
# Enrich every transaction with the attributes of the product it refers to.
# A left join keeps all transactions even if a ProductID has no match.
# Both inputs carry a "Price" column, so pandas suffixes them as
# Price_x (from transactions) and Price_y (from products).
transactions = pd.merge(
    transactions_df,
    products_df,
    how="left",
    on="ProductID",
)

In [7]:
# Inspect the merged table; note the Price_x / Price_y pair created by the merge.
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [8]:
# Drop Price_y, the product-table copy of the price that the merge added.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run is a no-op rather than a KeyError.
transactions = transactions.drop(columns=['Price_y'], errors='ignore')

In [9]:
# Restore the plain "Price" name for the transaction-side price column.
price_column_names = {'Price_x': 'Price'}
transactions = transactions.rename(columns=price_column_names)

In [10]:
# Confirm the cleaned-up columns: a single Price column and no Price_y.
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics


In [11]:
def _most_frequent(values):
    """Return the most common non-null value of a Series, or None if there is none.

    Series.mode() returns its results sorted, so ties resolve to the first
    (smallest) value. The mode is computed once and reused.
    """
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


# Aggregate transaction data by customer: total quantity, total spend,
# average unit price and the most frequently purchased category.
customer_transactions = (
    transactions
    .groupby("CustomerID")
    .agg(
        {
            "Quantity": "sum",
            "TotalValue": "sum",
            "Price": "mean",
            "Category": _most_frequent,
        }
    )
    .reset_index()
)

In [12]:
# Build one profile row per customer by attaching the purchase aggregates.
# The left join keeps customers that have no transactions at all; their
# aggregate columns come out as NaN.
customer_profiles = pd.merge(
    customers_df,
    customer_transactions,
    how="left",
    on="CustomerID",
)

In [13]:
# Customers without transactions have NaN numeric aggregates after the left
# merge; treat them as zero activity. "Category" is not listed, so it stays NaN.
zero_defaults = {"Quantity": 0, "TotalValue": 0, "Price": 0}
customer_profiles = customer_profiles.fillna(value=zero_defaults)

In [14]:
# Convert categorical data into numerical format using one-hot encoding.
# Every level keeps its own indicator column (drop_first=False). Dropping the
# first level would encode that baseline category as all zeros, which is also
# how a missing Category (customer with no transactions) is encoded, so the
# two groups would look identical to the similarity computation.
customer_profiles = pd.get_dummies(
    customer_profiles,
    columns=["Region", "Category"],
    drop_first=False,
)

In [15]:
# Standardise the features so each column contributes on a comparable scale.
# Identifier and descriptive columns are excluded from the feature matrix.
non_feature_columns = ["CustomerID", "CustomerName", "SignupDate"]
feature_frame = customer_profiles.drop(columns=non_feature_columns)

scaler = StandardScaler()
customer_features = scaler.fit_transform(feature_frame)

In [16]:
# Calculate cosine similarity between every pair of rows of the scaled
# feature matrix (one row per customer profile).

similarity_matrix = cosine_similarity(customer_features)

In [17]:
# Label both axes of the similarity matrix with customer IDs so scores can be
# looked up by ID instead of by row position.
customer_ids = customer_profiles["CustomerID"]
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [18]:
# Generate Lookalike Recommendations

def get_top_lookalikes(customer_id, top_n=3, similarity=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Column label of the customer to look up in the similarity table.
    top_n : int, default 3
        Maximum number of lookalikes to return; values <= 0 give an empty list.
    similarity : pandas.DataFrame, optional
        Similarity table indexed and labelled by customer ID. Defaults to the
        notebook-level ``similarity_df``.

    Returns
    -------
    list of (customer_id, score) tuples, highest score first.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of the similarity table.
    """
    if similarity is None:
        similarity = similarity_df

    # Remove the customer's own entry by label. Skipping "the first row after
    # sorting" is wrong whenever another customer ties with the self-score,
    # because the customer itself may then not sort first.
    scores = similarity[customer_id].drop(labels=customer_id, errors="ignore")

    # A stable sort keeps tied scores in their original order, so the output
    # is deterministic.
    ranked = scores.sort_values(ascending=False, kind="stable")
    best = ranked.head(max(top_n, 0))
    return list(zip(best.index, best.values))

In [19]:
# Map each of the first 20 customers to its list of (lookalike, score) pairs.
first_twenty_ids = customers_df["CustomerID"].head(20)
lookalike_map = {
    source_id: get_top_lookalikes(source_id)
    for source_id in first_twenty_ids
}

In [20]:
# Flatten the lookalike map into [customer, lookalike, score] rows, one row
# per recommended lookalike, ready to be turned into a table.
lookalike_output = [
    [source_id, match_id, match_score]
    for source_id, matches in lookalike_map.items()
    for match_id, match_score in matches
]

In [21]:
# Tabulate the flattened rows under the column names used in the output file.
output_columns = ["CustomerID", "SimilarCustomerID", "SimilarityScore"]
lookalike_df = pd.DataFrame(lookalike_output, columns=output_columns)

In [22]:
# Write the lookalike table to a CSV file (relative path, so it lands in the
# current working directory); index=False leaves out the row index.
lookalike_df.to_csv("Harshita_bachhane_Lookalike.csv", index=False)