# Task 2: Lookalike Model

In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [11]:
# Load the three source tables from the working directory.
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Preview the first three rows of each table, separated by blank lines.
print(customers_df.head(n=3))
print("\n")
print(products_df.head(n=3))
print("\n")
print(transactions_df.head(n=3))


  CustomerID      CustomerName         Region  SignupDate
0      C0001  Lawrence Carroll  South America  2022-07-10
1      C0002    Elizabeth Lutz           Asia  2022-02-13
2      C0003    Michael Rivera  South America  2024-03-07


  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  


In [12]:
# Build one row per transaction, enriched with the buyer's profile and the
# purchased product's catalogue entry. Both joins use pandas' default inner
# join, so a transaction without a matching customer or product is dropped.
# Transactions and products both carry a `Price` column, so the second join
# suffixes them: `Price_x` (from transactions) and `Price_y` (from products).
merged_data = pd.merge(
    pd.merge(transactions_df, customers_df, on='CustomerID'),
    products_df,
    on='ProductID',
)

In [15]:
# Feature engineering: collapse the transaction-level table into one row of
# behavioural features per customer. Output column names match the source
# columns they are derived from.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),   # total spend
        Quantity=('Quantity', 'sum'),       # total quantity purchased
        Price_x=('Price_x', 'mean'),        # average price of purchased products
        Category=('Category', 'nunique'),   # number of distinct categories purchased
    )
    .reset_index()
)

In [16]:
# Attach customer profile attributes to the behavioural features:
#   - Region     : categorical
#   - SignupDays : signup date as whole days since the earliest signup
#                  found in customers_df
# NOTE: customers_df is updated in place (SignupDate becomes datetime and a
# SignupDays column is added).
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
earliest_signup = customers_df['SignupDate'].min()
customers_df['SignupDays'] = (customers_df['SignupDate'] - earliest_signup).dt.days

# Left join keeps every customer already present in customer_features.
profile_columns = ['CustomerID', 'Region', 'SignupDays']
customer_features = pd.merge(
    customer_features, customers_df[profile_columns], on='CustomerID', how='left'
)


In [17]:
# One-hot encode Region so it can take part in the similarity computation.
# Every region keeps its own indicator column (drop_first=False). Dropping a
# level is a device for avoiding collinearity in regression models; for a
# similarity measure it would encode the dropped region as all zeros, so two
# customers sharing that region would get no credit for the match while
# customers sharing any other region would.
# dtype=float keeps the feature matrix purely numeric instead of mixing bool
# indicator columns with the float features.
customer_features = pd.get_dummies(
    customer_features, columns=['Region'], drop_first=False, dtype=float
)

In [18]:
# Standardise the numerical features (StandardScaler: zero mean, unit
# variance per column) so they are on a comparable scale. The scaled values
# overwrite the original columns of customer_features.
numerical_columns = ['TotalValue', 'Quantity', 'Price_x', 'Category', 'SignupDays']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_columns])
customer_features[numerical_columns] = scaled_values

In [20]:
# Pairwise cosine similarity between customers' feature vectors.
# CustomerID is set aside and used to label both axes of the result, so
# similarity_df[a][b] is the similarity between customers a and b.
customer_ids = customer_features['CustomerID']
features = customer_features.drop('CustomerID', axis=1)
similarity_matrix = cosine_similarity(features)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [21]:
# Find the top 3 lookalikes for each of the first 20 customers.
# Result: {customer_id: {lookalike_id: similarity_score, ...}}.
# NOTE(review): customer_ids[:20] is positional. It corresponds to
# C0001-C0020 only if every one of those customers has a row in
# customer_features -- confirm against the data.
lookalike_data = {}
for cust_id in customer_ids[:20]:
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: on a tie (another customer with an identical feature vector, or
    # floating-point rounding on the diagonal) a positional [1:4] slice could
    # keep the customer as their own lookalike and drop a genuine match.
    # nlargest returns the scores in descending order, keeping the first
    # occurrence on ties.
    similar_customers = similarity_df[cust_id].drop(labels=cust_id).nlargest(3)
    lookalike_data[cust_id] = similar_customers.to_dict()

In [22]:
# Persist the lookalike map as CSV: one row per customer (index column
# 'cust_id'), with that customer's {lookalike_id: similarity} dict stored in
# a single 'Lookalikes' column.
lookalike_csv_path = 'Lookalike.csv'
lookalike_df = pd.DataFrame(
    [[matches] for matches in lookalike_data.values()],
    index=list(lookalike_data),
    columns=['Lookalikes'],
).rename_axis('cust_id')
lookalike_df.to_csv(lookalike_csv_path)

print(f"Lookalike data saved to: {lookalike_csv_path}")

Lookalike data saved to: Lookalike.csv
