# Task 2: Lookalike Model


In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [3]:
# Load datasets
# The three CSV files are read from the current working directory, in the
# order customers, products, transactions.
customers, products, transactions = [
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions - Transactions.csv")
]

In [4]:
# Inspect datasets
def inspect_data(df, name):
    """Print a plain-text overview of a DataFrame.

    The report contains, in order: a title line built from ``name``, a
    50-character dash rule, the frame's shape, its column names as a list,
    the first rows (``df.head()``), and the per-column null counts
    (``df.isnull().sum()``).

    Args:
        df: The pandas DataFrame to summarise.
        name: Label used in the title line of the report.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"\n{name} Dataset Overview")
    print("-" * 50)
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")

    # Both remaining sections share the same layout: a heading line followed
    # by the summary object and a trailing blank line.
    sections = (
        ("Sample Data:", df.head()),
        ("Missing Values:", df.isnull().sum()),
    )
    for heading, summary in sections:
        print(heading)
        print(summary, "\n")

# Print the same overview for each of the three loaded tables.
for frame, label in (
    (customers, "Customers"),
    (products, "Products"),
    (transactions, "Transactions"),
):
    inspect_data(frame, label)



Customers Dataset Overview
--------------------------------------------------
Shape: (200, 4)
Columns: ['CustomerID', 'CustomerName', 'Region', 'SignupDate']
Sample Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15 

Missing Values:
CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64 


Products Dataset Overview
--------------------------------------------------
Shape: (100, 4)
Columns: ['ProductID', 'ProductName', 'Category', 'Price']
Sample Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiv

In [5]:
# Merge datasets
# Each transaction row is enriched with its customer's and its product's
# attributes. Both merges are inner joins (the pandas default), so a
# transaction whose CustomerID or ProductID has no match is dropped.
#
# validate='many_to_one' makes pandas raise a MergeError if CustomerID or
# ProductID is duplicated in the lookup table; without it, a duplicate key
# would silently multiply transaction rows and inflate every downstream sum.
#
# Transactions and products both carry a Price column, so the merged frame
# exposes them as Price_x (from transactions) and Price_y (from products).
data = (
    transactions
    .merge(customers, on='CustomerID', validate='many_to_one')
    .merge(products, on='ProductID', validate='many_to_one')
)


In [6]:
# Check columns in the merged dataset
# Shows the column Index so the suffixed names produced by the merge are visible.
merged_columns = data.columns
print("Columns in merged data:", merged_columns)

Columns in merged data: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


## Feature engineering for lookalike model

In [7]:
# Aggregate customer-level transaction data
# One row per CustomerID. Named aggregation assigns the output column names
# directly:
#   TotalSpending - sum of TotalValue
#   TotalQuantity - sum of Quantity
#   AvgPrice      - mean of Price_x (the Price column on the transactions side)
customer_features = (
    data
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),
        TotalQuantity=('Quantity', 'sum'),
        AvgPrice=('Price_x', 'mean'),
    )
    .reset_index()
)


In [8]:
# One-hot encode region
# Produces one indicator column per distinct Region value (Region_<value>),
# keyed by CustomerID.
region_encoded = pd.get_dummies(customers[['CustomerID', 'Region']], columns=['Region'])

# This cell reassigns customer_features from itself, so re-running it would
# merge the region columns a second time and yield Region_*_x / Region_*_y
# duplicates. Dropping any region columns that are already present first keeps
# the cell idempotent; on a fresh run nothing is dropped (errors='ignore').
region_columns = region_encoded.columns.difference(['CustomerID'])
customer_features = (
    customer_features
    .drop(columns=region_columns, errors='ignore')
    .merge(region_encoded, on='CustomerID')
)


In [9]:
# Normalize features
# CustomerID is an identifier rather than a feature, so it is excluded before
# the remaining columns are standardized.
feature_columns = customer_features.drop(columns=['CustomerID'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_columns)


In [10]:
# Compute cosine similarity
# Pairwise similarity between every pair of rows of the scaled feature matrix,
# wrapped in a DataFrame labelled by CustomerID on both axes so that scores
# can be looked up by customer ID.
customer_ids = customer_features['CustomerID']
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)


In [11]:
# Generate lookalike recommendations for first 20 customers
# Maps each customer ID to a list of (similar_customer_id, similarity_score)
# tuples holding its three most similar customers, best match first.
lookalike_results = {}
for customer_id in customer_features['CustomerID'].head(20):
    similar_customers = (
        similarity_df[customer_id]
        # Remove the customer's own entry by label. Skipping "position 0" after
        # sorting is unreliable: when another customer ties with the
        # self-similarity score (identical features or float rounding), the
        # customer itself can land at position 1+ and show up in its own list.
        .drop(labels=customer_id)
        .sort_values(ascending=False)
        .head(3)
    )
    lookalike_results[customer_id] = list(similar_customers.items())



In [12]:
# Save lookalike results to CSV
# Flatten the {customer: [(similar_customer, score), ...]} mapping into one
# record per (customer, similar customer) pair.
lookalike_output = [
    {'CustomerID': cust_id, 'SimilarCustomerID': rec_cust, 'SimilarityScore': score}
    for cust_id, recommendations in lookalike_results.items()
    for rec_cust, score in recommendations
]
lookalike_df = pd.DataFrame(lookalike_output)

# index=False keeps the positional row index out of the written file.
lookalike_df.to_csv("Jyotika_Jayani_Lookalike.csv", index=False)

In [13]:
# Display sample recommendations
# Prints the heading, a blank line, then the first 20 rows of the results.
sample_rows = lookalike_df.head(20)
print("Sample Lookalike Recommendations:\n", sample_rows, sep="\n")

Sample Lookalike Recommendations:

   CustomerID SimilarCustomerID  SimilarityScore
0       C0001             C0137         0.992950
1       C0001             C0191         0.989996
2       C0001             C0011         0.983569
3       C0002             C0088         0.991240
4       C0002             C0142         0.990460
5       C0002             C0178         0.973887
6       C0003             C0190         0.986809
7       C0003             C0147         0.972504
8       C0003             C0174         0.961934
9       C0004             C0113         0.987222
10      C0004             C0165         0.967831
11      C0004             C0012         0.964326
12      C0005             C0140         0.993102
13      C0005             C0186         0.987795
14      C0005             C0123         0.981008
15      C0006             C0048         0.984436
16      C0006             C0184         0.971585
17      C0006             C0107         0.970204
18      C0007             C0146   