# **Zeotap Assignment**
## **Task 2: Lookalike Model**

# Libraries 📚

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

# **Loading the Data 📽**

In [20]:
customers_df = pd.read_csv('/content/Customers (1).csv')
products_df = pd.read_csv('/content/Products (1).csv')

In [21]:
customers_df.head(5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [22]:
products_df.head(5)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


# **Feature Engineering**

In [3]:
# Calculate account age
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
customers_df['AccountAge'] = (datetime.now() - customers_df['SignupDate']).dt.days

In [23]:
# One-hot encode categorical variables like Region
customers_df = pd.get_dummies(customers_df, columns=['Region'], drop_first=True)
customers_df.head(5)

Unnamed: 0,CustomerID,CustomerName,SignupDate,Region_Europe,Region_North America,Region_South America
0,C0001,Lawrence Carroll,2022-07-10,False,False,True
1,C0002,Elizabeth Lutz,2022-02-13,False,False,False
2,C0003,Michael Rivera,2024-03-07,False,False,True
3,C0004,Kathleen Rodriguez,2022-10-09,False,False,True
4,C0005,Laura Weber,2022-08-15,False,False,False


In [24]:
# transactions - has columns: CustomerID, ProductID, and Quantity/PurchaseAmount
transactions_df = pd.read_csv('/content/Transactions.csv')
transactions_df.head(5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [25]:
# Merge transactions with product details
transactions_df = pd.merge(transactions_df, products_df[['ProductID', 'Category']], on='ProductID', how='left')
transactions_df.head(5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Electronics


In [26]:
# Pivot transactions to get product categories per customer
customer_products_df = transactions_df.pivot_table(index='CustomerID', columns='Category', values='Quantity', aggfunc='sum', fill_value=0)
customer_products_df.head(5)

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,2,0,7,3
C0002,0,4,0,6
C0003,0,4,4,6
C0004,8,0,6,9
C0005,0,0,4,3


In [27]:
# Normalize customer features (standardize)
scaler = StandardScaler()
customer_profile_features = customers_df.drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])  # Drop non-numeric columns
customer_profile_scaled = scaler.fit_transform(customer_profile_features)

In [28]:
customer_products_df = customer_products_df.reindex(customers_df['CustomerID']).fillna(0)
customer_products_df.head(5)

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,2.0,0.0,7.0,3.0
C0002,0.0,4.0,0.0,6.0
C0003,0.0,4.0,4.0,6.0
C0004,8.0,0.0,6.0,9.0
C0005,0.0,0.0,4.0,3.0


In [29]:
# Now, the number of rows in both matrices should match
final_customer_features = np.hstack([customer_profile_scaled, customer_products_df.values])
final_customer_features

array([[-0.57735027, -0.54653573,  1.54590766, ...,  0.        ,
         7.        ,  3.        ],
       [-0.57735027, -0.54653573, -0.64686916, ...,  4.        ,
         0.        ,  6.        ],
       [-0.57735027, -0.54653573,  1.54590766, ...,  4.        ,
         4.        ,  6.        ],
       ...,
       [ 1.73205081, -0.54653573, -0.64686916, ...,  2.        ,
         1.        ,  0.        ],
       [ 1.73205081, -0.54653573, -0.64686916, ...,  0.        ,
         3.        ,  6.        ],
       [-0.57735027, -0.54653573, -0.64686916, ...,  7.        ,
         1.        ,  4.        ]])

In [31]:
# Calculate similarity scores
similarity_matrix = cosine_similarity(final_customer_features)
similarity_matrix

array([[1.        , 0.30021202, 0.72137998, ..., 0.22256286, 0.66270936,
        0.36241284],
       [0.30021202, 1.        , 0.84115991, ..., 0.35885976, 0.70257251,
        0.79923331],
       [0.72137998, 0.84115991, 1.        , ..., 0.41390378, 0.78710356,
        0.72436565],
       ...,
       [0.22256286, 0.35885976, 0.41390378, ..., 1.        , 0.32595416,
        0.54697233],
       [0.66270936, 0.70257251, 0.78710356, ..., 0.32595416, 1.        ,
        0.42002779],
       [0.36241284, 0.79923331, 0.72436565, ..., 0.54697233, 0.42002779,
        1.        ]])

In [32]:
# Generate lookalikes for customers C0001 - C0020
lookalike_recommendations = {}

In [33]:
for customer_id in customers_df['CustomerID'][:20]:
    idx = customers_df[customers_df['CustomerID'] == customer_id].index[0]
    similarity_scores = similarity_matrix[idx]

    # Get top 3 similar customers (excluding the customer itself)
    similar_customers_idx = np.argsort(similarity_scores)[::-1][1:4]

    # Map customer_id to similar customer_id and score
    similar_customers = [(customers_df.iloc[i]['CustomerID'], similarity_scores[i]) for i in similar_customers_idx]
    lookalike_recommendations[customer_id] = similar_customers

In [34]:
# Save the recommendations to Lookalike.csv
lookalike_df = []
for customer_id, recommendations in lookalike_recommendations.items():
    for similar_customer, score in recommendations:
        lookalike_df.append([customer_id, similar_customer, score])

lookalike_df = pd.DataFrame(lookalike_df, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model completed. Check Lookalike.csv for results.")

Lookalike model completed. Check Lookalike.csv for results.
