**Task 2: Lookalike Model**

Import libraries

In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


## Load datasets


In [2]:
# Read the three input CSV files (paths are relative to the working directory)
# into DataFrames, in the order customers -> products -> transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


# Merge transactions with customers, then merge the result with products

In [4]:
# Attach customer columns to each transaction, then product columns.
# Both joins use the default merge behaviour (inner join, '_x'/'_y' suffixes
# for column names that appear on both sides).
data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

In [6]:
# Give the two suffixed price columns produced by the merge descriptive names.
price_column_names = {
    'Price_x': 'TransactionPrice',
    'Price_y': 'ProductPrice',
}
data.rename(columns=price_column_names, inplace=True)

# Preview the first rows of the merged table.
print(data.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  TransactionPrice     CustomerName         Region  SignupDate  \
0      300.68            300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68            300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68            300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36            300.68  Travis Campbell  South America  2024-04-11   
4      902.04            300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  ProductPrice  
0  ComfortLiving Bluetooth Speaker

Aggregate transaction data into one profile row per customer

In [9]:
# Collapse the merged transaction table to one row per CustomerID.
customer_profiles = (
    data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),              # total money spent
        Quantity=('Quantity', 'sum'),                  # total items purchased
        TransactionPrice=('TransactionPrice', 'mean'), # mean of TransactionPrice over the customer's rows
        Region=('Region', 'first'),                    # first Region value seen for the customer
    )
    .reset_index()
)

print(customer_profiles.head())


  CustomerID  TotalValue  Quantity  TransactionPrice         Region
0      C0001     3354.52        12        278.334000  South America
1      C0002     1862.74        10        208.920000           Asia
2      C0003     2725.38        14        195.707500  South America
3      C0004     5354.88        23        240.636250  South America
4      C0005     2034.24         7        291.603333           Asia


Standardize numerical data for similarity calculation

In [10]:
# Only the three numeric profile columns feed the similarity computation;
# Region is not part of the feature matrix.
numeric_columns = ['TotalValue', 'Quantity', 'TransactionPrice']
features = customer_profiles[numeric_columns]

# Fit the scaler on the features, then apply it to the same data.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

# Preview the first five scaled rows.
print(features_scaled[:5])

[[-0.06170143 -0.12203296  0.09467022]
 [-0.87774353 -0.44800021 -0.90401592]
 [-0.40585722  0.20393428 -1.09410928]
 [ 1.03254704  1.67078689 -0.44770193]
 [-0.78392861 -0.93695108  0.28558127]]


Use cosine similarity to find relationships between customers

In [11]:
# Pairwise cosine similarity between every pair of scaled customer rows.
similarity_matrix = cosine_similarity(features_scaled)

# Label both axes with the customer IDs so scores can be looked up by ID.
customer_ids = customer_profiles['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)
print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.104513 -0.524923 -0.925208  0.909351  0.442395   
C0002       0.104513  1.000000  0.791531 -0.464035  0.506433 -0.844066   
C0003      -0.524923  0.791531  1.000000  0.172432 -0.124725 -0.994780   
C0004      -0.925208 -0.464035  0.172432  1.000000 -0.990272 -0.083333   
C0005       0.909351  0.506433 -0.124725 -0.990272  1.000000  0.029596   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.957854 -0.980620  0.885035 -0.268370  ...  0.953552  0.875392   
C0002      -0.126391 -0.208586  0.552510  0.929885  ...  0.366172  0.561020   
C0003      -0.694381  0.426063 -0.070251  0.960431  ... -0.270712 -0.056387   
C0004      -0.786871  0.960972 -0.985116 -0.108724  ... -0.969254 -0.975266   
C0005  

Top 3 most similar customers for each customer

In [12]:
def get_top_3_similar(customers_df, similarity_df, top_n=3):
    """Return the most similar *other* customers for every customer.

    For each label in ``similarity_df.index`` the column with the same label
    is read, the customer's own entry is removed by label, and the ``top_n``
    highest remaining scores are kept.

    Parameters
    ----------
    customers_df : object
        Not used by the computation; retained so existing calls keep working.
    similarity_df : pandas.DataFrame
        Square similarity table whose index and columns carry the same
        customer IDs.
    top_n : int, default 3
        Maximum number of lookalikes to return per customer.

    Returns
    -------
    dict
        Maps each customer ID to a list of ``(other_customer_id, score)``
        tuples ordered from most to least similar. Scores are plain Python
        floats. The list is shorter than ``top_n`` when fewer other customers
        exist.
    """
    lookalikes = {}
    for customer_id in similarity_df.index:
        # Exclude the customer by label instead of skipping the first sorted
        # row: when another customer ties with (or numerically exceeds) the
        # self-similarity, the customer itself is not guaranteed to sort first.
        others = similarity_df[customer_id].drop(labels=customer_id)

        # A stable sort keeps tied scores in their original index order, so
        # the result is deterministic.
        top_matches = others.sort_values(ascending=False, kind='stable').head(top_n)

        # tolist() converts NumPy scalars to Python floats so the tuples
        # serialise as plain numbers regardless of the NumPy version.
        lookalikes[customer_id] = list(zip(top_matches.index, top_matches.tolist()))
    return lookalikes

# Compute the lookalikes for every customer in the similarity table.
lookalike_dict = get_top_3_similar(customer_profiles, similarity_df)

# Flatten the mapping into one record per customer; the 'Lookalikes' cell
# holds that customer's list of (customer_id, score) tuples.
lookalike_list = []
for cust_id, lookalike in lookalike_dict.items():
    lookalike_list.append({'CustomerID': cust_id, 'Lookalikes': lookalike})

lookalike_df = pd.DataFrame(lookalike_list)
print(lookalike_df.head())

  CustomerID                                         Lookalikes
0      C0001  [(C0103, 0.9975729385618538), (C0092, 0.996878...
1      C0002  [(C0029, 0.9998543931340029), (C0077, 0.996103...
2      C0003  [(C0111, 0.9984874468302141), (C0190, 0.996656...
3      C0004  [(C0165, 0.9983897071764074), (C0162, 0.998086...
4      C0005  [(C0167, 0.9999721868436701), (C0020, 0.999714...


Save the lookalikes of the first 20 customers to a CSV file

In [13]:
# Keep only the customers whose IDs are in the first 20 rows of customer_profiles.
first_20_ids = customer_profiles['CustomerID'].head(20)
keep_rows = lookalike_df['CustomerID'].isin(first_20_ids)
lookalike_df = lookalike_df[keep_rows]

# Write the filtered table to disk without the DataFrame index column.
lookalike_df.to_csv('Jaswanth_Kumar_Lookalike.csv', index=False)
print("Lookalike results saved successfully!")


Lookalike results saved successfully!
