Lookalike Model


In [1]:
import pandas as pd


customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")
print(customers.head())
print(transactions.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  


In [2]:
data = pd.merge(customers, transactions, on="CustomerID", how="inner")
print(data.head())


  CustomerID      CustomerName         Region  SignupDate TransactionID  \
0      C0001  Lawrence Carroll  South America  2022-07-10        T00015   
1      C0001  Lawrence Carroll  South America  2022-07-10        T00932   
2      C0001  Lawrence Carroll  South America  2022-07-10        T00085   
3      C0001  Lawrence Carroll  South America  2022-07-10        T00445   
4      C0001  Lawrence Carroll  South America  2022-07-10        T00436   

  ProductID      TransactionDate  Quantity  TotalValue   Price  
0      P054  2024-01-19 03:12:55         2      114.60   57.30  
1      P022  2024-09-17 09:01:18         3      412.62  137.54  
2      P096  2024-04-08 00:01:00         2      614.94  307.47  
3      P083  2024-05-07 03:11:44         2      911.44  455.72  
4      P029  2024-11-02 17:04:16         3     1300.92  433.64  


In [5]:
selected_features = ['CustomerID', 'CustomerName', 'Region', 'SignupDate', 'Quantity']
data = data[selected_features]


In [6]:
data = pd.get_dummies(data, columns=['Region', 'SignupDate'], drop_first=True)


In [15]:
from sklearn.preprocessing import StandardScaler

numerical_cols = ['Quantity']
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])


In [16]:
print(data.head())


  CustomerID      CustomerName  Quantity  Region_Europe  Region_North America  \
0      C0001  Lawrence Carroll -0.480571          False                 False   
1      C0001  Lawrence Carroll  0.414347          False                 False   
2      C0001  Lawrence Carroll -0.480571          False                 False   
3      C0001  Lawrence Carroll -0.480571          False                 False   
4      C0001  Lawrence Carroll  0.414347          False                 False   

   Region_South America  SignupDate_2022-02-02  SignupDate_2022-02-10  \
0                  True                  False                  False   
1                  True                  False                  False   
2                  True                  False                  False   
3                  True                  False                  False   
4                  True                  False                  False   

   SignupDate_2022-02-13  SignupDate_2022-02-19  ...  SignupDate_2024-10-2

In [17]:
customer_features = data.drop(columns=['CustomerID'])


In [19]:
print(customer_features.dtypes)


CustomerName              object
Quantity                 float64
Region_Europe               bool
Region_North America        bool
Region_South America        bool
                          ...   
SignupDate_2024-11-13       bool
SignupDate_2024-11-15       bool
SignupDate_2024-11-18       bool
SignupDate_2024-11-22       bool
SignupDate_2024-12-28       bool
Length: 182, dtype: object


In [20]:
customer_features = customer_features.select_dtypes(include=['float64', 'int64'])


In [21]:
customer_features = pd.get_dummies(customer_features, drop_first=True)


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(customer_features)


In [23]:

customer_features = customer_features.select_dtypes(include=['float64', 'int64'])


customer_features = pd.get_dummies(customer_features, drop_first=True)

similarity_matrix = cosine_similarity(customer_features)


In [43]:
similarity_df = pd.DataFrame(similarity_matrix, index=data['CustomerID'], columns=data['CustomerID'])
print(similarity_df.head())


CustomerID  C0001  C0001  C0001  C0001  C0001  C0002  C0002  C0002  C0002  \
CustomerID                                                                  
C0001         1.0   -1.0    1.0    1.0   -1.0    1.0   -1.0    1.0    1.0   
C0001        -1.0    1.0   -1.0   -1.0    1.0   -1.0    1.0   -1.0   -1.0   
C0001         1.0   -1.0    1.0    1.0   -1.0    1.0   -1.0    1.0    1.0   
C0001         1.0   -1.0    1.0    1.0   -1.0    1.0   -1.0    1.0    1.0   
C0001        -1.0    1.0   -1.0   -1.0    1.0   -1.0    1.0   -1.0   -1.0   

CustomerID  C0003  ...  C0198  C0199  C0199  C0199  C0199  C0200  C0200  \
CustomerID         ...                                                    
C0001        -1.0  ...    1.0    1.0   -1.0    1.0    1.0   -1.0    1.0   
C0001         1.0  ...   -1.0   -1.0    1.0   -1.0   -1.0    1.0   -1.0   
C0001        -1.0  ...    1.0    1.0   -1.0    1.0    1.0   -1.0    1.0   
C0001        -1.0  ...    1.0    1.0   -1.0    1.0    1.0   -1.0    1.0   
C0001     

In [44]:
target_customers = data['CustomerID'][:20]


In [67]:
similar_customers = similarity_df[customer].sort_index(ascending=False)[1:4]  # Exclude self (highest score)

In [68]:
lookalike_map = {}

for customer in target_customers:

    similar_customers = similarity_df[customer].sort_index(ascending=False)[1:4]


    lookalike_map[customer] = [{'CustomerID': cust_id, 'Score': score}
                               for cust_id, score in zip(similar_customers.index, similar_customers.values)]


lookalike_list = []

for customer, lookalikes in lookalike_map.items():
    heir top 3 lookalikes
    for idx, lookalike in enumerate(lookalikes):
        lookalike_list.append([customer, idx+1, lookalike['CustomerID'], lookalike['Score']])


lookalike_df = pd.DataFrame(lookalike_list, columns=['CustomerID', 'Rank', 'LookalikeCustomerID', 'SimilarityScore'])

# Save to CSV
lookalike_df.to_csv("Lookalike.csv", index=False)
