In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the three input tables from CSV files in the working directory,
# in the order: customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


In [3]:
# Enrich every transaction with the matching customer and product rows.
# Left joins keep each transaction even if a key has no match in the
# lookup table. The printed result carries both Price_x and Price_y, the
# suffixes pandas adds when both sides of a merge have a 'Price' column.
merged_data = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)
print(merged_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [4]:
def _most_common(values):
    """Return the first mode of ``values``, or None when there is no mode."""
    modes = values.mode()
    return None if modes.empty else modes[0]


# One row per customer: total and mean transaction value, number of
# transactions, and the product category that appears most often.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
        transaction_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        favorite_category=pd.NamedAgg(column='Category', aggfunc=_most_common),
    )
    .reset_index()
)

In [5]:
# Build the final feature table in one pass:
#   1. attach Region and SignupDate from the customers table,
#   2. turn SignupDate into a numeric "days since signup" feature measured
#      against the current wall-clock time,
#   3. drop the raw date column,
#   4. one-hot encode the categorical columns (drop_first=True omits the
#      first level of each encoded column).
customer_features = (
    customer_features
    .merge(
        customers[['CustomerID', 'Region', 'SignupDate']],
        how='left',
        on='CustomerID',
    )
    .assign(
        days_since_signup=lambda frame: (
            pd.Timestamp.today() - pd.to_datetime(frame['SignupDate'])
        ).dt.days
    )
    .drop(columns='SignupDate')
    .pipe(pd.get_dummies, columns=['Region', 'favorite_category'], drop_first=True)
)

print(customer_features.head())

  CustomerID  total_spent  avg_transaction_value  transaction_count  \
0      C0001      3354.52                670.904                  5   
1      C0002      1862.74                465.685                  4   
2      C0003      2725.38                681.345                  4   
3      C0004      5354.88                669.360                  8   
4      C0005      2034.24                678.080                  3   

   days_since_signup  Region_Europe  Region_North America  \
0                931              0                     0   
1               1078              0                     0   
2                325              0                     0   
3                840              0                     0   
4                895              0                     0   

   Region_South America  favorite_category_Clothing  \
0                     1                           0   
1                     0                           1   
2                     1                  

In [6]:
# Standardise the continuous features so they are on a comparable scale
# before similarity is computed; the one-hot columns are left untouched.
numerical_features = [
    'total_spent',
    'avg_transaction_value',
    'transaction_count',
    'days_since_signup',
]
scaler = StandardScaler().fit(customer_features[numerical_features])
customer_features[numerical_features] = scaler.transform(customer_features[numerical_features])

# Pairwise cosine similarity between customers over every feature column
# (the identifier column is excluded).
similarity_matrix = cosine_similarity(customer_features.drop(columns='CustomerID'))

# Label both axes with CustomerID so scores can be looked up by customer.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)

In [7]:
def get_top_n_similar(customers, similarity_matrix, top_n=3):
    """Find the most similar other customers for each requested customer.

    Parameters
    ----------
    customers : iterable
        Customer IDs to look up. Each must be a column label of
        ``similarity_matrix``.
    similarity_matrix : pandas.DataFrame
        Pairwise similarity scores whose index and columns are customer IDs.
    top_n : int, default 3
        Maximum number of lookalikes to return per customer.

    Returns
    -------
    dict
        Maps each customer ID to a list of ``(other_customer_id, score)``
        tuples ordered from highest to lowest score. Scores are built-in
        floats rounded to two decimals. The customer itself is never part
        of its own list.

    Raises
    ------
    KeyError
        If a requested customer ID is not a column of ``similarity_matrix``.
    """
    similar_customers = {}
    for customer_id in customers:
        # Remove the customer's own entry by label rather than assuming it
        # sorts first: a tie (or floating-point noise) with another customer
        # could otherwise keep the customer in its own list and drop a
        # genuine match.
        scores = similarity_matrix[customer_id].drop(labels=customer_id, errors='ignore')
        # A stable sort keeps tied scores in their original order, so the
        # result is deterministic.
        ranked = scores.sort_values(ascending=False, kind='mergesort')
        top_similar = ranked.iloc[:max(top_n, 0)]
        # float(...) converts NumPy scalars to built-in floats so the tuples
        # serialise as plain numbers (e.g. 0.97) rather than a NumPy repr.
        similar_customers[customer_id] = [
            (idx, round(float(score), 2)) for idx, score in top_similar.items()
        ]
    return similar_customers

# Select the first 20 customer IDs in the feature table and look up the
# three most similar customers for each of them.
first_20_customers = customer_features['CustomerID'].iloc[:20]
lookalike_map = get_top_n_similar(first_20_customers, similarity_df, top_n=3)

# Persist one row per customer: the ID plus its list of (lookalike, score)
# pairs, written to Lookalike.csv without the DataFrame index.
lookalike_df = pd.DataFrame(
    [
        dict(CustomerID=source_id, Lookalikes=matches)
        for source_id, matches in lookalike_map.items()
    ]
)
lookalike_df.to_csv('Lookalike.csv', index=False)