In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

# Read the three source tables (customers, products, transactions) from the
# current working directory, in that order.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Quick sanity check: print the first five rows of each table, one after another
print(customers.head(), products.head(), transactions.head(), sep='\n')


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [7]:
# Enrich every transaction row with its customer and product attributes.
# validate='many_to_one' makes pandas raise if a CustomerID / ProductID is
# duplicated in the lookup table, which would otherwise silently duplicate
# transaction rows and inflate the TotalValue sums computed below.
# NOTE: `transactions` is rebound here, so re-running this cell without
# reloading the CSVs merges the same columns a second time (pandas then
# appends _x/_y suffixes to the overlapping names).
transactions = transactions.merge(customers, on='CustomerID', how='left', validate='many_to_one')
transactions = transactions.merge(products, on='ProductID', how='left', validate='many_to_one')

# "TotalSpend": sum of TotalValue over all of a customer's transactions
customer_spend = (
    transactions.groupby('CustomerID')['TotalValue']
    .sum()
    .rename('TotalSpend')
    .reset_index()
)

# "TransactionFrequency": number of distinct transactions per customer
customer_freq = (
    transactions.groupby('CustomerID')['TransactionID']
    .nunique()
    .rename('TransactionFrequency')
    .reset_index()
)

# One row per customer: profile columns plus the two engineered features
customer_features = (
    customers
    .merge(customer_spend, on='CustomerID', how='left')
    .merge(customer_freq, on='CustomerID', how='left')
)

# Customers with no transactions come out of the left joins with NaN in the
# engineered columns. Zero-fill only those two columns: a blanket fillna(0)
# would also turn a missing name, region or signup date into the value 0.
customer_features = customer_features.fillna({'TotalSpend': 0, 'TransactionFrequency': 0})

# Preview the final feature set
print(customer_features.head())


  CustomerID        CustomerName         Region  SignupDate  TotalSpend  \
0      C0001    Lawrence Carroll  South America  2022-07-10     3354.52   
1      C0002      Elizabeth Lutz           Asia  2022-02-13     1862.74   
2      C0003      Michael Rivera  South America  2024-03-07     2725.38   
3      C0004  Kathleen Rodriguez  South America  2022-10-09     5354.88   
4      C0005         Laura Weber           Asia  2022-08-15     2034.24   

   TransactionFrequency  
0                   5.0  
1                   4.0  
2                   4.0  
3                   8.0  
4                   3.0  


In [9]:
# NOTE: this cell rebinds `customer_features` and consumes its 'Region' column,
# so it must be run exactly once after the feature-building cell above.

# One-hot encode the customer's region; an explicit float dtype keeps every
# feature column numeric regardless of the pandas default for dummy columns.
customer_features = pd.get_dummies(customer_features, columns=['Region'], dtype=float)

# Per-category purchase counts (not one-hot): rows are customers, columns are
# product categories, values are the number of transaction rows in that category.
category_counts = (
    transactions.groupby(['CustomerID', 'Category'])
    .size()
    .unstack(fill_value=0)
)
category_counts.columns = [f"Category_{col}" for col in category_counts.columns]
customer_features = customer_features.merge(category_counts, on='CustomerID', how='left')

# Customers that never appear in `transactions` get NaN counts from the left
# join. Zero-fill them here, otherwise the NaNs pass through StandardScaler
# into the scaled matrix and break later similarity computations.
category_cols = list(category_counts.columns)
customer_features[category_cols] = customer_features[category_cols].fillna(0)

# Standardize the numeric features (identifier and free-text columns are excluded)
features = customer_features.drop(['CustomerID', 'CustomerName', 'SignupDate'], axis=1)
assert not features.isna().any().any(), "feature matrix still contains NaN values"
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Preview the scaled features
print(features_scaled[:5])


[[-0.05188436  0.         -0.53881591 -0.57735027 -0.54653573  1.54590766
  -0.3211125  -1.04160638  1.55087763 -0.22104388]
 [-0.86271433 -0.45129368  1.85592145 -0.57735027 -0.54653573 -0.64686916
  -1.22113205  0.77663634 -1.14846331  0.67666495]
 [-0.393842   -0.45129368 -0.53881591 -0.57735027 -0.54653573  1.54590766
  -1.22113205 -0.13248502 -0.248683    0.67666495]
 [ 1.03537505  1.35388105 -0.53881591 -0.57735027 -0.54653573  1.54590766
   1.47892659 -1.04160638  0.65109731  1.57437379]
 [-0.76949861 -0.90258736  1.85592145 -0.57735027 -0.54653573 -0.64686916
  -1.22113205 -1.04160638  0.65109731 -0.22104388]]
