In [1]:
import pandas as pd
# Load the three raw tables (customers, products, transactions) from CSV.
customers, products, transactions = map(
    pd.read_csv,
    (
        "/content/Customers.csv",
        "/content/Products.csv",
        "/content/Transactions.csv",
    ),
)


In [2]:
# Parse the date columns into pandas datetimes (in place on each frame).
for frame, date_column in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [7]:
# Attach customer attributes to every transaction row (join on CustomerID),
# then attach product attributes to the result (join on ProductID).
transactions_customers = pd.merge(transactions, customers, on='CustomerID')
full_data = pd.merge(transactions_customers, products, on='ProductID')


In [8]:
# Total spend per customer: the sum of TotalValue across that customer's
# transactions. High spenders might have similar spending patterns.
total_value = (
    transactions
    .groupby('CustomerID')['TotalValue']
    .sum()
    .rename('TotalTransactionValue')
    .reset_index()
)

In [9]:
# Average spend per purchase: the mean of TotalValue across each customer's
# transactions. Customers with similar spending levels may have similar habits.
avg_value = (
    transactions
    .groupby('CustomerID')['TotalValue']
    .mean()
    .rename('AvgTransactionValue')
    .reset_index()
)


In [10]:
# Purchase volume per customer: the sum of Quantity across that customer's
# transactions. Bulk buyers may share similar preferences.
total_quantity = (
    transactions
    .groupby('CustomerID')['Quantity']
    .sum()
    .rename('TotalQuantity')
    .reset_index()
)


In [11]:
# Per-customer purchase counts by product category: one row per customer,
# one column per category, and 0 where a customer has no rows in a category.
category_pair_counts = full_data.groupby(['CustomerID', 'Category']).size()
product_category_counts = category_pair_counts.unstack(fill_value=0)


In [13]:
# Each customer's favorite product: the ProductID that appears in the most
# of that customer's rows in full_data.
most_frequent_product = (
    full_data
    .groupby('CustomerID')['ProductID']
    .agg(lambda product_ids: product_ids.value_counts().idxmax())
    .rename('MostFrequentProduct')
    .reset_index()
)


In [14]:
# One-hot encode each customer's Region so region can be used as a numeric
# feature; CustomerID is kept as the join key.
customer_regions = customers[['CustomerID', 'Region']]
region_encoded = pd.get_dummies(customer_regions, columns=['Region'])


In [16]:
from datetime import datetime

# Signup tenure in whole days, measured against a snapshot date taken from the
# data (the day of the latest transaction) rather than the wall clock. Using
# datetime.now() made this feature change on every run, so results could not
# be reproduced after a kernel restart.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
snapshot_date = pd.to_datetime(transactions['TransactionDate']).max().normalize()
customers['SignupDuration'] = (snapshot_date - customers['SignupDate']).dt.days


In [17]:
# Assemble the per-customer feature table: start from the full list of
# CustomerIDs and left-join each feature table in turn, so every customer
# keeps a row even when a feature table has no entry for them.
feature_tables = [
    total_value,
    avg_value,
    total_quantity,
    product_category_counts,
    region_encoded,
    most_frequent_product,
    customers[['CustomerID', 'SignupDuration']],
]

customer_features = customers[['CustomerID']]
for feature_table in feature_tables:
    customer_features = customer_features.merge(
        feature_table, on='CustomerID', how='left'
    )


In [27]:
# Check for missing values in the normalized data
import pandas as pd

# Count missing values per feature column. Wrapping in a DataFrame first means
# the check works whether `normalized_features` is an array or a DataFrame.
pd.isna(pd.DataFrame(normalized_features)).sum()


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [28]:
from sklearn.impute import SimpleImputer

# Replace missing feature values with the column mean ('median' is the
# alternative strategy). Fit and transform are run on the same matrix.
imputer = SimpleImputer(strategy='mean')
imputer.fit(normalized_features)
normalized_features = imputer.transform(normalized_features)


In [30]:
import numpy as np
# Keep only the rows that contain no NaN in any column.
# NOTE(review): dropping a row here shifts row positions relative to
# `customers`, which the lookalike loop indexes by position — confirm that no
# rows are actually removed at this point.
rows_with_nan = np.isnan(normalized_features).any(axis=1)
normalized_features = normalized_features[~rows_with_nan]


In [32]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of rows in the feature matrix;
# entry [i, j] compares row i with row j.
similarity_matrix = cosine_similarity(X=normalized_features)
print(similarity_matrix)


[[ 1.         -0.16395168  0.35056588 ...  0.10238373  0.21426112
  -0.31506074]
 [-0.16395168  1.         -0.0104745  ...  0.36199634  0.17873118
   0.40426495]
 [ 0.35056588 -0.0104745   1.         ... -0.15410117  0.03132575
  -0.21869966]
 ...
 [ 0.10238373  0.36199634 -0.15410117 ...  1.          0.73084274
  -0.32467597]
 [ 0.21426112  0.17873118  0.03132575 ...  0.73084274  1.
  -0.44303295]
 [-0.31506074  0.40426495 -0.21869966 ... -0.32467597 -0.44303295
   1.        ]]


In [33]:
# Build {customer_id: [(similar_customer_id, score), ...]} holding, for every
# customer, the TOP_N most similar *other* customers, best match first.
TOP_N = 3

# Plain list of IDs for cheap positional lookup inside the loop.
customer_ids = customers['CustomerID'].tolist()

# Row i of similarity_matrix is matched to customers purely by position, so the
# two must have the same number of rows; otherwise IDs would be mis-assigned.
if similarity_matrix.shape[0] != len(customer_ids):
    raise ValueError(
        f"similarity_matrix has {similarity_matrix.shape[0]} rows but customers "
        f"has {len(customer_ids)}; feature rows are no longer aligned with customers"
    )

lookalikes = {}

for i, customer_id in enumerate(customer_ids):
    # similarity scores for customer
    scores = similarity_matrix[i]

    # Rank every customer by descending similarity, then remove the customer's
    # own index explicitly. Slicing argsort()[-4:-1] assumed "self" always sorts
    # last; with ties at the top score (identical feature rows, float rounding)
    # that could return the customer as its own lookalike.
    ranked = scores.argsort()[::-1]
    similar_indices = ranked[ranked != i][:TOP_N]

    # Store in the lookalikes dictionary
    lookalikes[customer_id] = [
        (customer_ids[idx], scores[idx]) for idx in similar_indices
    ]


In [34]:
# Flatten the lookalikes mapping into one record per (customer, similar customer)
# pair, keeping the similarity score alongside.
lookalike_data = [
    {'cust_id': customer_id, 'similar_cust_id': similar_id, 'score': score}
    for customer_id, similar_list in lookalikes.items()
    for similar_id, score in similar_list
]

# Write the pairs to Lookalike.csv without the DataFrame index column.
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)
