In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Data Preparation

In [4]:
# Load data

In [5]:
# Read the three source tables. The date columns of the customer and
# transaction files are parsed on load; the product file uses default parsing.
customers = pd.read_csv('Customers.csv', parse_dates=['SignupDate'])
transactions = pd.read_csv('Transactions.csv', parse_dates=['TransactionDate'])
products = pd.read_csv('Products.csv')

In [6]:
# Attach product attributes to every transaction row (default inner join on ProductID)
txn_prod = pd.merge(transactions, products, on='ProductID')

In [7]:
# Tenure is measured from signup up to the latest transaction date in the data
last_txn_date = transactions['TransactionDate'].max()
time_since_signup = last_txn_date - customers['SignupDate']
customers['TenureDays'] = time_since_signup.dt.days

# One-hot encode Region; every other customer column is carried through as-is
profile_features = pd.get_dummies(data=customers, columns=['Region'])

In [8]:
def _first_to_last_days(dates):
    """Return the `.days` component of (latest - earliest) value in `dates`."""
    return (dates.max() - dates.min()).days


# Per-customer transaction behaviour features.
# Note: PurchaseFrequency stores the number of days between a customer's first
# and last transaction (an activity span), not a purchases-per-period rate.
txn_agg = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalTransactions=('TransactionID', 'count'),
        TotalSpend=('TotalValue', 'sum'),
        AvgQuantity=('Quantity', 'mean'),
        PurchaseFrequency=('TransactionDate', _first_to_last_days),
    )
    .reset_index()
)

In [9]:
# Count transactions per (customer, category), one column per category
category_counts = (
    txn_prod
    .groupby(['CustomerID', 'Category'])
    .size()
    .unstack(fill_value=0)
)

# Convert counts to each customer's share of transactions per category
customer_totals = category_counts.sum(axis=1)
category_pref = category_counts.div(customer_totals, axis=0).fillna(0)

In [10]:
# Assemble one feature row per customer, indexed by CustomerID.
# Left joins keep customers that have no transactions; their missing
# behaviour and category values are filled with 0.
features = (
    profile_features
    .merge(txn_agg, on='CustomerID', how='left')
    .merge(category_pref, on='CustomerID', how='left')
    # Drop the non-feature columns before filling, so the zero fill below is
    # never written into the name or date columns.
    .drop(columns=['CustomerName', 'SignupDate'])
    .set_index('CustomerID')
    .fillna(0)
)


In [11]:
# Standardise the feature columns before comparing customers
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Pairwise cosine similarity between all rows, labelled on both axes with the
# index of `features`
cosine_sim = cosine_similarity(scaled_features)
customer_ids = features.index
similarity_df = pd.DataFrame(data=cosine_sim, index=customer_ids, columns=customer_ids)

In [12]:
# Lookalikes are produced for customer IDs C0001 .. C<NUM_TARGET_CUSTOMERS>,
# keeping the TOP_N_LOOKALIKES most similar other customers for each.
NUM_TARGET_CUSTOMERS = 20
TOP_N_LOOKALIKES = 3

target_customers = [f'C{i:04d}' for i in range(1, NUM_TARGET_CUSTOMERS + 1)]

# Fail up front with the full list of absent IDs instead of a bare KeyError on
# whichever one the loop happens to reach first.
missing_targets = [c for c in target_customers if c not in similarity_df.index]
if missing_targets:
    raise KeyError(f"Target customers missing from similarity matrix: {missing_targets}")

lookalike_mapping = {}
for customer in target_customers:
    # Exclude the customer's own entry, then take the highest scores.
    # nlargest breaks ties by original order (keep='first').
    sim_scores = similarity_df.loc[customer].drop(customer).nlargest(TOP_N_LOOKALIKES)
    lookalike_mapping[customer] = list(zip(sim_scores.index, np.round(sim_scores.values, 2)))

In [13]:
# One output row per target customer; the matches are flattened into a single
# "(id,score),(id,score),..." string.
output_data = [
    {
        'CustomerID': cust_id,
        'Lookalikes': ','.join(f"({match_id},{score})" for match_id, score in matches),
    }
    for cust_id, matches in lookalike_mapping.items()
]

pd.DataFrame(output_data).to_csv('Lookalike.csv', index=False)
print("Lookalike model results saved successfully!")

Lookalike model results saved successfully!


In [14]:
# Sanity check: read the saved file back and show its first three rows
saved_lookalikes = pd.read_csv('Lookalike.csv')
print(saved_lookalikes.head(3))

  CustomerID                              Lookalikes
0      C0001  (C0118,0.74),(C0168,0.74),(C0120,0.74)
1      C0002   (C0159,0.94),(C0134,0.92),(C0106,0.9)
2      C0003  (C0129,0.89),(C0151,0.81),(C0031,0.79)
