### Import necessary libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from datetime import datetime

### Load and preprocess data

In [2]:
# Read the three raw tables (customers, products, transactions) from the
# working directory; each CSV becomes its own DataFrame.
customers, products, transactions = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)

In [3]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly: wrapping it in print() would append a stray
# "None" line after every summary.
customers.info()
products.info()
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------

In [4]:
# Report the number of missing values per column for each raw table.
for frame in (customers, products, transactions):
    missing_per_column = frame.isnull().sum()
    print(missing_per_column)

CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


In [5]:
# Convert the date columns (loaded as plain objects) to datetime64 in place
# so that date arithmetic is possible further down.
for frame, date_col in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [6]:
# Enrich every transaction with its customer and product attributes.
# Both joins use pandas' default inner join; the overlapping "Price" column
# gets the default _x (transactions) / _y (products) suffixes.
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

In [7]:
# Show the merged frame's column names together with its (rows, columns) shape.
list(merged_data.columns), merged_data.shape

(['TransactionID',
  'CustomerID',
  'ProductID',
  'TransactionDate',
  'Quantity',
  'TotalValue',
  'Price_x',
  'CustomerName',
  'Region',
  'SignupDate',
  'ProductName',
  'Category',
  'Price_y'],
 (1000, 13))

### Feature Engineering
Develop features to represent customer profiles and transaction history

In [8]:
# Reference date for the signup-recency feature. It is anchored to the newest
# transaction in the data (truncated to midnight) rather than the wall clock,
# so re-running the notebook on another day yields the same feature values.
# Changing the reference only shifts every recency by the same constant, which
# the standardisation below removes (assuming SignupDate carries no
# time-of-day component -- TODO confirm), so similarities are unaffected.
reference_date = merged_data['TransactionDate'].max().normalize()

# One row per customer that appears in merged_data. Customers without any
# transaction are absent because the upstream merges are inner joins.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),        # total amount spent
        TotalTransactions=('TransactionID', 'count'),  # number of transactions
        ProductDiversity=('ProductID', 'nunique'),  # distinct products bought
        AvgPrice=('Price_x', 'mean'),               # mean transaction-side price
        SignupDate=('SignupDate', 'min'),           # signup date of the customer
    )
    # Days between signup and the reference date, computed vectorised
    # instead of through a per-group Python lambda.
    .assign(SignupRecency=lambda df: (reference_date - df['SignupDate']).dt.days)
    .drop(columns='SignupDate')
    .reset_index()
)

# Normalize features: zero mean / unit variance per column so that no single
# feature dominates the cosine similarity purely because of its scale.
scaler = StandardScaler()
feature_matrix = scaler.fit_transform(customer_features.drop(columns=['CustomerID']))
customer_ids = customer_features['CustomerID'].values

### Compute Similarity

In [9]:
# Tunables for the lookalike export.
N_TARGET_CUSTOMERS = 20  # lookalikes are produced for the first 20 customers
TOP_K = 3                # number of lookalikes reported per customer

# Pairwise cosine similarity between all standardised customer feature vectors.
similarity_matrix = cosine_similarity(feature_matrix)

# Get top 3 lookalikes
lookalikes = {}
for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Rank every *other* customer by similarity. The customer itself is
    # excluded by index rather than by assuming it sorts first: with a tie at
    # the top (identical/parallel feature vectors or float rounding on the
    # diagonal) position 0 is not guaranteed to be the customer itself.
    ranked = sorted(
        (
            (other_idx, score)
            for other_idx, score in enumerate(similarity_matrix[idx])
            if other_idx != idx
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    lookalikes[customer_id] = [
        (customer_ids[other_idx], round(score, 2))
        for other_idx, score in ranked[:TOP_K]
    ]

# Save Lookalike.csv: one row per target customer, laid out as
# cust_id, then (similar_cust_id_k, score_k) for k = 1..TOP_K.
output = []
for cust_id, similar_customers in lookalikes.items():
    row = [cust_id]
    for sim_cust, score in similar_customers:
        row.extend([sim_cust, score])
    output.append(row)

columns = ['cust_id']
for rank in range(1, TOP_K + 1):
    columns += [f'similar_cust_id_{rank}', f'score_{rank}']

lookalike_df = pd.DataFrame(output, columns=columns)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv generated!")

Lookalike.csv generated!
