In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

In [2]:
# Source locations of the three raw CSV tables.
customer_file_url = "https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE"
products_file_url = "https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0"
transactions_file_url = "https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF"

# Load each table into its own DataFrame.
customers = pd.read_csv(customer_file_url)
products = pd.read_csv(products_file_url)
transactions = pd.read_csv(transactions_file_url)

# Build one row per transaction, enriched first with the matching customer's
# columns (joined on CustomerID) and then with the matching product's columns
# (joined on ProductID). Left joins keep every transaction row even when no
# customer/product match exists.
customer_transactions = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

In [3]:
# Derive the signup year for every transaction row from the SignupDate column.
signup_dates = pd.to_datetime(customer_transactions['SignupDate'])
customer_transactions['SignupYear'] = signup_dates.dt.year

# Per-customer summary: output column name -> (source column, aggregation).
profile_aggregations = {
    'total_transactions': ('TransactionID', 'count'),
    'total_quantity': ('Quantity', 'sum'),
    'total_spent': ('TotalValue', 'sum'),
    'avg_transaction_value': ('TotalValue', 'mean'),
    'region': ('Region', 'first'),
    'signup_year': ('SignupYear', 'first'),
}
customer_profile = (
    customer_transactions
    .groupby('CustomerID')
    .agg(**profile_aggregations)
    .reset_index()
)

# Per-customer spend in each product category: one row per CustomerID, one
# column per Category, 0 where the customer bought nothing in that category.
# The outer 'category_spend' column level is dropped so that the columns are
# just the category names.
category_spending = (
    customer_transactions
    .groupby(['CustomerID', 'Category'])
    .agg(category_spend=('TotalValue', 'sum'))
    .unstack(fill_value=0)
    .droplevel(0, axis=1)
)

# Combine the summary features with the category spend columns.
customer_data = customer_profile.merge(category_spending, on='CustomerID')

In [4]:
# Feature groups: the four aggregate metrics plus one spend column per product
# category are standardised; region is one-hot encoded.
# NOTE(review): signup_year is in customer_data but in neither list, so the
# ColumnTransformer's default remainder='drop' leaves it out of the feature
# matrix - confirm that this is intended.
numerical_cols = ['total_transactions', 'total_quantity', 'total_spent', 'avg_transaction_value'] + list(category_spending.columns)
categorical_cols = ['region']

# Create a preprocessor with both one-hot encoding for categorical data and scaling for numerical data.
# sparse_threshold=0 forces a dense ndarray result: OneHotEncoder emits a sparse
# matrix by default, and without this the combined output silently becomes
# sparse whenever its overall density falls below 0.3, which would break the
# DataFrame construction below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols),
    ],
    sparse_threshold=0,
)

# Apply the transformations and concatenate them
customer_data_transformed = preprocessor.fit_transform(customer_data.drop(columns='CustomerID'))

# Look the fitted encoder up by its transformer name rather than by position,
# so reordering the transformers list cannot mislabel the columns.
encoded_categorical_cols = list(
    preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
)

# Wrap the transformed matrix in a DataFrame that shares customer_data's index,
# so the CustomerID reattachment below aligns row-for-row.
scaled_customer_data = pd.DataFrame(
    customer_data_transformed,
    columns=numerical_cols + encoded_categorical_cols,
    index=customer_data.index,
)

# Reattach CustomerID to the scaled data
scaled_customer_data['CustomerID'] = customer_data['CustomerID']

In [5]:
# Pairwise cosine similarity between every pair of customers, computed on all
# transformed feature columns (everything except the CustomerID label).
feature_matrix = scaled_customer_data.drop(columns='CustomerID')
customer_similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with CustomerID so scores can be looked up by customer.
customer_ids = scaled_customer_data['CustomerID']
similarity_df = pd.DataFrame(
    customer_similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [6]:
# For each of the first 20 customers (in scaled_customer_data row order), find
# the 3 other customers with the highest cosine similarity.

lookalike_dict = {}

first_customer_ids = scaled_customer_data['CustomerID'].iloc[:20]
for customer_id in first_customer_ids:
    # Similarity of this customer to everyone else; the self-match is removed
    # so a customer is never reported as their own lookalike.
    scores_vs_others = similarity_df[customer_id].drop(customer_id)
    # Store the three best matches as (lookalike_id, score) pairs.
    lookalike_dict[customer_id] = list(scores_vs_others.nlargest(3).items())

# One row per customer; the list of pairs is stored as its string form so it
# fits in a single CSV cell.
lookalike_df = pd.DataFrame(
    list(lookalike_dict.items()),
    columns=['CustomerID', 'Lookalikes'],
).assign(Lookalikes=lambda frame: frame['Lookalikes'].map(str))

lookalike_df.to_csv('Moyank_Giri_Lookalike.csv', index=False)

In [7]:
# Preview the lookalike table (up to 20 rows).
lookalike_df.head(n=20)

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0120', 0.8597432058158425), ('C0181', 0.84..."
1,C0002,"[('C0159', 0.8689822386597825), ('C0128', 0.84..."
2,C0003,"[('C0152', 0.8025800777256402), ('C0031', 0.77..."
3,C0004,"[('C0075', 0.9063076603462492), ('C0065', 0.87..."
4,C0005,"[('C0007', 0.9113053514061077), ('C0123', 0.86..."
5,C0006,"[('C0196', 0.7802232625204011), ('C0185', 0.76..."
6,C0007,"[('C0005', 0.9113053514061077), ('C0140', 0.84..."
7,C0008,"[('C0109', 0.8977355689831823), ('C0024', 0.82..."
8,C0009,"[('C0060', 0.9697146762451263), ('C0014', 0.95..."
9,C0010,"[('C0111', 0.9355569075844422), ('C0062', 0.92..."
