# Lookalike Model

### Import necessary libraries

In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### Load datasets

In [4]:
# Read the three source tables from CSV files (paths are relative to the working directory)
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

### Parse date columns

In [5]:
# Parse the date columns into datetime values so date arithmetic works downstream
for frame, date_column in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [6]:
# Account age = whole days elapsed between signup and the moment this cell runs
# (the raw value therefore depends on when the notebook is executed)
time_since_signup = pd.Timestamp.now() - customers['SignupDate']
customers['AccountAge'] = time_since_signup.dt.days

### One-hot encode region

In [11]:
# Convert the Region column into multiple binary columns, one per unique region
encoder = OneHotEncoder(sparse_output=False)  # dense array output so it can be wrapped in a DataFrame
region_encoded = encoder.fit_transform(customers[['Region']])
region_encoded_df = pd.DataFrame(
    region_encoded,
    columns=encoder.get_feature_names_out(['Region']),
    index=customers.index,  # share the index so the axis=1 concat lines rows up exactly
)
# Remove one-hot columns left over from an earlier run of this cell before
# concatenating; otherwise re-executing the cell appends duplicate columns,
# which would then be selected twice when the feature matrix is built.
customers = pd.concat(
    [customers.drop(columns=region_encoded_df.columns, errors='ignore'), region_encoded_df],
    axis=1,
)


### Aggregate transaction data

In [12]:
# Per-customer totals: units bought, money spent, and number of transactions
transaction_summary = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalQuantity=('Quantity', 'sum'),
        TotalSpent=('TotalValue', 'sum'),
        TransactionCount=('TransactionID', 'count'),
    )
)

### Merge with customer data

In [13]:
# Combine demographic and transaction data into a single DataFrame.
# The left join keeps every customer, including those with no transactions.
customer_features = customers.merge(transaction_summary, on='CustomerID', how='left')

# Customers without transactions get NaN in the aggregated columns; treat that as
# zero activity. Only those columns are filled, so a missing value in any other
# column (name, region, dates) is not silently replaced by 0.
transaction_columns = list(transaction_summary.columns)
customer_features[transaction_columns] = customer_features[transaction_columns].fillna(0)

### Standardize numerical features

In [14]:
# Put the numeric features on a common scale so that no single column
# dominates the similarity computation
numeric_features = ['AccountAge', 'TotalQuantity', 'TotalSpent', 'TransactionCount']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numeric_features])
customer_features[numeric_features] = scaled_values

### Compute similarity

In [15]:
# Pairwise cosine similarity between customers over the scaled numeric
# features plus the one-hot region columns
region_columns = list(region_encoded_df.columns)
feature_columns = [*numeric_features, *region_columns]
feature_frame = customer_features[feature_columns]
similarity_matrix = cosine_similarity(feature_frame)

### Generate top 3 lookalikes for first 20 customers

In [16]:
# For each of the first 20 customers, find the top 3 most similar customers.
N_TARGET_CUSTOMERS = 20
TOP_N = 3

# Row i of similarity_matrix corresponds to row i of customer_features,
# so a plain positional list of IDs is enough to translate indices back.
customer_ids = customer_features['CustomerID'].tolist()

lookalike_data = {}
for idx, cust_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Rank all customers by descending similarity. A stable sort keeps tied
    # scores in their original row order, making the result deterministic.
    ranked = np.argsort(-similarity_matrix[idx], kind='stable')
    # Exclude the customer itself explicitly instead of assuming it is ranked
    # first: ties or floating-point rounding can place another customer ahead.
    similar_indices = ranked[ranked != idx][:TOP_N]
    # Store plain Python floats so the stringified output stays clean
    # regardless of how the installed NumPy version prints its scalars.
    lookalike_data[cust_id] = [
        (customer_ids[other], float(similarity_matrix[idx][other]))
        for other in similar_indices
    ]

### Save results

In [20]:
# Write one row per target customer: its ID and the stringified list of
# (lookalike ID, similarity score) pairs
target_ids = list(lookalike_data)
serialized_matches = [str(matches) for matches in lookalike_data.values()]
lookalike_df = pd.DataFrame({'CustomerID': target_ids, 'Lookalikes': serialized_matches})
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv generated successfully!")

Lookalike.csv generated successfully!


--------