In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the three input tables from the working directory, in a fixed order:
# customers, then products, then transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)


In [4]:
# Parse the date columns into pandas datetimes, overwriting each column in
# place on the frame it belongs to.
for frame, date_col in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [5]:
# Build one wide table with a row per transaction: attach the customer
# attributes first, then the product attributes. Both joins are left joins,
# so every transaction row is kept even when a key has no match.
merged = (
    transactions
    .merge(customers, on="CustomerID", how="left")
    .merge(products, on="ProductID", how="left")
)

In [6]:
# `Price_y` is the right-hand (Products) copy of a clashing `Price` column,
# named by the merge's default `_y` suffix; expose it under the plain name.
merged = merged.rename(columns={'Price_y': 'Price'})


In [7]:
# Discard the left-hand duplicate of the price column (`Price_x`).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
merged = merged.drop(columns=['Price_x'], errors='ignore')


In [8]:
def _most_frequent(values):
    """Return the most common non-null value in ``values``, or None if there is none.

    ``Series.mode()`` ignores NaN, so a group whose values are all missing
    yields an empty result; indexing it with ``[0]`` would raise. The check is
    therefore made on the mode result rather than on the group length (a
    groupby group is never empty, so a length check cannot catch this case).
    When several values tie, the first one in ``mode()``'s sorted output wins.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# One row of aggregate features per customer.
customer_features = merged.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spending
    'TransactionID': 'count',  # Number of non-null transaction IDs
    'Price': 'mean',  # Mean of the Price column over the customer's rows
    'Region': _most_frequent,  # Most common region value
    'Category': _most_frequent,  # Most purchased category
}).reset_index()

In [9]:
# Expand the categorical columns into indicator columns. Each new column is
# named "<original column>_<value>" because the prefix list mirrors the
# column list.
categorical_columns = ['Region', 'Category']
customer_features = pd.get_dummies(
    customer_features,
    columns=categorical_columns,
    prefix=categorical_columns,
)

In [10]:
# Standardize every feature column (everything except the CustomerID key)
# before computing similarities.
feature_columns = [name for name in customer_features.columns if name != 'CustomerID']
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features[feature_columns])

# Rebuild a labelled frame from the scaled array and re-attach the key.
# The CustomerID Series is aligned on the row index when it is assigned.
customer_features_scaled = pd.DataFrame(
    normalized_features, columns=feature_columns
).assign(CustomerID=customer_features['CustomerID'])

In [11]:
# Pairwise cosine similarity between the scaled customer feature vectors,
# labelled with CustomerID on both axes so it can be looked up by ID.
similarity_matrix = cosine_similarity(customer_features_scaled[feature_columns])
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)

# Top 3 lookalikes for the first 20 rows of `customers`
# (presumably C0001-C0020 -- this relies on the row order of Customers.csv).
lookalikes = {}
for customer_id in customers['CustomerID'][:20]:
    if customer_id not in similarity_df.columns:
        # The similarity matrix only covers customers present in
        # `customer_features`; anyone else has no feature vector to compare.
        lookalikes[customer_id] = []
        continue

    # Remove the customer by label instead of slicing off the first sorted
    # entry: with ties or float rounding the customer's own score is not
    # guaranteed to sort first. nlargest returns the 3 highest scores in
    # descending order and keeps the earliest entries on ties.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .nlargest(3)
    )
    lookalikes[customer_id] = list(zip(similar_customers.index, similar_customers.values))


In [12]:

# Save Lookalikes to CSV: one row per customer, with the list of
# (lookalike_id, score) pairs stored as its string representation.
output_path = "Naman_Chopra_Lookalike.csv"
lookalikes_df = pd.DataFrame([
    {
        "cust_id": cust_id,
        # Cast scores to built-in float so the text holds plain numbers;
        # NumPy >= 2 renders NumPy scalars inside a list as "np.float64(...)".
        "lookalikes": str([(other_id, float(score)) for other_id, score in similars]),
    }
    for cust_id, similars in lookalikes.items()
])
lookalikes_df.to_csv(output_path, index=False)

# Report the file that was actually written; the name comes from the same
# variable as the write so the message cannot drift from the real path.
print(f"Top 3 Lookalikes for C0001-C0020 saved in {output_path}")

Top 3 Lookalikes for C0001-C0020 saved in Lookalike.csv
