In [57]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler



In [58]:
# Read the three input tables from the working directory, one DataFrame per file.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)


In [59]:
# One row per transaction, enriched first with the buying customer's columns and
# then with the purchased product's columns. `merge` defaults to an inner join,
# so transactions whose CustomerID / ProductID has no match are dropped.
data = (
    transactions
    .merge(customers, on="CustomerID")
    .merge(products, on="ProductID")
)


In [60]:
# Collapse the transaction-level table to one row per customer:
#   TotalPurchaseValue  - sum of TotalValue over the customer's transactions
#   TransactionCount    - number of (non-null) TransactionID values
#   CategoryPreferences - dict mapping each purchased Category to its frequency
agg_features = (
    data.groupby("CustomerID")
    .agg(
        TotalPurchaseValue=("TotalValue", "sum"),
        TransactionCount=("TransactionID", "count"),
        CategoryPreferences=(
            "Category",
            lambda categories: categories.value_counts().to_dict(),
        ),
    )
    .reset_index()
)


In [61]:
# Standardise the two numeric per-customer features (zero mean, unit variance)
# so that neither dominates the similarity computation purely because of scale.
numeric_features = agg_features[["TotalPurchaseValue", "TransactionCount"]]
scaler = StandardScaler().fit(numeric_features)
scaled_features = scaler.transform(numeric_features)


In [62]:
# Pairwise cosine similarity between every pair of customers.
# Row i / column j holds the similarity of customer i to customer j, with rows
# in the same order as `agg_features`.
similarity_matrix = cosine_similarity(scaled_features, scaled_features)

In [63]:
# Build the lookalike map: for each of the first N_TARGET_CUSTOMERS customers
# (in `agg_features` row order), the N_LOOKALIKES most similar *other*
# customers together with their cosine-similarity scores.
N_TARGET_CUSTOMERS = 20
N_LOOKALIKES = 3

customer_ids = agg_features["CustomerID"]  # hoisted: avoids building a row Series per lookup

lookalikes = {}
for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    similarity_scores = similarity_matrix[idx]

    # Rank every customer from most to least similar. The stable sort keeps
    # tied scores in row order, which makes the output deterministic.
    ranked = np.argsort(-similarity_scores, kind="stable")

    # Remove the customer's own row explicitly. Slicing off "the last element"
    # of an ascending sort is not safe: other customers can tie with (or, after
    # floating-point rounding, marginally exceed) the self-similarity score, in
    # which case the customer would be listed as their own lookalike and a real
    # match would be dropped.
    top_indices = ranked[ranked != idx][:N_LOOKALIKES]

    # Store scores as plain Python floats so that str()/repr() of the list
    # serialises as numbers regardless of the installed NumPy version.
    lookalikes[customer_id] = [
        (customer_ids.iloc[i], float(similarity_scores[i])) for i in top_indices
    ]


In [64]:
# Write the lookalike map to CSV: one row per target customer, with the list of
# (lookalike id, score) pairs stored as its string representation.
lookalike_rows = [
    (cust_id, str(matches)) for cust_id, matches in lookalikes.items()
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=["cust_id", "lookalikes"])
lookalike_df.to_csv("Balkrishna_Joshi_Lookalike.csv", index=False)
