# Load the datasets

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Relative locations of the three input CSV files.
customersFilePath, productsFilePath, transactionsFilePath = (
    'Datasets/Customers.csv',
    'Datasets/Products.csv',
    'Datasets/Transactions.csv',
)

In [5]:
# Read each CSV into its own DataFrame (customers, products, transactions — in that order).
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in (customersFilePath, productsFilePath, transactionsFilePath)
)

# Merge datasets and rename columns to avoid conflicts

In [8]:
# Attach customer and product attributes to every transaction row, then give
# the two colliding "Price" columns explicit names.
# NOTE(review): the rename relies on pandas' default "_x"/"_y" merge suffixes,
# i.e. it presumes both the left side and Products carry a "Price" column — confirm.
merged_df = (
    transactions_df
    .merge(customers_df, on="CustomerID")
    .merge(products_df, on="ProductID")
    .rename(columns={"Price_x": "TransactionPrice", "Price_y": "ProductPrice"})
)

# Feature Engineering: Aggregate purchase history

In [9]:
# One summary row per customer; CustomerID becomes the first column after reset_index().
# NOTE(review): "TransactionPrice" is summed as-is. If that column holds a unit
# price rather than a line total, the sum is not the total amount spent — confirm
# against the Transactions schema.
purchase_aggregations = {
    "TransactionPrice": "sum",   # sum of the transaction price column
    "ProductID": "nunique",      # number of distinct products purchased
    "Quantity": "sum",           # total quantity bought
}
purchases_by_customer = merged_df.groupby("CustomerID")
customer_features = purchases_by_customer.agg(purchase_aggregations).reset_index()

# Standardize features (zero mean, unit variance)

In [10]:
# Scale every column after the first; the first column (the CustomerID
# identifier) is left out of the feature matrix.
numeric_features = customer_features.iloc[:, 1:]
scaler = StandardScaler()
customer_scaled = scaler.fit_transform(numeric_features)

# Compute Similarity Matrix


In [11]:
# Keep the ID list in the same row order as customer_features so that row
# positions in the similarity matrix can be translated back to customer IDs.
customer_ids = customer_features["CustomerID"].to_list()
# Pairwise cosine similarity between all scaled customer feature rows.
similarity_matrix = cosine_similarity(customer_scaled)

In [12]:
# Function to get top-N similar customers
def get_top_similar(customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Reads the module-level ``customer_ids`` list and ``similarity_matrix``;
    row/column ``i`` of the matrix corresponds to ``customer_ids[i]``.

    Args:
        customer_id: ID of the customer to find lookalikes for.
        top_n: Maximum number of lookalikes to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of ``(customer_id, score)`` tuples ordered by descending
        similarity, with each score rounded to 4 decimals as a plain float.
        The queried customer itself is never included.

    Raises:
        ValueError: If ``customer_id`` is not present in ``customer_ids``.
    """
    idx = customer_ids.index(customer_id)

    # Drop the customer's own entry by position rather than assuming it sorts
    # first: another customer can tie with (or, through floating-point error,
    # exceed) the self-similarity score, in which case slicing off the first
    # element would discard a real match and keep the customer itself.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    limit = max(top_n, 0)
    # float() keeps NumPy scalar reprs out of the stringified CSV output.
    return [
        (customer_ids[other_idx], round(float(score), 4))
        for other_idx, score in candidates[:limit]
    ]

# Generate Lookalike.csv (top 3 lookalikes for the first 20 customers)

In [15]:

# Look up the nearest customers for the first 20 customer IDs only.
lookalike_dict = {}
for cust in customer_ids[:20]:
    lookalike_dict[cust] = get_top_similar(cust)

# One row per customer: the ID and its list of (similar_id, score) pairs.
lookalike_rows = list(lookalike_dict.items())
lookalike_df = pd.DataFrame(lookalike_rows, columns=["CustomerID", "SimilarCustomers"])
lookalike_df.to_csv("Mrigaank_Jaswal_Lookalike.csv", index=False)

In [16]:
# Show the first rows of the lookalike table as a quick sanity check
lookalike_preview = lookalike_df.head()
print(lookalike_preview)

  CustomerID                                   SimilarCustomers
0      C0001  [(C0191, 0.9782), (C0137, 0.972), (C0069, 0.95...
1      C0002   [(C0029, 1.0), (C0031, 0.9945), (C0035, 0.9944)]
2      C0003  [(C0010, 0.95), (C0176, 0.9397), (C0027, 0.9285)]
3      C0004  [(C0105, 0.9985), (C0057, 0.9978), (C0109, 0.9...
4      C0005  [(C0058, 0.9998), (C0123, 0.9996), (C0128, 0.9...
