In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Directory that holds the three input CSV files. Change this single value to
# run the notebook against data stored somewhere else.
DATA_DIR = '/content'

# Full paths to each input table, derived from DATA_DIR.
customers_path = f'{DATA_DIR}/Customers.csv'
transactions_path = f'{DATA_DIR}/Transactions.csv'
products_path = f'{DATA_DIR}/Products.csv'

In [None]:
# Load the three raw tables (customers, transactions, products) from the
# configured CSV paths, in that order.
customers_df, transactions_df, products_df = (
    pd.read_csv(csv_path)
    for csv_path in (customers_path, transactions_path, products_path)
)

Merge the data in two steps: (1) join transactions with products to get the product details for each transaction, and (2) join the result with customers to attach each customer's demographic data.


In [None]:
# Step 1: attach product attributes to every transaction row
# (default inner join on the shared ProductID key).
transactions_products = transactions_df.merge(products_df, on="ProductID")

# Step 2: attach the purchasing customer's attributes to each enriched
# transaction row (default inner join on the shared CustomerID key).
customer_transactions = transactions_products.merge(customers_df, on="CustomerID")

In [None]:
def _most_common(values):
    """Return the most frequent value in ``values``, or None if there is none.

    ``Series.mode`` is evaluated once (the previous inline lambda computed it
    twice per group). It returns the modes in sorted order, so ties resolve to
    the first value in that ordering. An empty result (e.g. a group whose
    values are all missing) yields None.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# Collapse the transaction-level table to one row per customer:
#   total_spent          - sum of TotalValue over the customer's transactions
#   total_transactions   - number of transactions
#   avg_purchase_value   - mean TotalValue per transaction
#   most_common_category - most frequently purchased product Category
#   region / signup_date - first value seen in the customer's rows
customer_profiles = customer_transactions.groupby("CustomerID").agg(
    total_spent=("TotalValue", "sum"),
    total_transactions=("TransactionID", "count"),
    avg_purchase_value=("TotalValue", "mean"),
    most_common_category=("Category", _most_common),
    region=("Region", "first"),
    signup_date=("SignupDate", "first")
).reset_index()

In [None]:
# Bring the numeric profile columns onto a common scale with a default
# MinMaxScaler, so that no single column dominates the similarity
# computation purely because of its units.
numeric_features = ["total_spent", "total_transactions", "avg_purchase_value"]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_profiles[numeric_features])
customer_profiles[numeric_features] = scaled_values


In [None]:
# One-hot encode the categorical profile columns.
# Every level is kept (no drop_first): dropping a level would encode customers
# in that category/region as all zeros, so two customers sharing the dropped
# level would get no similarity credit for the match while every other level
# does. drop_first is meant for regression-style models, not for similarity.
# dtype=float keeps the indicator columns numeric alongside the scaled features.
encoded_profiles = pd.get_dummies(
    customer_profiles,
    columns=["most_common_category", "region"],
    dtype=float,
)

# Use everything except the identifier and the raw signup date as features.
feature_columns = [col for col in encoded_profiles.columns if col not in ["CustomerID", "signup_date"]]

# Pairwise cosine similarity; row/column i corresponds to row i of customer_profiles.
similarity_matrix = cosine_similarity(encoded_profiles[feature_columns])

In [None]:
# How many customers (taken from the top of customer_profiles) to produce
# recommendations for, and how many lookalikes to keep for each of them.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

customer_ids = customer_profiles["CustomerID"]
lookalike_map = {}

for i, customer_id in enumerate(customer_ids.iloc[:N_TARGET_CUSTOMERS]):
    # Drop the customer's own entry by position rather than assuming it sorts
    # first: when another customer ties for the highest score, the old
    # "skip index 0" slice could return the customer as their own lookalike.
    candidates = [
        (j, score)
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # similarity_matrix positions line up with customer_profiles rows, so look
    # the IDs up positionally (.iloc) instead of relying on the index labels.
    top_matches = [(customer_ids.iloc[j], score) for j, score in candidates[:TOP_K]]
    lookalike_map[customer_id] = top_matches

In [None]:
# Flatten the {customer: [(similar customer, score), ...]} mapping into one
# record per (customer, lookalike) pair.
lookalike_data = [
    {
        "CustomerID": cust_id,
        "SimilarCustomerID": similar_cust_id,
        "SimilarityScore": score,
    }
    for cust_id, lookalikes in lookalike_map.items()
    for similar_cust_id, score in lookalikes
]

# Persist the recommendations without the DataFrame index column.
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Lookalike.csv", index=False)

# Show the output file name followed by the first and last rows as a sanity check.
for preview in ("Lookalike.csv", lookalike_df.head(), lookalike_df.tail()):
    print(preview)

Lookalike.csv
  CustomerID SimilarCustomerID  SimilarityScore
0      C0001             C0190         0.999128
1      C0001             C0048         0.998555
2      C0001             C0039         0.997123
3      C0002             C0088         0.995626
4      C0002             C0134         0.994023
   CustomerID SimilarCustomerID  SimilarityScore
55      C0019             C0121         0.978723
56      C0019             C0017         0.970967
57      C0020             C0050         0.983987
58      C0020             C0026         0.970476
59      C0020             C0035         0.966214


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# `files` was never imported anywhere in this notebook, so this cell failed
# with NameError on a fresh kernel. Import Colab's helper here, and skip the
# browser download when the notebook is not running on Colab.
try:
    from google.colab import files
except ImportError:
    # Outside Colab there is nothing to trigger; Lookalike.csv was written
    # to the working directory by the export cell.
    files = None

if files is not None:
    files.download("Lookalike.csv")