In [2]:
!pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.1 MB 3.3 MB/s eta 0:00:04
   ----- ---------------------------------- 1.6/11.1 MB 2.9 MB/s eta 0:00:04
   ------ --------------------------------- 1.8/11.1 MB 2.8 MB/s eta 0:00:04
   --------- ------------------------------ 2.6/11.1 MB 2.7 MB/s eta 0:00:04
   -------------- ------------------------- 3.9/11.1 MB 3.4 MB/s eta 0:00:03
   

In [3]:
# Importing libraries for data analysis and model development.
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [4]:
# Read each source CSV (paths relative to the working directory) into its own DataFrame.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)


In [5]:
# Enrich every transaction row with its customer's columns, then its product's columns.
# Left joins keep all transactions even when a customer/product lookup finds no match.
data = pd.merge(
    pd.merge(transactions, customers, how="left", on="CustomerID"),
    products,
    how="left",
    on="ProductID",
)


In [6]:
# Collapse the transaction history into one row of summary features per customer.
customer_features = (
    data.groupby("CustomerID")
    .agg(
        TotalSpend=("TotalValue", "sum"),
        AvgSpend=("TotalValue", "mean"),
        TotalQuantity=("Quantity", "sum"),
        UniqueProducts=("ProductID", "nunique"),  # Count of distinct products bought.
    )
    .reset_index()
)


In [7]:
# Attach the aggregated purchase features to the customer table.
# A left join keeps every customer, including those without any transactions.
customer_data = pd.merge(
    left=customers,
    right=customer_features,
    how="left",
    on="CustomerID",
)


In [8]:
# Customers without transactions come out of the left merge with NaN in the
# aggregated columns; treat that as zero activity.
# Only the aggregated columns are filled, so a blanket 0 is not written into
# other customer columns that happen to contain missing values, and the frame
# is rebound rather than mutated in place.
customer_data = customer_data.fillna(
    {"TotalSpend": 0, "AvgSpend": 0, "TotalQuantity": 0, "UniqueProducts": 0}
)


In [9]:
# Customer-level columns that feed the similarity computation.
features = [
    "TotalSpend",
    "AvgSpend",
    "TotalQuantity",
    "UniqueProducts",
]


In [10]:
# Rescale the selected feature columns with StandardScaler and write the
# standardized values back over the originals in customer_data.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_data[features])
customer_data[features] = scaled_values


In [11]:
# Pairwise cosine similarity over the feature columns, labelled on both axes
# by CustomerID so scores can be looked up by customer.
similarity_matrix = cosine_similarity(customer_data[features])
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_data["CustomerID"],
    columns=customer_data["CustomerID"],
)


In [12]:
# Function to get the top-n most similar customers for each customer.
def get_top_n_similar(customers_df, n=3):
    """Return the ``n`` highest-scoring other customers for every row.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Similarity scores. Each row label is the customer being looked up and
        each column label is a candidate customer.
    n : int, default 3
        Number of lookalikes to keep per customer.

    Returns
    -------
    dict
        Maps each row label to a list of ``(candidate_label, score)`` tuples
        ordered from highest to lowest score. The customer's own column is
        never included.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    lookalikes = {}
    for customer_id in customers_df.index:
        scores = customers_df.loc[customer_id]
        # Exclude the customer by label instead of assuming it sorts first:
        # another customer can tie with (or, through floating-point rounding,
        # exceed) the self-similarity score, in which case skipping position 0
        # would drop a real neighbour and keep the customer itself.
        candidates = scores[scores.index != customer_id]
        top = candidates.sort_values(ascending=False).iloc[:n]
        lookalikes[customer_id] = list(zip(top.index, top.values))
    return lookalikes

In [13]:
# Top 3 lookalikes for the rows labelled C0001 through C0020 (label slice, inclusive).
lookalikes = get_top_n_similar(
    customers_df=similarity_df.loc["C0001":"C0020"],
    n=3,
)


In [14]:
# Create Lookalike.csv file: one row per customer with its list of
# (lookalike_id, score) pairs.
# Scores may be NumPy scalars; cast them to built-in floats so the stringified
# list written to the CSV holds plain numbers (NumPy >= 2 renders its scalars
# as ``np.float64(...)`` inside a list's string form).
lookalike_records = [
    {
        "CustomerID": cust_id,
        "Lookalikes": [(other_id, float(score)) for other_id, score in similar_list],
    }
    for cust_id, similar_list in lookalikes.items()
]
# Explicit columns keep the header stable even if there are no records.
lookalike_df = pd.DataFrame(lookalike_records, columns=["CustomerID", "Lookalikes"])
lookalike_df.to_csv("Lookalike.csv", index=False)


In [15]:
# Print a heading followed by the first rows of the lookalike table as a quick sanity check.
print("Sample Lookalike Recommendations:", lookalike_df.head(), sep="\n")

Sample Lookalike Recommendations:
  CustomerID                                         Lookalikes
0      C0001  [(C0137, 0.9468620790818009), (C0164, 0.945373...
1      C0002  [(C0029, 0.9995348079416365), (C0031, 0.997777...
2      C0003  [(C0027, 0.8303038396808775), (C0176, 0.824671...
3      C0004  [(C0075, 0.9975376587528684), (C0195, 0.993661...
4      C0005  [(C0123, 0.9986331212213941), (C0063, 0.998165...
