In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load data
customer_data = pd.read_csv("Customers.csv")
transaction_data = pd.read_csv("Transactions.csv")
product_data = pd.read_csv("Products.csv")

In [6]:
# Merge transaction and product data
transaction_product_data = pd.merge(transaction_data, product_data, on="ProductID", how="left")

In [7]:
# Aggregate transaction data per customer
customer_transaction_summary = transaction_product_data.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "Quantity": "sum"
}).reset_index()


In [8]:
# Merge customer data with transaction summary
customer_profile = pd.merge(customer_data, customer_transaction_summary, on="CustomerID", how="left")

In [9]:

# Fill NaN values for customers without transactions
customer_profile["TotalValue"].fillna(0, inplace=True)
customer_profile["Quantity"].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_profile["TotalValue"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_profile["Quantity"].fillna(0, inplace=True)


In [10]:
# Standardize numerical features
scaler = StandardScaler()
customer_profile[["TotalValue", "Quantity"]] = scaler.fit_transform(customer_profile[["TotalValue", "Quantity"]])

In [11]:
# Perform KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=42)
customer_profile['Cluster'] = kmeans.fit_predict(customer_profile[["TotalValue", "Quantity"]])

In [15]:
# Compute cosine similarity within clusters
# Compute cosine similarity within clusters
similarity_map = {}
for cluster in customer_profile['Cluster'].unique():
    cluster_data = customer_profile[customer_profile['Cluster'] == cluster]
    if len(cluster_data) > 1:
        # Compute cosine similarity for the cluster
        tfidf_matrix = cosine_similarity(cluster_data[["TotalValue", "Quantity"]])

        # Reset index to align DataFrame indices with similarity matrix
        cluster_data = cluster_data.reset_index()

        for idx, row in cluster_data.iterrows():
            # Get similarity scores for the current customer
            similarity_scores = list(enumerate(tfidf_matrix[idx]))
            similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

            # Get the top 3 similar customers (excluding the customer itself)
            similar_customers = [
                (cluster_data.iloc[i[0]]["CustomerID"], i[1])
                for i in similarity_scores[1:4]  # Exclude self (top 3)
            ]
            similarity_map[row["CustomerID"]] = similar_customers


In [16]:
# Generate Lookalike.csv
lookalike_df = pd.DataFrame({
    "CustomerID": similarity_map.keys(),
    "Lookalikes": [str(v) for v in similarity_map.values()]
})

In [17]:
lookalike_df.to_csv("Lookalike.csv", index=False)

Lookalike.csv generated successfully!


#1. Accuracy

In [18]:
mean_similarity = []
for customer, recommendations in similarity_map.items():
    similarities = [rec[1] for rec in recommendations]  # Extract scores
    mean_similarity.append(sum(similarities) / len(similarities))
print("Mean Similarity Score:", sum(mean_similarity) / len(mean_similarity))


Mean Similarity Score: 0.9955880031782885


#2. Clustering Performance
Assess the quality of KMeans clustering using metrics like inertia or silhouette score:

In [19]:
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(customer_profile[["TotalValue", "Quantity"]], customer_profile["Cluster"])
print("Silhouette Score:", silhouette_avg)


Silhouette Score: 0.4315131216122971
