# In [None]:

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import seaborn as sns
import matplotlib.pyplot as plt

# Read the two input tables (paths are relative to the working directory)
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Attach customer attributes to each transaction row. The join is an inner
# join on CustomerID, so rows whose CustomerID is missing from either table
# are dropped.
merged_df = pd.merge(transactions_df, customers_df, how='inner', on='CustomerID')

# Collapse the transaction rows to one row per customer:
# total spend (TotalValue) and total units (Quantity).
customer_summary = merged_df.groupby('CustomerID', as_index=False).agg(
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
)

# Bring in each customer's Region and expand it into indicator columns.
# drop_first=True omits the indicator for the first Region level, which then
# acts as the implicit baseline.
# NOTE(review): with a distance-based model the baseline region is not
# equidistant from the others - confirm that dropping a level is intended.
region_lookup = customers_df[['CustomerID', 'Region']]
customer_summary = pd.merge(customer_summary, region_lookup, on='CustomerID')
customer_summary = pd.get_dummies(
    customer_summary,
    columns=['Region'],
    drop_first=True,
)

# Standardise the model inputs: every column except the CustomerID identifier
# is passed through StandardScaler so that all features share a common scale.
feature_frame = customer_summary.drop(columns=['CustomerID'])
scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_frame)

# Sweep candidate cluster counts. Inertia is recorded only for the elbow plot;
# the Davies-Bouldin index (lower is better) is what actually selects k.
# The sweep is capped at n_samples - 1 because davies_bouldin_score requires
# fewer clusters than samples; without the cap a small dataset would fail
# part-way through the loop with an opaque scikit-learn error.
n_samples = scaled_data.shape[0]
max_k = min(10, n_samples - 1)
if max_k < 2:
    raise ValueError(
        'Need at least 3 customers to compare cluster counts; '
        f'got {n_samples}.'
    )
k_values = list(range(2, max_k + 1))

inertia = []
db_index_scores = []
fitted_models = {}  # k -> fitted KMeans, so the winning model is not refit
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)
    db_index_scores.append(davies_bouldin_score(scaled_data, kmeans.labels_))
    fitted_models[k] = kmeans

# Plot the elbow curve (diagnostic only; it does not drive the choice of k)
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, marker='o')
plt.title('Elbow Method for Optimal Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.grid()
plt.show()

# Pick the k with the lowest Davies-Bouldin index. list.index returns the
# first minimum, so ties resolve to the smallest k. Looking k up in k_values
# avoids a hard-coded offset that must match the start of the sweep.
best_position = db_index_scores.index(min(db_index_scores))
optimal_k = k_values[best_position]

# Reuse the model fitted during the sweep: with the same data, random_state
# and n_init a refit would reproduce the same labels, so this only saves work.
final_kmeans = fitted_models[optimal_k]
customer_summary['Cluster'] = final_kmeans.labels_

# Write the per-customer table, including the Cluster column, to disk.
# index=False keeps the DataFrame's row index out of the file.
output_path = 'Harpartap_Singh_Clustering.csv'
customer_summary.to_csv(output_path, index=False)

# Scatter total spend against total quantity, coloured by assigned cluster.
# Cluster is a numeric column, so seaborn's default legend='auto' may show
# only a sample of evenly spaced values instead of every cluster id;
# legend='full' forces one legend entry per cluster.
sns.scatterplot(
    data=customer_summary,
    x='TotalValue',
    y='Quantity',
    hue='Cluster',
    palette='viridis',
    legend='full',
)
plt.title('Customer Segmentation')
plt.xlabel('Total Value of Purchases')
plt.ylabel('Total Quantity Purchased')
plt.show()
