# In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import davies_bouldin_score

# Customer segmentation script: loads customer/transaction CSVs, builds one
# feature row per customer, picks the KMeans cluster count with the lowest
# Davies-Bouldin index, plots the result and writes the labelled profiles to
# 'Customer_Segmentation_Results.csv'.

# Step 1: Load customer and transaction data
customers = pd.read_csv('Customers.csv', encoding='utf-8')
transactions = pd.read_csv('Transactions.csv', encoding='utf-8')

# Step 2: Data Preprocessing
# Aggregate transaction data by CustomerID: total spending and product
# diversity (number of unique products bought).
transaction_summary = transactions.groupby('CustomerID').agg({
    'Amount': 'sum',  # Total spending for each customer
    'ProductID': 'nunique',  # Number of unique products bought
}).reset_index()

# Step 3: Merge customer profile data with transaction data
# Assuming 'Customers.csv' contains CustomerID, Age, Gender, and Location.
# Gender and Location travel through the (inner) merge so that the one-hot
# columns built in Step 4 come from exactly the rows that survive it. Encoding
# the full `customers` frame separately and concatenating on the index would
# misalign rows (and add all-NaN rows) whenever a customer has no transactions.
customer_profiles = pd.merge(
    customers[['CustomerID', 'Age', 'Gender', 'Location']],
    transaction_summary,
    on='CustomerID',
)

# Step 4: Feature Engineering
# One-hot encode the categorical features of the merged rows.
customers_encoded = pd.get_dummies(customer_profiles[['Gender', 'Location']])

# Combine encoded features with numeric features (Age, Amount, ProductID).
# Both frames share the merged frame's index, so the column-wise concat is
# row-aligned by construction.
customer_profiles = pd.concat(
    [customer_profiles.drop(columns=['Gender', 'Location']), customers_encoded],
    axis=1,
)

# Step 5: Data Standardization
# Standardizing the data to bring all features to the same scale.
# CustomerID is an identifier, not a feature, so it is excluded.
scaler = StandardScaler()
scaled_profiles = scaler.fit_transform(customer_profiles.drop(columns='CustomerID'))

# Step 6: Clustering with KMeans
# Try k = 2..10, but never more than n_samples - 1 clusters: the
# Davies-Bouldin and silhouette scores need at least 2 and at most
# n_samples - 1 distinct labels.
max_k = min(10, len(customer_profiles) - 1)
if max_k < 2:
    raise ValueError(
        "Need at least 3 customers with transactions to cluster; "
        f"got {len(customer_profiles)}."
    )
db_scores = []
k_range = range(2, max_k + 1)

# Iterate over the range of clusters
for k in k_range:
    # n_init is pinned because its default differs between scikit-learn
    # releases; fixing it keeps results comparable across versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_profiles)
    cluster_labels = kmeans.labels_

    # Calculate Davies-Bouldin Index for the current clustering
    db_index = davies_bouldin_score(scaled_profiles, cluster_labels)
    db_scores.append(db_index)

# Step 7: Visualize Davies-Bouldin Index for each k
plt.figure(figsize=(10, 6))
plt.plot(k_range, db_scores, marker='o', linestyle='--', color='b')
plt.title("Davies-Bouldin Index for Different Numbers of Clusters")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Davies-Bouldin Index")
plt.xticks(k_range)
plt.grid(True)
plt.show()

# Step 8: Select the best k based on DB Index (lower is better)
best_k = k_range[int(np.argmin(db_scores))]
print(f"Optimal number of clusters based on DB Index: {best_k}")

# Step 9: Perform clustering with the optimal number of clusters
kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_profiles)

# Step 10: Visualizing the clusters using PCA (2D projection)
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_profiles)

plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_components[:, 0], y=pca_components[:, 1], hue=cluster_labels, palette='viridis', s=100, edgecolor='k')
plt.title(f"Customer Segmentation with {best_k} Clusters")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend(title='Cluster')
plt.show()

# Step 11: Saving the clustering results
# cluster_labels is positional and has one entry per row of customer_profiles.
# CustomerID already comes from the merge; re-assigning it from `customers`
# would re-align by index label and could attach the wrong IDs.
customer_profiles['Cluster'] = cluster_labels
customer_profiles.to_csv('Customer_Segmentation_Results.csv', index=False)

# Step 12: Print a sample of the clustered data
# An explicit print is required: a bare expression in the middle of a
# script or cell is evaluated but never displayed.
print(customer_profiles.head())

# Step 13: Additional Metrics (Optional)
# Calculate other metrics like silhouette score (optional)
from sklearn.metrics import silhouette_score
sil_score = silhouette_score(scaled_profiles, cluster_labels)
print(f"Silhouette Score for the clustering: {sil_score:.3f}")
