# Sanket Aasabe Clustering

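Customer segmentation using K-Means clustering on each customer's region, total transaction value, and total quantity, with the number of clusters chosen by the lowest Davies-Bouldin index.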

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA

## Load and Preprocess Data

In [None]:
# Read both input tables from CSV files (paths are relative to the working directory).
customers_df = pd.read_csv('Customers.csv')

# Parse the transaction timestamp column into datetimes as part of the load.
transactions_df = pd.read_csv('Transactions.csv').assign(
    TransactionDate=lambda frame: pd.to_datetime(frame['TransactionDate'])
)

## Perform Customer Segmentation

In [None]:
# Join customer attributes onto every transaction (left join keeps all transactions),
# then collapse to one row per customer.
merged_df = pd.merge(transactions_df, customers_df, how='left', on='CustomerID')
customer_summary = (
    merged_df.groupby('CustomerID')
    .agg(
        Region=('Region', 'first'),
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
    )
    .reset_index()
)

# Feature matrix: one-hot region indicators followed by the two numeric totals.
encoded_df = pd.concat(
    [
        pd.get_dummies(customer_summary[['Region']]),
        customer_summary[['TotalValue', 'Quantity']],
    ],
    axis=1,
)

# Standardize every feature column before clustering.
scaler = StandardScaler()
normalized_data = scaler.fit_transform(encoded_df)

# Perform clustering
# Fit one K-Means model for each candidate k in 2..10 and score it with the
# Davies-Bouldin index (lower is better) on the standardized features.
k_values = range(2, 11)
db_scores = []
kmeans_models = {}

for k in k_values:
    # n_init is pinned explicitly: the library default changed from 10 to 'auto'
    # in scikit-learn 1.4 (and emits a FutureWarning on 1.2/1.3), so leaving it
    # implicit makes the selected clustering depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(normalized_data)
    kmeans_models[k] = kmeans
    db_scores.append(davies_bouldin_score(normalized_data, kmeans.labels_))

# Optimal number of clusters
# db_scores is parallel to k_values; index() returns the first minimum, so ties
# resolve to the smallest k.
optimal_k = k_values[db_scores.index(min(db_scores))]
optimal_model = kmeans_models[optimal_k]

# Labels follow the row order of normalized_data, which was built from
# customer_summary, so they can be attached positionally.
customer_summary['Cluster'] = optimal_model.labels_

# Project the standardized features onto two principal components for a 2-D view.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(normalized_data)

# Scatter the projected customers, coloured by their assigned cluster.
plt.figure(figsize=(10, 7))
sns.scatterplot(
    x=pca_data[:, 0],
    y=pca_data[:, 1],
    hue=customer_summary['Cluster'],
    palette="tab10",
    s=50,
)
plt.gca().set(
    title=f"Customer Clusters (Optimal K = {optimal_k})",
    xlabel="PCA Component 1",
    ylabel="PCA Component 2",
)
# Anchor the legend's upper-left corner just right of the axes so it does not cover points.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), title="Cluster")
plt.show()

# Write the customer-to-cluster assignment and report the chosen k with its score.
customer_summary.to_csv(
    'Sanket_Aasabe_Clustering.csv',
    columns=['CustomerID', 'Cluster'],
    index=False,
)
print(f"Customer Segmentation completed. Optimal Clusters: {optimal_k}, DB Index: {min(db_scores)}")