In [1]:
import pandas as pd

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster 

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import plotly.express as px

from Functions import dunn_index
from Functions import standardize

ModuleNotFoundError: No module named 'Functions'

In [None]:
# Read the cleaned dataset from disk and preview its first rows.
clean_csv_path = "./Dataset/Clean.csv"
df = pd.read_csv(clean_csv_path)
df.head()

In [None]:
# Pass the frame through the project's `standardize` helper (imported from
# the local `Functions` module) and rebind `df` to whatever it returns.
# NOTE(review): presumably this rescales every feature column — confirm
# against the helper's implementation.
df = df.pipe(standardize)
df.head()

In [None]:
# Perform hierarchical clustering (Ward linkage on Euclidean distances).
# A later cell adds a 'Cluster' label column to `df`; exclude it here so that
# re-running this cell after labelling does not feed the labels back into the
# clustering. On a fresh top-to-bottom run the column does not exist yet and
# `errors='ignore'` makes the drop a no-op.
linkage_features = df.drop(columns=['Cluster'], errors='ignore')
linkage_matrix = linkage(
    linkage_features,
    method='ward',
    metric='euclidean',
    optimal_ordering=False,
)

# Plot the dendrogram, truncated to the last p=6 merged clusters so the tree
# stays readable.
plt.figure(figsize=(10, 7))
dendrogram(linkage_matrix, truncate_mode='lastp', p=6)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Cluster Size')
plt.ylabel('Distance')
plt.show()

In [None]:
# Flatten the hierarchy into at most `n_clusters` groups. With
# criterion='maxclust' the second argument is a cluster count, not a
# distance threshold.
n_clusters = 4
clusters = fcluster(linkage_matrix, t=n_clusters, criterion='maxclust')

# Store each row's flat-cluster id on the frame as a new 'Cluster' column.
df['Cluster'] = clusters

In [None]:
# Per-cluster mean of every column; reused by the plotting cells below.
cluster_summary = df.groupby('Cluster').mean()

# Print, in order: the first few assigned labels, the size of each cluster,
# and the per-cluster feature means.
for report in (df[['Cluster']].head(), df['Cluster'].value_counts(), cluster_summary):
    print(report)

In [None]:
# Bar chart of the mean BALANCE value for each cluster.
balance_ax = cluster_summary['BALANCE'].plot.bar(
    title='Average Balance by Cluster',
    figsize=(10, 6),
)
balance_ax.set_ylabel('Balance')
plt.show()

In [None]:
# Heatmap of per-cluster feature means: features on the rows, clusters on the
# columns (hence the transpose), with each cell annotated by its value.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cluster_summary.T, cmap='coolwarm', annot=True, ax=heatmap_ax)
heatmap_ax.set_title('Cluster Feature Averages')
plt.show()

In [None]:
# Score the clustering with the project's `dunn_index` helper.
# By this point `df` carries the 'Cluster' label column added earlier; drop it
# so the labels are not treated as an extra feature when the index is
# computed (the PCA cells exclude the column for the same reason).
# NOTE(review): assumes `dunn_index(data, labels)` measures distances over
# every column of `data` — confirm against the helper in `Functions`.
dunn = dunn_index(df.drop(columns=['Cluster']), clusters)
print(f"Dunn index: {dunn}")

In [None]:
# Reduce the feature space to 3 principal components for 3D visualization.
pca = PCA(n_components=3)
# Exclude the 'Cluster' label column so it does not influence the projection.
pca_components = pca.fit_transform(df.drop(columns=['Cluster']))

# Build the component frame on df's own index: the label assignment below
# aligns on index, so a default RangeIndex here would silently produce NaN
# labels whenever df's index is not 0..n-1.
pca_df = pd.DataFrame(pca_components, columns=['PC1', 'PC2', 'PC3'], index=df.index)
pca_df['Cluster'] = df['Cluster']  # Add the cluster labels

In [None]:
# Static 3D scatter of the three principal components, one point per row,
# coloured by cluster label.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
points = ax.scatter(
    pca_df['PC1'],
    pca_df['PC2'],
    pca_df['PC3'],
    c=pca_df['Cluster'],
    cmap='viridis',
    s=100,
    alpha=0.7,
)

# Axis labels, title and a legend built from the scatter's colour mapping.
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('3D PCA of Clusters')
legend_handles, legend_labels = points.legend_elements()
ax.legend(legend_handles, legend_labels, title="Clusters")
plt.show()

In [None]:
# Project the features onto three principal components. The 'Cluster' label
# column is removed first so it does not influence the projection.
pca = PCA(n_components=3)
pca_components = pca.fit_transform(df.drop(columns=['Cluster']))

# Create a DataFrame with PCA components and cluster labels. It is built on
# df's own index because the label assignment aligns on index; a default
# RangeIndex would silently yield NaN labels if df's index is not 0..n-1.
pca_df = pd.DataFrame(pca_components, columns=['PC1', 'PC2', 'PC3'], index=df.index)
pca_df['Cluster'] = df['Cluster']  # Add the cluster labels

# Cluster ids are categories, not magnitudes. Plotly Express maps a numeric
# colour column onto a continuous colour bar, so plot from a copy whose
# labels are strings; `pca_df` itself keeps the original numeric labels.
pca_plot_df = pca_df.assign(Cluster=pca_df['Cluster'].astype(str))
cluster_order = [str(label) for label in sorted(pca_df['Cluster'].unique())]

# Create the interactive 3D scatter plot with one discrete colour per cluster
# and the legend entries sorted by cluster id.
fig = px.scatter_3d(
    pca_plot_df,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Cluster',
    title="3D PCA of Clusters",
    labels={'PC1': 'PC 1', 'PC2': 'PC 2', 'PC3': 'PC 3'},
    category_orders={'Cluster': cluster_order},
)
fig.update_layout(
    width=800,
    height=600,
    autosize=True,
)

fig.show()