In [None]:
import os
import warnings
warnings.filterwarnings("ignore")
# ------------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# ------------------------------------------------------------------
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# ------------------------------------------------------------------
from tabulate import tabulate

In [None]:
# Location of the preprocessed Mall Customers CSV. The MALL_CUSTOMERS_CSV
# environment variable overrides it so the notebook is not tied to one
# machine; the fallback is the path the notebook originally used.
DATA_PATH = os.environ.get(
    "MALL_CUSTOMERS_CSV",
    "C:/Users/Hali/Documents/Python/Mall_Customers_preprocessed.csv",
)
df = pd.read_csv(DATA_PATH)

# Feature matrix: every column except the "customerid" column.
X = df.drop(columns=["customerid"])

In [None]:
# Drop the "customerid" column; `data` keeps the result and `X` is an
# independent copy of it.
data = df.drop("customerid", axis="columns")
X = data.copy()

In [None]:
# Show the dataset name, then the first rows of X rendered as a text grid.
print("Mall_Customers_preprocessed")
preview_grid = tabulate(X.head(), tablefmt='grid', headers='keys')
print(preview_grid)

In [None]:
# Hierarchical linkage matrix over X using Ward's method.
linked = linkage(X, 'ward')

In [None]:
# Draw the dendrogram of `linked`, showing the distances between clusters.
fig, ax = plt.subplots(figsize=(12, 6))
dendrogram(
    linked,
    orientation='top',
    distance_sort='descending',
    show_leaf_counts=False,
    ax=ax,
)
ax.set_title("Dendrogram - Agglomerative")
ax.set_xlabel("CustomerID")
ax.set_ylabel("Distance")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Run Agglomerative clustering with Ward linkage.
# Original author's note: the number of clusters has already been optimized.
agg = AgglomerativeClustering(linkage='ward', n_clusters=6)
agg.fit(X)
agg_labels = agg.labels_

In [None]:
# Cluster count and internal validation scores for the Agglomerative labels.
n_clusters_agg = np.unique(agg_labels).size
silhouette_agg = silhouette_score(X, agg_labels)
davies_bouldin_agg = davies_bouldin_score(X, agg_labels)
calinski_harabasz_agg = calinski_harabasz_score(X, agg_labels)

# Emit the report as one multi-line message (same text, one line per entry).
agg_report = [
    "Agglomerative:",
    f"- Số lượng cụm: {n_clusters_agg}",
    f"- Silhouette Score: {silhouette_agg:.3f}",
    f"- Davies Bouldin: {davies_bouldin_agg:.3f}",
    f"- Calinski Harabaz: {calinski_harabasz_agg:.3f}",
]
print("\n".join(agg_report))

In [None]:
# Reduce the dimensionality of X to 2 components.
pca = PCA(2)
X_pca = pca.fit_transform(X)

In [None]:
# Scatter the first two columns of X_pca, coloured by Agglomerative label.
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=agg_labels,
    cmap='rainbow',
    s=50,
)
ax.set_title("Agglomerative")
ax.grid(True)
fig.colorbar(scatter, ax=ax, label='Cluster')
fig.tight_layout()
plt.show()

In [None]:
# Run Affinity Propagation with its default hyperparameters. random_state is
# pinned so the exemplars and labels are reproducible across kernel restarts.
ap = AffinityPropagation(random_state=0)
ap_labels = ap.fit_predict(X)

# Warnings are silenced globally in this notebook, so a convergence failure
# would otherwise go unnoticed until a later metric call fails: stop here
# with an explicit message when no exemplar was found.
if len(ap.cluster_centers_indices_) == 0:
    raise RuntimeError(
        "Affinity Propagation found no exemplars (it likely did not converge); "
        "try adjusting damping, preference or max_iter."
    )

In [None]:
# Cluster count and internal validation scores for the Affinity Propagation labels.
n_clusters_ap = np.unique(ap_labels).size
silhouette_ap = silhouette_score(X, ap_labels)
davies_bouldin_ap = davies_bouldin_score(X, ap_labels)
calinski_harabasz_ap = calinski_harabasz_score(X, ap_labels)

In [None]:
# Print the Affinity Propagation report, one entry per line.
ap_report = (
    "Affinity Propagation:",
    f"- Số lượng cụm: {n_clusters_ap}",
    f"- Silhouette Score: {silhouette_ap:.3f}",
    f"- Davies Bouldin: {davies_bouldin_ap:.3f}",
    f"- Calinski Harabaz: {calinski_harabasz_ap:.3f}",
)
print(*ap_report, sep="\n")

In [None]:
# Reduce the dimensionality of X to 2 components for plotting.
projector = PCA(n_components=2)
X_2d = projector.fit_transform(X)

# Labels and exemplar indices read from the fitted Affinity Propagation model.
exemplars = ap.cluster_centers_indices_
labels = ap.labels_

In [None]:
# Draw clusters, exemplars, legend and title in ONE cell on one explicit Axes.
# With the inline backend each cell's figure is rendered and closed when the
# cell finishes, so splitting these calls across cells put the exemplars and
# the legend/title on fresh, separate figures.
fig, ax = plt.subplots(figsize=(15, 6))

# np.unique yields the cluster ids in sorted order, so the legend entries
# appear in a stable order from run to run.
for cluster in np.unique(labels):
    in_cluster = labels == cluster
    ax.scatter(
        X_2d[in_cluster, 0],
        X_2d[in_cluster, 1],
        label=f"Cluster {cluster}",
    )

# Overlay the points indexed by `exemplars`. "x" is an unfilled marker, for
# which matplotlib ignores edgecolors, so only the marker colour is given.
ax.scatter(
    X_2d[exemplars, 0],
    X_2d[exemplars, 1],
    c="black",
    marker="x",
    s=250,
    label="Exemplars",
)

ax.legend()
ax.set_title("Affinity Propagation")
plt.show()

In [None]:
# Comparison table: one row per algorithm, scores rounded to 3 decimals.
# The Agglomerative cluster count comes from n_clusters_agg (derived from the
# fitted labels) instead of a hard-coded literal, so the table cannot
# disagree with the model that was actually fitted.
results = pd.DataFrame([
    {
        "Thuật toán": "Affinity Propagation",
        "Số cụm": n_clusters_ap,
        "Silhouette": round(silhouette_ap, 3),
        "Davies-Bouldin": round(davies_bouldin_ap, 3),
        "Calinski-Harabasz": round(calinski_harabasz_ap, 3)
    },
    {
        "Thuật toán": "Agglomerative",
        "Số cụm": n_clusters_agg,
        "Silhouette": round(silhouette_agg, 3),
        "Davies-Bouldin": round(davies_bouldin_agg, 3),
        "Calinski-Harabasz": round(calinski_harabasz_agg, 3)
    }
])

In [None]:
# Print the heading, then the comparison table rendered as a text grid.
print("\nBảng so sánh kết quả phân cụm:")
comparison_grid = tabulate(results, tablefmt='grid', headers='keys')
print(comparison_grid)