In [None]:
# Intrinsic Clustering - Intrinsic Measures: These measures do not require ground truth labels (applicable to all
# unsupervised learning results).

In [1]:
# Silhouette Coefficient
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def calculate_silhouette_coefficient(data, num_clusters, random_state=None):
    """Cluster ``data`` with K-means and return the mean Silhouette Coefficient.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to cluster and to score.
    num_clusters : int
        Number of clusters requested from K-means.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to ``KMeans``. Pass an int to make the clustering, and
        therefore the returned score, reproducible across runs. The default
        (``None``) keeps the previous unseeded behaviour.

    Returns
    -------
    float
        Mean Silhouette Coefficient over all samples, as computed by
        ``sklearn.metrics.silhouette_score`` on the K-means labels.

    Raises
    ------
    ValueError
        Propagated from scikit-learn, e.g. when the number of distinct labels
        is outside the range ``2 .. n_samples - 1`` that ``silhouette_score``
        accepts.
    """
    # n_init is passed explicitly so the result does not depend on the
    # scikit-learn version's default and no FutureWarning is emitted.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_state)
    labels = kmeans.fit_predict(data)

    return silhouette_score(data, labels)

# Demo input: six 2-D points arranged in two vertical columns (x = 1 and x = 4).
data = [
    [1, 2], [1, 4], [1, 0],
    [4, 2], [4, 4], [4, 0],
]
num_clusters = 2

# Score the 2-cluster K-means partition of the demo points.
silhouette_coefficient = calculate_silhouette_coefficient(
    data=data,
    num_clusters=num_clusters,
)
print("Silhouette Coefficient:", silhouette_coefficient)

Silhouette Coefficient: 0.2871407974806454


In [5]:
# Re-fit K-means on the example data to inspect the cluster assigned to each point.
# n_init and random_state are pinned so that re-running the cell yields the same
# labels and does not depend on the scikit-learn version's default n_init.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
labels = kmeans.fit_predict(data)
labels

array([0, 0, 0, 1, 1, 1])

In [None]:
# Davies-Bouldin Index: It measures the average similarity between each cluster and its most similar cluster, where
# similarity compares within-cluster compactness to between-cluster separation. Lower values indicate better-defined
# clusters (the minimum is 0).

In [1]:
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import KMeans
# Six samples with four features each, used to demonstrate the Davies-Bouldin index.
data = [
    [5.1, 3.5, 1.4, 0.2],
    [4.9, 3. , 1.4, 0.2],
    [4.7, 3.2, 1.3, 0.2],
    [4.6, 3.1, 1.5, 0.2],
    [5. , 3.6, 1.4, 0.2],
    [5.4, 3.9, 1.7, 0.4],
]
num_clusters = 3

# n_init is explicit to avoid the FutureWarning about its changing default, and
# random_state is fixed so the labels (and hence the score) are reproducible.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
labels = kmeans.fit_predict(data)

# Only the last expression of a cell is displayed, so the score is the single
# output here; the labels are shown in the next cell.
DB = davies_bouldin_score(data, labels)
DB

  super()._check_params_vs_input(X, default_n_init=10)


0.3412044467949961

In [2]:
# Display the cluster labels assigned by the K-means fit in the previous cell.
labels

array([1, 0, 0, 0, 1, 2])

In [None]:
# Calinski-Harabasz Index measures the between-cluster dispersion against within-cluster dispersion. A higher score signifies 
# better-defined clusters.

# The Calinski-Harabasz Index, or Variance Ratio Criterion, measures the sum of between-cluster dispersion against the sum of
# within-cluster dispersion, where dispersion is the sum of distance squared.

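# A higher ratio signifies that the clusters are far apart from one another and that each cluster is compact, i.e. the
# clustering is better defined. For n samples and k clusters the score is
#     CH = [tr(B_k) / (k - 1)] / [tr(W_k) / (n - k)],
# where tr(B_k) is the between-cluster dispersion and tr(W_k) is the within-cluster dispersion (the source article
# shows the same formula in its Appendix, Fig 9).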

In [3]:
from sklearn.metrics import calinski_harabasz_score
# Same six 4-feature samples as in the Davies-Bouldin cell, redefined so this cell
# states its own input.
data = [
    [5.1, 3.5, 1.4, 0.2],
    [4.9, 3. , 1.4, 0.2],
    [4.7, 3.2, 1.3, 0.2],
    [4.6, 3.1, 1.5, 0.2],
    [5. , 3.6, 1.4, 0.2],
    [5.4, 3.9, 1.7, 0.4],
]
num_clusters = 3

# n_init is explicit to avoid the FutureWarning about its changing default, and
# random_state is fixed so the clustering (and hence the score) is reproducible.
# A hand-written labelling such as [1, 1, 2, 2, 3, 3] can be substituted for
# `clusters` to experiment with the metric.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(data)

# Higher values mean better-separated, more compact clusters.
s = calinski_harabasz_score(data, clusters)
s

  super()._check_params_vs_input(X, default_n_init=10)


16.060344827586167

In [4]:
# Display the cluster labels assigned by the K-means fit in the previous cell.
clusters

array([0, 1, 1, 1, 0, 2])

In [None]:
# Intrinsic Evaluation:
# Intrinsic evaluation assesses the quality of a clustering algorithm based on the characteristics of the clusters themselves. It
# focuses on internal measures and does not rely on external information or ground truth labels. Some commonly used intrinsic
# evaluation metrics are:

# Silhouette Coefficient: Measures the compactness and separation of clusters.
# Calinski-Harabasz Index: Evaluates the ratio of between-cluster dispersion to within-cluster dispersion.
# Davies-Bouldin Index: Quantifies the average similarity between clusters.
# These metrics provide insights into the quality and coherence of the clusters generated by the algorithm.

# Extrinsic Evaluation:
# Extrinsic evaluation involves assessing the performance of a clustering algorithm by comparing the clustering results with 
# externally available information or ground truth labels. In this approach, the quality of the clusters is evaluated based on
# how well they align with the known labels or external information. Some commonly used extrinsic evaluation metrics are:

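# Adjusted Rand Index (ARI): Measures the similarity between two clusterings by counting, over all pairs of samples, how
# often the two clusterings agree on placing the pair together or apart, adjusted for the agreement expected by chance.
# Normalized Mutual Information (NMI): Quantifies the information shared between the predicted clustering and the class
# labels, normalized to lie between 0 (no mutual information) and 1 (perfect agreement).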
# These metrics require having access to ground truth labels or external information to assess the algorithm's performance.

In [1]:
# KMeans is imported here as well so the cell does not rely on an import made in
# a different cell.
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# True labels or ground truth (one label per row of `data`)
true_labels = [0, 0, 1, 1, 2, 2]

# Predicted labels from a clustering algorithm.
# `data` is the sample matrix defined in an earlier cell. n_init is explicit to
# avoid the FutureWarning about its changing default, and random_state is fixed
# so the predicted labels (and hence the score) are reproducible.
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
predicted_labels = kmeans.fit_predict(data)
# ARI compares groupings rather than label values: a prediction such as
# [0, 0, 1, 1, 3, 3] would score 1.0 against true_labels.

# Calculate the Adjusted Rand Index
ari = adjusted_rand_score(true_labels, predicted_labels)
print(ari)

1.0


In [2]:
# KMeans is imported here as well so the cell does not rely on an import made in
# a different cell.
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

# True labels or ground truth (one label per row of `data`)
true_labels = [0, 0, 1, 1, 2, 2]

# Predicted labels from a clustering algorithm.
# `data` is the sample matrix defined in an earlier cell. n_init is explicit to
# avoid the FutureWarning about its changing default, and random_state is fixed
# so the predicted labels (and hence the score) are reproducible.
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
predicted_labels = kmeans.fit_predict(data)
# NMI compares groupings rather than label values: a prediction such as
# [0, 0, 1, 1, 3, 3] would score 1.0 against true_labels.

# Calculate the Normalized Mutual Information
nmi = normalized_mutual_info_score(true_labels, predicted_labels)
print(nmi)

1.0
