<h2>Clustering: Unknown dataset</h2>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the samples to be clustered from the CSV file and display the frame
DATASET_PATH = "unknown.csv"
X = pd.read_csv(DATASET_PATH)
X

In [None]:
# Display pandas' per-column summary statistics for the loaded data
X.describe()

<b>K-means Clustering (Scikit-Learn)</b>

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit K-means once per candidate K and record the model's inertia
# (sum of squared distances to the closest center) so the curve can be inspected
range_n_clusters = range(2, 20)
kmeans_options = dict(init='random',  # or 'k-means++'
                      n_init=10,
                      max_iter=300,
                      random_state=0)

SSE = []
for nclust in range_n_clusters:
    # Build a fresh model for this K and fit it in one step (fit returns the estimator)
    km = KMeans(n_clusters=nclust, **kmeans_options).fit(X)
    SSE.append(km.inertia_)

# SSE as a function of K
plt.plot(range_n_clusters, SSE, marker='o')
plt.xlabel('Number of clusters K')
plt.ylabel('Sum of Squared Distances (SSE)')
plt.show()

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score

# Select the number of clusters by means of the silhouette coefficient.
# For every candidate K: fit K-means, print the average silhouette coefficient
# and draw the per-example silhouette plot, grouped by cluster.
range_n_clusters = range(2,13)
for nclust in range_n_clusters:

    # Initialize K-means clustering
    km = KMeans(n_clusters=nclust,
                init='random',
                n_init=10,
                max_iter=300,
                random_state=0)

    # Generate K-means clustering, compute cluster centers and predict the cluster label for each example
    cluster_labels = km.fit_predict(X)
    cluster_labels_set = np.unique(cluster_labels)

    # Compute the silhouette coefficient for each example (the expensive pairwise step)
    silhouette_example_coeff = silhouette_samples(X, cluster_labels)

    # The average silhouette coefficient is the mean of the per-example values,
    # so reuse them instead of running the pairwise computation a second time
    silhouette_avg_coeff = np.mean(silhouette_example_coeff)
    print("N° of clusters =", nclust, "-> Average silhouette coefficient: ", silhouette_avg_coeff)

    # For the examples belonging to each cluster plot the silhouette coefficient
    fig = plt.figure(nclust)
    fig.set_size_inches(6,6)
    ax1 = fig.add_subplot()

    y_lower = 0
    for i in cluster_labels_set:
        # Aggregate and sort the silhouette coefficients for the examples belonging to the cluster
        # (boolean-mask indexing returns a copy, so the in-place sort is safe)
        cluster_i_silhouette_coeff = silhouette_example_coeff[cluster_labels==i]
        cluster_i_silhouette_coeff.sort()
        y_upper = y_lower + cluster_i_silhouette_coeff.shape[0]
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_i_silhouette_coeff)
        # Write the cluster label next to the middle of its band
        ax1.text(-0.05, y_lower + 0.5 * cluster_i_silhouette_coeff.shape[0], str(i))
        # Leave a gap of 10 rows between consecutive clusters
        y_lower = y_upper + 10

    # Enrich the silhouette plot; the dashed line marks the average coefficient
    ax1.axvline(x=silhouette_avg_coeff, color="black", linestyle="--")
    ax1.set_title(f"Silhouette plot for K = {nclust}")
    ax1.set_xlabel("Silhouette Coefficients")
    ax1.set_ylabel("Cluster Label")
    # The y axis only stacks the examples; numeric ticks would not be cluster labels
    ax1.set_yticks([])

    # Render this figure now so it appears right after its printed score and
    # the figures do not pile up in memory until the end of the cell
    plt.show()

In [None]:
# Generate the final K-means clustering model
from sklearn.cluster import KMeans

# Number of clusters of the final model; the estimator below reads it from here
# so the value is stated in exactly one place
nclust = 10

km = KMeans(n_clusters=nclust,
            init='random',
            n_init=10,
            random_state=0)
# Fit the model and keep the cluster index assigned to every example
cluster_labels = km.fit_predict(X)
print("Done")

In [None]:
from sklearn.datasets import load_digits

# Print the description text bundled with scikit-learn's digits dataset
digits = load_digits()
print(digits["DESCR"])

In [None]:
# Plot the centers as 8x8 images, one panel per cluster.
# The number of panels is taken from the fitted model itself rather than from the
# global `nclust`, which the silhouette loop rebinds to its last candidate K.
centers = km.cluster_centers_.reshape(-1, 8, 8)
# squeeze=False keeps `ax` an array even when the model has a single cluster
fig, ax = plt.subplots(1, len(centers), figsize=(12, 8), squeeze=False)
for cluster_id, (axi, center) in enumerate(zip(ax.flat, centers)):
    # Title each panel with its cluster index so centers can be matched to labels
    axi.set(xticks=[], yticks=[], title=str(cluster_id))
    axi.imshow(center, interpolation='hamming', cmap=plt.cm.binary)

In [None]:
# Report the smallest and the largest cluster label that was assigned
lowest_label, highest_label = cluster_labels.min(), cluster_labels.max()
print(f"{lowest_label} - {highest_label}")

In [None]:
# Mapping from K-means cluster index (key) to the label it is renamed to (value)
d = {0: 3, 1: 8, 2: 2, 3: 6, 4: 4, 5: 7, 6: 1, 7: 0, 8: 9, 9: 5}

# Report each renaming once per cluster instead of once per example;
# a cluster index missing from `d` raises KeyError here, before anything is written
for old_label in np.unique(cluster_labels):
    print(f"The cluster label {old_label} is changed into {d[old_label]}")

# Translate every example's label through the mapping, keeping the original dtype
cluster_labels_new = np.array([d[label] for label in cluster_labels.tolist()],
                              dtype=cluster_labels.dtype)

# NOTE(review): with savetxt's default fmt the integer labels are written in
# float notation; pass fmt='%d' if the consumer of myfile.csv expects plain integers
np.savetxt('myfile.csv', cluster_labels_new, delimiter=',')
print("Done")

In [None]:
# Load the reference labels so the clustering results can be compared against them
GROUND_TRUTH_PATH = "unknown_ground_truth.csv"
GT = pd.read_csv(GROUND_TRUTH_PATH)
GT

In [None]:
from sklearn import metrics

# Confusion matrix: rows follow the ground-truth labels, columns the remapped cluster labels
metrics.confusion_matrix(y_true=GT, y_pred=cluster_labels_new)