In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score, silhouette_samples, pair_confusion_matrix, plot_confusion_matrix
from mpl_toolkits import mplot3d

from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import scale, MinMaxScaler

# Seed passed to the estimators that accept a `random_state` argument, so runs are repeatable
random_state = 42 


In [3]:
def two_plots(x, y1, y2, xlabel, y1label, y2label, y2lim=(0, 1)):
    """
    Plot two series against the same x values, each one on its own y-axis.

    x:       common x values
    y1:      values drawn in red, read on the left y-axis
    y2:      values drawn in blue, read on the right y-axis
    xlabel:  label of the x-axis
    y1label: label of the left y-axis
    y2label: label of the right y-axis
    y2lim:   (lower, upper) limits of the right y-axis, or None to let
             matplotlib autoscale it [(0, 1)]
    """
    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(y1label, color=color)
    ax1.plot(x, y1, color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

    color = 'tab:blue'
    ax2.set_ylabel(y2label, color=color)  # we already handled the x-label with ax1
    ax2.plot(x, y2, color=color)
    ax2.tick_params(axis='y', labelcolor=color)
    # The default (0, 1) keeps the historical behaviour; a silhouette score can
    # be negative, so pass y2lim=(-1, 1) or y2lim=None when that may happen.
    if y2lim is not None:
        ax2.set_ylim(*y2lim)

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()

#from plot_clusters import plot_clusters
def plot_clusters(X, y, dim, points,
                  labels_prefix = 'cluster', 
                  points_name = 'centroids',
                  colors = cm.tab10, # a qualitative map 
                      # https://matplotlib.org/examples/color/colormaps_reference.html
                  points_color = cm.tab10(10) # by default the last of the map (to be improved)
                 ):
    """
    Plot a two dimensional projection of an array of labelled points
    X:      array-like (ndarray, DataFrame, ...) with at least two columns
    y:      array-like of labels, length as number of rows in X
    dim:    the two columns to project, inside range of X columns, e.g. (0,1)
    points: additional points to plot as 'stars', with the same columns as X
    labels_prefix: prefix to the labels for the legend ['cluster']
    points_name:   legend name for the additional points ['centroids']
    colors: a color map
    points_color: the color for the points
    @author: Claudio Sartori
    """
    # Positional indexing such as X[mask, col] only works on ndarrays, so
    # DataFrames / Series / lists are converted up front.
    X = np.asarray(X)
    y = np.asarray(y)
    points = np.asarray(points)

    # plot the labelled (colored) dataset and the points
    labels = np.unique(y)
    for i, label in enumerate(labels):
        color = colors(i / len(labels)) # choose a color from the map
        in_cluster = y == label
        plt.scatter(X[in_cluster, dim[0]], 
                    X[in_cluster, dim[1]], 
                    s=10, 
                    c = [color], # scatter requires a sequence of colors
                    marker='s', 
                    label=labels_prefix+str(label))
    plt.scatter(points[:,dim[0]], 
                points[:,dim[1]], 
                s=50, 
                marker='*', 
                c=[points_color], 
                label=points_name)
    plt.legend()
    plt.grid()
    plt.show()  


def plot_silhouette(silhouette_vals, y,
                    colors = cm.tab10,
                    plot_noise = False
                    ):
    """
    Plotting silhouette scores for the individual samples of a labelled data set.
    The scores are grouped according to labels and, inside each group, sorted in
    ascending order (smallest score at the bottom of the group).
    The bars are proportional to the score and the color is determined by the label.
    A dash-dot line marks the mean of each plotted cluster, a dashed black line
    the mean of all the values in silhouette_vals.

    silhouette_vals: the silhouette values of the samples (array-like)
    y:               the labels of the samples (array-like, same length)
    colors:          a color map
    plot_noise:      boolean, when False the samples labelled -1 are not plotted
    @author: Claudio Sartori
    """
    # Work on ndarrays so that boolean masking behaves the same for lists,
    # Series and arrays.
    silhouette_vals = np.asarray(silhouette_vals)
    y = np.asarray(y)

    cluster_labels = np.unique(y)
    if not plot_noise:
        cluster_labels = cluster_labels[cluster_labels != -1]
    n_clusters = len(cluster_labels)
    y_ax_lower = 0
    yticks = []
    for i, c in enumerate(cluster_labels): # generate pairs index, cluster_label
        # sorted copy of the scores of the current cluster
        c_silhouette_vals = np.sort(silhouette_vals[y == c])
        y_ax_upper = y_ax_lower + len(c_silhouette_vals)
        color = colors(i / n_clusters)
        plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
                 edgecolor='none', color=color)
        yticks.append((y_ax_lower + y_ax_upper) / 2)
        # Cluster mean, drawn in data coordinates: bars of height 1 centred on
        # the integers lower..upper-1 cover lower-0.5 .. upper-0.5. Axes
        # fractions would be misplaced when noise is skipped and by the margins.
        plt.vlines(np.mean(c_silhouette_vals),
                   y_ax_lower - 0.5, y_ax_upper - 0.5,
                   colors=[color], linestyles='dashdot')
        y_ax_lower = y_ax_upper

    # global mean over every value received, noise included
    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg, color="black", linestyle="--") 
    plt.yticks(yticks, cluster_labels)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient - Cluster means: -. Global mean: --')
    plt.tight_layout()
    # plt.savefig('./figures/silhouette.png', dpi=300)
    plt.show()

In [None]:
# Location of the CSV file and the character separating its fields
X_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00292/Wholesale%20customers%20data.csv'
delimiter = ','

# Read the file into a DataFrame and report its (rows, columns) shape
df = pd.read_csv(X_url, sep=delimiter)
print(f"Shape of the input data {df.shape}")

In [None]:
# Square-root transform of every column from the third onwards; the first two
# columns are kept as they are. np.sqrt works on the whole block at once and
# replaces the element-wise DataFrame.applymap(math.sqrt), which recent pandas
# releases deprecate.
# NOTE(review): np.sqrt yields NaN (with a warning) on negative input, whereas
# math.sqrt raised; the values are assumed to be non-negative.
X_sqrt = pd.concat([df.iloc[:, :2], np.sqrt(df.iloc[:, 2:])], axis=1)

# remap on the 0:1 range with MinMaxScaler
mms = MinMaxScaler()
X = pd.DataFrame(mms.fit_transform(X_sqrt), columns = X_sqrt.columns)
X.head()

## KMeans

In [None]:
# Use the elbow method to find the optimal number of clusters: for each k,
# record the inertia and the silhouette score and plot both against k.
# NOTE(review): the models are fitted on the raw `df`, not on the scaled `X`
# computed in the previous cell - confirm this is intended.

k_range = list(range(2,11)) # set the range of k values to test 
parameters_km = [{'n_clusters': k_range}]
pg = list(ParameterGrid(parameters_km))

inertias_km = []
silhouette_scores_km = []
for km_params in pg:
    km = KMeans(random_state=random_state, **km_params)
    y_km = km.fit_predict(df)
    inertias_km.append(km.inertia_)
    silhouette_scores_km.append(silhouette_score(df, y_km))

two_plots(
    x=k_range,
    y1=inertias_km,
    y2=silhouette_scores_km,
    xlabel='Number of clusters',
    y1label='Inertias',
    y2label='Silhouette scores',
)

In [None]:
# Cluster with the chosen number of clusters.
# NOTE(review): k is hard-coded; check it against the silhouette curve above.

k = 3
km = KMeans(n_clusters=k, random_state=random_state)
y_km = km.fit_predict(df)

# inertia and silhouette for this k were already computed by the elbow loop
k_pos = k_range.index(k)
print(f"Number of clusters = {k}\n\
    Distortion = {inertias_km[k_pos]:6.2f}\t- \
        Silhouette score = {silhouette_scores_km[k_pos]:4.2f}")

# pie chart with the share of samples falling in each cluster
clust_sizes_km = np.unique(y_km, return_counts=True)
pd.DataFrame(clust_sizes_km[1]).plot.pie(y=0, autopct='%1.1f%%');
plt.show()

# The __silhouette score__ ranges from `-1` (worst) to `1` (best); 
# as a rule of thumb, a value greater than `0.5` should be considered acceptable.

## Agglomerative clustering
We will try a grid of parameter configurations, with the number of clusters in the range `2:10` and the four linkage methods available in the *sklearn* implementation of *AgglomerativeClustering*. 

In [None]:
# Every combination of number of clusters and linkage method
parameters = [{
    'n_clusters': k_range,
    'linkage': ['ward', 'complete', 'average', 'single'],
}]
pg = list(ParameterGrid(parameters))

# One [linkage, n_clusters, silhouette] record per configuration, in grid order
result_ac = []
for ac_params in pg:
    ac = AgglomerativeClustering(**ac_params)
    y_ac = ac.fit_predict(df)
    score_ac = silhouette_score(df, y_ac)
    result_ac.append([ac_params['linkage'], ac_params['n_clusters'], score_ac])

# showing the best results
df_result_ac = pd.DataFrame(
    data=result_ac,
    columns=['linkage', 'n_clusters', 'silhouette_score'],
)
df_result_ac.sort_values(by='silhouette_score', ascending=False).head(5)

In [None]:
# plot the 3d bar graph: silhouette score for each (linkage, n_clusters) pair

from sklearn.preprocessing import OrdinalEncoder
from mpl_toolkits.mplot3d import Axes3D  # import kept so the name stays available to other cells

# encode the linkage names as numbers so that they can be used as a coordinate
oe = OrdinalEncoder()
df_result_ac['linkage_enc'] = oe.fit_transform(
    df_result_ac['linkage'].values.reshape(-1,1)
).ravel()  # the encoder returns an (n, 1) array; a column needs it flat

fig = plt.figure()
# Axes3D(fig) no longer attaches the axes to the figure in current matplotlib,
# which leaves the plot empty; ask the figure for a 3d subplot instead.
ax = fig.add_subplot(projection='3d')

x,y = df_result_ac['linkage_enc'].values,df_result_ac['n_clusters'].values
n_bars = df_result_ac.shape[0]
bottom = np.zeros(n_bars)    # every bar starts from z = 0

width = .5 * np.ones(n_bars)
depth = .5 * np.ones(n_bars)

ax.bar3d(x,y
         ,bottom,width,depth
         ,df_result_ac['silhouette_score'].values
         )

# show the linkage names instead of their numeric codes and label the axes
linkage_names = oe.categories_[0]
ax.set_xticks(np.arange(len(linkage_names)))
ax.set_xticklabels(linkage_names)
ax.set_xlabel('linkage')
ax.set_ylabel('n_clusters')
ax.set_zlabel('silhouette_score')
plt.show()

In [None]:
# Refit the configuration judged to be the best one.
# The first 3 results have a similar silhouette score, so we choose the one
# with the lower number of clusters (in the position 1).
# NOTE(review): `pos` indexes the unsorted `df_result_ac` and `pg`, which share
# the same order - confirm it points at the intended row of the sorted table.

pos = 1
print(df_result_ac.iloc[[pos]])

best_ac_params = pg[pos]
ac = AgglomerativeClustering(**best_ac_params)
y_ac = ac.fit_predict(df)

# show the distribution of the clusters
clust_sizes_ac = np.unique(y_ac, return_counts=True)
pd.DataFrame(clust_sizes_ac[1]).plot.pie(y=0, autopct='%1.1f%%');
plt.show()

In [None]:
# Pair plots of the data coloured by the labels of each clustering model.
# The labels are attached to copies made with `assign`, so `df` keeps only its
# original columns: the cell can be re-run, and the cells that fit or plot `df`
# never see the label columns as if they were features.
df_km = df.assign(cluster_km=y_km)
sns.pairplot(data=df_km, hue='cluster_km');
plt.show()

df_ac = df.assign(cluster_ac=y_ac)
sns.pairplot(data=df_ac, hue='cluster_ac');
plt.show()

#### Confusion matrix
The function `pair_confusion_matrix` computes the number of pairs of objects that are in the same clusters or in different clusters in two different clustering schemes. 

The result is given in a 2x2 matrix, the perfect match is when only the numbers in the main diagonal are non zero.

We present here the results normalized to 1

In [None]:
pcm = pair_confusion_matrix(y_km, y_ac)

# normalise the pair counts so that the four entries sum to 1
pcm_norm = pcm / pcm.sum()
print(pcm_norm)

# the main diagonal holds the pairs on which the two schemes agree
match_pct = pcm_norm.diagonal().sum() * 100
print(f"The percentage of match between the two clustering schemes is {match_pct:6.2f}%")

# DBSCAN

In [None]:
# Pair plot of all the columns, used to spot the most relevant ones
sns.pairplot(df)

In [None]:
# Take the first two columns as a NumPy array. A DataFrame cannot be indexed
# with df[:, [0, 1]] (it raises), and the following cells index X positionally
# (X[:, j], X[mask, :]), which requires an ndarray.
# NOTE(review): the columns are picked by position - confirm that columns 0 and
# 1 are the features meant for DBSCAN.
X = df.iloc[:, [0, 1]].to_numpy()
focus = [0,1]

# observe the most interesting columns
plt.scatter(X[:,focus[0]], X[:,focus[1]]
            , c='white'          # color filling the data markers
            , edgecolors='black' # edge color for data markers
            , marker='o'         # data marker shape, e.g. triangles (v<>^), square (s), star (*), ...
            , s=50)              # data marker size
plt.grid()  # plots a grid on the data
plt.show()

In [None]:
# Fit DBSCAN with its default parameters and report the two that drive it
db = DBSCAN()
y_db = db.fit_predict(X)

print(f"The maximum distance between two samples for one to be considered as in the neighborhood of the other is {db.eps}")
print(f"    The number of samples in a neighborhood for a point to be considered as a core point is {db.min_samples}")

In [None]:
# Labels found by DBSCAN; -1 marks the samples left unclustered (noise)
cluster_labels_all = np.unique(y_db)
cluster_labels = cluster_labels_all[cluster_labels_all != -1]
n_clusters = len(cluster_labels)

# np.unique returns sorted labels, so -1 can only be in the first position
noise = bool(cluster_labels_all[0] == -1)
if noise:
    print("There is noise")
print(f"There is/are {n_clusters} cluster(s)")

# One row per cluster, holding the column-wise mean of its members; the label
# itself is used as row index.
cluster_centers = np.empty((n_clusters, X.shape[1]))
for label in cluster_labels:
    cluster_centers[label] = X[y_db == label].mean(axis=0)
plot_clusters(X, y_db, dim=(focus[0], focus[1]), points=cluster_centers)

### Find the best parameters using `ParameterGrid`

In [None]:
param_grid = {'eps': list(np.arange(0.01, 1, 0.01)), 'min_samples': list(range(1,10,1))}
params = list(ParameterGrid(param_grid))

# Collect one record per usable configuration and build the dataframe once at
# the end: growing a DataFrame row by row copies it at every step.
dbscan_rows = []
for param in params:
    db = DBSCAN(**param)
    y_db = db.fit_predict(X)
    cluster_labels_all = np.unique(y_db)
    cluster_labels = cluster_labels_all[cluster_labels_all != -1]
    n_clusters = len(cluster_labels)

    # the silhouette is evaluated on the clustered samples only (noise removed)
    X_cl = X[y_db!=-1,:]
    y_db_cl = y_db[y_db!=-1]
    # silhouette_score needs at least 2 clusters and fewer clusters than
    # samples; the upper bound can be hit with min_samples=1 and a small eps,
    # where every sample becomes its own cluster.
    if 1 < n_clusters < y_db_cl.shape[0]:
        silhouette = silhouette_score(X_cl,y_db_cl)
        uncl_p = (1 - y_db_cl.shape[0]/y_db.shape[0]) * 100
        dbscan_rows.append([db.eps, db.min_samples, n_clusters, silhouette, uncl_p])

# Arrange DBSCAN results in a dataframe, for easier presentation and filtering
dbscan_out = pd.DataFrame(dbscan_rows,
                          columns = ['eps','min_samples','n_clusters','silhouette', 'unclust%'])


sil_thr = 0.7  # visualize results only for combinations with silhouette above the threshold
unc_thr = 10 # visualize results only for combinations with unclustered below the threshold
n_clu_max_thr = 4 # visualize results only for combinations with at most this number of clusters
dbscan_out[(dbscan_out['silhouette']>=sil_thr)\
         & (dbscan_out['unclust%']<=unc_thr)\
         & (dbscan_out['n_clusters']<=n_clu_max_thr)]

- Observe visually the most promising combination of parameters.  
- Plot the clusters with the centers  
- Plot the silhouette indexes for all the clustered samples  

In [None]:
# Active parameter choice. Other pairs tried earlier:
#   eps=0.9,  min_samples=4  -> no
#   eps=0.28, min_samples=9  -> no
#   eps=0.05, min_samples=9
#   eps=0.16, min_samples=9
db = DBSCAN(eps=0.99, min_samples=9)
y_db = db.fit_predict(X)

# labels of the clusters found, with the noise label (-1) left out
cluster_labels_all = np.unique(y_db)
cluster_labels = cluster_labels_all[cluster_labels_all != -1]
n_clusters = len(cluster_labels)

# one row per cluster with the column-wise mean of its members,
# stored at the row given by the label itself
cluster_centers = np.empty((n_clusters, X.shape[1]))
for label in cluster_labels:
    cluster_centers[label] = X[y_db == label].mean(axis=0)

print(f"There are {n_clusters} clusters")
print(f"The cluster labels are {cluster_labels}")
print(f"Cluster centers: {cluster_centers}")

plot_clusters(X, y_db, dim=(focus[0], focus[1]), points=cluster_centers)

# per-sample silhouette values, computed on every sample of X
silhouette = silhouette_samples(X, y_db)
plot_silhouette(silhouette, y_db)

In [None]:
# A quick look at the width of the data ranges, column by column
col_max = np.max(X, axis=0)
col_min = np.min(X, axis=0)
print(col_max - col_min)


# The same widths after rescaling the columns with MinMaxScaler
mms = MinMaxScaler()
Xs = mms.fit_transform(X)
Xs.max(axis=0) - Xs.min(axis=0)