# Data Clustering

### Settings & Importing Libraries

In [None]:
#--0.0---------------------  SETTINGS -----------------------------------------
"""
Data Settings & Importing Libraries
"""

import math
import os
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from pandas.plotting import parallel_coordinates
from pandas.plotting import scatter_matrix
from scipy import stats
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist, squareform
from scipy.spatial.distance import pdist
from scipy.stats import mode
from sklearn import cluster
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
#!conda install --yes --prefix {sys.prefix} plotly
#import plotly.graph_objects as go
#import plotly.io as pio



%matplotlib inline 

# Show floats with two decimal places whenever pandas renders them.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Notebook-wide matplotlib defaults: figure size and plotting style.
plt.rcParams["figure.figsize"] = [30, 10]
plt.style.use('Solarize_Light2')

### Loading the data set

In [None]:
# Load the three input tables relative to the project base directory.
# NOTE(review): `dir` is not assigned anywhere in the visible notebook; it has
# to be bound to the base directory (a string) before this cell runs, otherwise
# the name resolves to the builtin `dir` function and path building fails.
# os.path.join is used instead of string concatenation so the separators are
# correct on every platform and no "<dir>./file" path is produced.
df_dropped = pd.read_csv(os.path.join(dir, 'df_dropped.csv'))
vs_num = pd.read_csv(os.path.join(dir, 'Data', 'vs_num.csv'), index_col=0)
vs_cat = pd.read_csv(os.path.join(dir, 'Data', 'vs_cat.csv'), index_col=0)

### Data Statistics

In [None]:
# Overview of df_dropped: the DataFrame.info() summary, then the descriptive
# statistics and the first rows, each framed by a separator line.
df_dropped.info()
for report in (df_dropped.describe(include='all'), df_dropped.head()):
    print("---------------------------------------------------------------")
    print(report)
print("---------------------------------------------------------------")

In [None]:
# Overview of vs_num: the DataFrame.info() summary, then the descriptive
# statistics and the first rows, each framed by a separator line.
vs_num.info()
for report in (vs_num.describe(include='all'), vs_num.head()):
    print("---------------------------------------------------------------")
    print(report)
print("---------------------------------------------------------------")

In [None]:
# Overview of vs_cat: the DataFrame.info() summary, then the descriptive
# statistics and the first rows, each framed by a separator line.
vs_cat.info()
for report in (vs_cat.describe(include='all'), vs_cat.head()):
    print("---------------------------------------------------------------")
    print(report)
print("---------------------------------------------------------------")

#### Post operations

In [None]:
# --------------------------- df_dropped dataset
# Type conversion: the three columns below are cast to object (they are listed
# among the categorical attributes further down) and time_code is parsed as a
# date in YYYY-MM-DD format.
for object_col in ('id_ram', 'memory_dim', 'clock'):
    df_dropped[object_col] = df_dropped[object_col].astype(object)
df_dropped['time_code'] = pd.to_datetime(df_dropped['time_code'], format='%Y-%m-%d')


# Attributes split by type
num_float = ['sales_usd', 'conversion_rate']
cat = [
    "clock", "memory_dim", 'id_ram', 'time_code', 'brand', 'ram_model',
    'memory_type', 'vendor', 'continent', 'country', 'region', "currency",
]

# Show the distinct values of each attribute - use categorical attributes only
for col in cat:
    distinct_values = df_dropped[col].unique()
    print("\nDistinct values in " + col + " : \t", distinct_values)


# --------------------------- vs_num dataset
# Type conversion: every column of vs_num becomes float.
for x in vs_num.columns:
    vs_num[x] = vs_num[x].astype(float)

### Normalization

In [None]:
# Min-max scale every column of vs_num into a copy, leaving vs_num untouched.
temp_list = vs_num.columns
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(vs_num[temp_list].values)

vs_num_norm = vs_num.copy()
vs_num_norm[temp_list] = scaled_values
#print(vs_num_norm)

## Analysis by K-means clustering

### Best value of k

#### Elbow method

In [None]:
# Internal validation scores, one entry per candidate number of clusters:
# SSE (inertia), silhouette, Davies-Bouldin (SEP) and Calinski-Harabasz (CAL).
SSE = []
SIL = []
SEP = []
CAL = []

k_values = range(2, 13)
for k in k_values:
    k_means = cluster.KMeans(n_clusters=k, max_iter=100, random_state=1)
    # fit_predict trains the model and returns the labels in one pass, so the
    # model is no longer fitted a second time with a separate fit() call.
    cluster_labels = k_means.fit_predict(vs_num_norm)
    labels = k_means.labels_
    centroids = k_means.cluster_centers_

    # Computed once and reused for both the score list and the log line.
    sil = silhouette_score(vs_num_norm, cluster_labels, metric='euclidean')

    SSE.append(k_means.inertia_)
    SIL.append(sil)
    print("For n_clusters =", k,
          "The average silhouette_score is :", sil)
    SEP.append(davies_bouldin_score(vs_num_norm, cluster_labels))
    CAL.append(calinski_harabasz_score(vs_num_norm, cluster_labels))

# One line plot per metric against k; each is saved below `dir` and then shown.
for scores, y_label, file_name in (
    (SSE, 'SSE', "\\Clustering\\KMeans\\sse.jpg"),
    (SIL, 'Silhouette Score', "\\Clustering\\KMeans\\silhouette.jpg"),
    (SEP, 'Davies Bouldin Index', "\\Clustering\\KMeans\\davies-bouldin.jpg"),
    (CAL, 'Calinski-Harabasz Index', "\\Clustering\\KMeans\\calinski.jpg"),
):
    plt.plot(k_values, scores)
    plt.xlabel('Number of Clusters')
    plt.ylabel(y_label)
    plt.savefig(dir + file_name)
    plt.show()



In [None]:
#--------------- K = 3 --------------------
# fit_predict trains the model and returns the labels, so the model is fitted
# only once (the extra fit() call repeated the same training).
k_means = cluster.KMeans(n_clusters=3, max_iter=100, random_state=1)
cluster_labels = k_means.fit_predict(vs_num_norm)
labels = k_means.labels_
centroids = k_means.cluster_centers_

# Cluster ids together with the number of points assigned to each.
print(np.unique(labels, return_counts=True))

"""for x in vs_num_norm.columns:
    for y in vs_num_norm.columns:
        for z in vs_num_norm.columns:
            if (x != y) & (x != z) & (y != z):
                plt.figure(figsize=(20, 15))
                fig = go.Figure(data=[go.Scatter3d(
                    x=vs_num_norm[x],
                    y=vs_num_norm[y],
                    z=vs_num_norm[z],
                    mode='markers',
                    marker=dict(
                        size=6,
                        color=labels, 
                        opacity=1
                    )
                )])
                # tight layout
                fig.update_layout(margin=dict(l=0, r=0, b=0, t=0),
                                scene = dict(
                                    xaxis_title= x,
                                    yaxis_title= y,
                                    zaxis_title= z)
                                )
                #fig.write_image(dir + "\\Clustering\\KMeans\\3d-after-kmeans.jpg")
                fig.show()"""


# Centroid profile: one line per cluster across the normalized features.
feature_names = list(vs_num_norm.columns)
plt.figure(figsize=(30, 12))
plt.tick_params(axis='both', which='major', labelsize=22)
# One tick per feature. The tick positions were previously built from the
# number of rows, which does not match the number of column labels.
plt.xticks(range(len(feature_names)), feature_names, fontsize=20)
for i, centroid in enumerate(centroids):
    plt.plot(centroid, marker='o', label='Cluster %s' % i)
plt.legend(fontsize=18)
plt.savefig(dir + "\\Clustering\\KMeans\\kmeans-parallel-coordinates.jpg")
plt.show()

# For every feature: stacked bar chart of the share of each original
# (non-normalized) value inside every cluster.
for x in vs_num_norm.columns:
    # DataFrame.plot opens its own figure because figsize is passed, so the
    # former bare plt.plot(legend='None') call only left an empty extra figure.
    var_val_xt = pd.crosstab(labels, vs_num[x])
    # Normalize each row (cluster) so that its bars sum to 1.
    var_val_xt_pct = var_val_xt.div(var_val_xt.sum(1).astype('float'), axis=0)
    var_val_xt_pct.plot(kind='bar', stacked=True, figsize=(20, 15), fontsize=4)
    plt.title(x + ' by clusters')
    plt.ylabel(x)
    plt.xlabel('Clusters')
    plt.savefig(dir + "\\Clustering\\KMeans\\kmeans-crosstab-clustersby" + x + ".jpg")
    plt.show()

In [None]:
#--------------- K = 9 --------------------
# fit_predict trains the model and returns the labels, so the model is fitted
# only once (the extra fit() call repeated the same training).
k_means = cluster.KMeans(n_clusters=9, max_iter=100, random_state=1)
cluster_labels = k_means.fit_predict(vs_num_norm)
labels = k_means.labels_
centroids = k_means.cluster_centers_

# Cluster ids together with the number of points assigned to each.
print(np.unique(labels, return_counts=True))

"""for x in vs_num_norm.columns:
    for y in vs_num_norm.columns:
        for z in vs_num_norm.columns:
            if (x != y) & (x != z) & (y != z):
                plt.figure(figsize=(20, 15))
                fig = go.Figure(data=[go.Scatter3d(
                    x=vs_num_norm[x],
                    y=vs_num_norm[y],
                    z=vs_num_norm[z],
                    mode='markers',
                    marker=dict(
                        size=6,
                        color=labels, 
                        opacity=1
                    )
                )])
                # tight layout
                fig.update_layout(margin=dict(l=0, r=0, b=0, t=0),
                                scene = dict(
                                    xaxis_title= x,
                                    yaxis_title= y,
                                    zaxis_title= z)
                                )
                #fig.write_image(dir + "\\Clustering\\KMeans\\3d-after-kmeans.jpg")
                fig.show()"""


# Centroid profile: one line per cluster across the normalized features.
# NOTE(review): the output file names below are the same ones the K = 3 cell
# writes, so running both cells overwrites the K = 3 images - confirm intent.
feature_names = list(vs_num_norm.columns)
plt.figure(figsize=(30, 12))
plt.tick_params(axis='both', which='major', labelsize=22)
# One tick per feature. The tick positions were previously built from the
# number of rows, which does not match the number of column labels.
plt.xticks(range(len(feature_names)), feature_names, fontsize=20)
for i, centroid in enumerate(centroids):
    plt.plot(centroid, marker='o', label='Cluster %s' % i)
plt.legend(fontsize=18)
plt.savefig(dir + "\\Clustering\\KMeans\\kmeans-parallel-coordinates.jpg")
plt.show()

# For every feature: stacked bar chart of the share of each original
# (non-normalized) value inside every cluster.
for x in vs_num_norm.columns:
    # DataFrame.plot opens its own figure because figsize is passed, so the
    # former bare plt.plot(legend='None') call only left an empty extra figure.
    var_val_xt = pd.crosstab(labels, vs_num[x])
    # Normalize each row (cluster) so that its bars sum to 1.
    var_val_xt_pct = var_val_xt.div(var_val_xt.sum(1).astype('float'), axis=0)
    var_val_xt_pct.plot(kind='bar', stacked=True, figsize=(20, 15), fontsize=4)
    plt.title(x + ' by clusters')
    plt.ylabel(x)
    plt.xlabel('Clusters')
    plt.savefig(dir + "\\Clustering\\KMeans\\kmeans-crosstab-clustersby" + x + ".jpg")
    plt.show()

Insert Hierarchical Clustering (Ward method)

Comparison and evaluation of the different clustering via internal metrics

Regarding the Sum of Squared Errors (SSE): the SSE is expected to keep decreasing as
the number of clusters grows, therefore opting directly for the clustering with the
lowest sum of squared distances may not be a worthwhile decision.
Regarding the Davies-Bouldin Index: a lower Davies-Bouldin index relates to a model with
better separation between the clusters and, in this regard, the clustering with k equal to 2 seems
to present the best separation among its clusters.
Regarding the Silhouette Score: a higher value for the Silhouette Coefficient relates to a model
with better defined clusters; in this regard, the clustering with k equal to 2 presents the best
score.
Regarding the Calinski-Harabasz Index: similarly to the Silhouette Coefficient, a higher value
for the Calinski-Harabasz score relates to a model with better defined clusters; in this regard, the
clustering with k equal to 2 seems to present the best defined clusters.

Outliers

parallel coordinates and radar plot to show most influential attributes

comparison results with external/internal indexes

## Analysis by density-based clustering

In [None]:
#-------  DBSCAN-------------

#----knee method--------
# k-distance curves used to pick eps: for k = 4, 8, ..., 512 plot the sorted
# distance of every point to its k-th nearest neighbour.
# NOTE(review): `df_norm` and `norm_attr` are not defined in the visible
# notebook - confirm which normalized frame / column list is meant here.
dist = pdist(df_norm[norm_attr], 'euclidean')
dist = squareform(dist)
# Sort each row once; column j then holds every point's j-th smallest distance
# (column 0 is the point's zero distance to itself). This replaces an argsort
# of every row for every value of k.
sorted_dist = np.sort(dist, axis=1)
for i in range(2, 10):
    k = 2 ** i
    if k >= sorted_dist.shape[1]:
        # Not enough points for a k-th neighbour; larger k cannot work either.
        break
    kth_distances = sorted_dist[:, k]
    plt.plot(range(0, len(kth_distances)), np.sort(kth_distances))
plt.ylabel('Dist eps', fontsize=18)
plt.xlabel('Sorted distances', fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.savefig('D:\\Dropbox\\Scuola\\Pisa\\Anno2\\Data Mining\\Esame\\Pratica\\python\\Clustering\\DBSCAN\\eps.jpg')
plt.show()

# Grid search over eps and min_samples; one record per configuration that
# yields more than one distinct label.
# NOTE(review): `df`, `df_norm` and `norm_attr` are not defined in the visible
# notebook - confirm which frames are meant.
res = []
# eps candidates 0.35, 0.40, ..., 0.60, computed directly from the step index
# so that no floating-point error accumulates from repeated "+= 0.05".
eps = [round(0.35 + 0.05 * step, 2) for step in range(6)]

for eps_value in eps:
    for n in range(7, df.shape[1]*2):     #2000rows -> ln= 7
        dbscan = DBSCAN(eps=eps_value, min_samples=n).fit(df_norm[norm_attr])
        label_counts = np.unique(dbscan.labels_, return_counts=True)
        unique_labels = label_counts[0]
        # silhouette_score needs at least two distinct labels.
        if len(unique_labels) <= 1:
            continue
        print(label_counts)

        res.append({
            # Number of clusters: the noise label -1 is excluded only when it
            # is actually present (previously 1 was always subtracted).
            'label': len(unique_labels) - (1 if -1 in unique_labels else 0),
            # The labels are passed unfiltered, so points labelled -1 enter the
            # silhouette as one more group.
            'sil': silhouette_score(df_norm[norm_attr], dbscan.labels_),
            'ms': n,
            'eps': eps_value
        })


#print(*res, sep = "\n")   #better values are: label=>4, sil(max)=-0.25, ms=18or20, eps=0.35

# Final model with the parameters picked from the search above.
dbscan = DBSCAN(eps=0.35, min_samples=20).fit(df_norm[norm_attr])
core_samples_mask = np.zeros_like(dbscan.labels_, dtype=bool)
core_samples_mask[dbscan.core_sample_indices_] = True
label_counts = np.unique(dbscan.labels_, return_counts=True)
n_clusters = len(label_counts[0]) - (1 if -1 in label_counts[0] else 0)
print("Number of cluster from DBSCAN is: ", n_clusters)
print(label_counts)
plt.figure(figsize=(18, 10))
plt.scatter(df['pc'], df['clock_speed'], c=dbscan.labels_)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.savefig('D:\\Dropbox\\Scuola\\Pisa\\Anno2\\Data Mining\\Esame\\Pratica\\python\\Clustering\\DBSCAN\\dbscan-scatter.jpg')
plt.show()

# Interactive 3-D scatter of three normalized attributes, coloured by the
# DBSCAN labels.
# NOTE(review): `go` (plotly.graph_objects) is only imported in a commented-out
# line of the settings cell; re-enable that import before running this.
x = 'clock_speed'
y = 'n_cores'
z = 'price_range'

# The matplotlib figure that used to be created here was immediately replaced
# by the plotly figure and only left an empty plot behind, so it is dropped.
fig = go.Figure(data=[go.Scatter3d(
    x=df_norm[x],
    y=df_norm[y],
    z=df_norm[z],
    mode='markers',
    marker=dict(
        size=5,
        color=dbscan.labels_,
        opacity=1
    )
)])
# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0),
            scene = dict(
                xaxis_title= x,
                yaxis_title= y,
                zaxis_title= z)
            )
fig.write_image('D:\\Dropbox\\Scuola\\Pisa\\Anno2\\Data Mining\\Esame\\Pratica\\python\\Clustering\\DBSCAN\\3d-after-dbscan.jpg')
fig.show()

## Analysis by Hierarchical clustering

In [None]:
#-------------Hierarchical--------------------------------

#-----dendrogram----
# One truncated dendrogram per linkage method, built on euclidean distances.
# NOTE(review): `df_norm` and `norm_attr` are not defined in the visible
# notebook - confirm which normalized frame / column list is meant here.
linkages = ['ward', 'complete', 'average', 'single']
metrics = ['euclidean', 'manhattan']

# The pairwise distances do not depend on the linkage method, so they are
# computed once instead of once per iteration.
dist = pdist(df_norm[norm_attr], metric='euclidean')

for i in linkages:
    plt.title('Link: ' + i)
    if i == 'ward':
        # Reference cut line drawn on the ward dendrogram only.
        plt.axhline(y=6, color='r', linestyle='--')
    link = linkage(dist, method=i, metric='euclidean')
    res1 = dendrogram(link, color_threshold=7, truncate_mode='lastp', orientation='top',
                      distance_sort='descending', show_leaf_counts=True)
    plt.savefig('D:\\Dropbox\\Scuola\\Pisa\\Anno2\\Data Mining\\Esame\\Pratica\\python\\Clustering\\Hierarchical\\dendrogram-linkages-' + i + '.jpg')
    plt.show()
    

# Silhouette score of agglomerative clustering for every combination of
# k (3..13), linkage method and distance metric.
res2 = []
for k in range(3, 14):
    for l in linkages:
        for metr in metrics:
            # Ward linkage is only evaluated with the euclidean metric.
            if l == 'ward' and metr != 'euclidean':
                continue
            # NOTE(review): newer scikit-learn releases rename `affinity` to
            # `metric` - confirm against the installed version.
            hiera = AgglomerativeClustering(n_clusters=k, affinity=metr, linkage=l).fit(df_norm[norm_attr])
            res2.append({
                'labels': len(np.unique(hiera.labels_)),
                'n_clusters': k,
                'sil': silhouette_score(df_norm[norm_attr], hiera.labels_, metric = metr),
                'link': l,
                'metric': metr
            })

print(*res2, sep = "\n")

# Scatter plot for every ordered pair of distinct columns of df, coloured by
# the labels of `hiera`, i.e. whichever model was fitted last before this point.
for x_col in df.columns:
    for y_col in df.columns:
        if x_col != y_col:
            plt.figure(figsize=(18, 10))
            plt.scatter(df[x_col], df[y_col], c=hiera.labels_, cmap='rainbow')
            plt.tick_params(axis='both', which='major', labelsize=22)
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.savefig('D:\\Dropbox\\Scuola\\Pisa\\Anno2\\Data Mining\\Esame\\Pratica\\python\\Clustering\\Hierarchical\\hiera-scatter-'+x_col +'-' +y_col + '.jpg')
            plt.show()

# Interactive 3-D scatter of three normalized attributes, coloured by the
# labels of the most recently fitted agglomerative model (`hiera`).
# NOTE(review): `go` (plotly.graph_objects) is only imported in a commented-out
# line of the settings cell; re-enable that import before running this.
x = 'clock_speed'
y = 'n_cores'
z = 'price_range'

# The matplotlib figure that used to be created here was immediately replaced
# by the plotly figure and only left an empty plot behind, so it is dropped.
fig = go.Figure(data=[go.Scatter3d(
    x=df_norm[x],
    y=df_norm[y],
    z=df_norm[z],
    mode='markers',
    marker=dict(
        size=5,
        color=hiera.labels_,
        opacity=1
    )
)])
# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0),
            scene = dict(
                xaxis_title= x,
                yaxis_title= y,
                zaxis_title= z)
            )
fig.write_image('D:\\Dropbox\\Scuola\\Pisa\\Anno2\\Data Mining\\Esame\\Pratica\\python\\Clustering\\Hierarchical\\3d-after-hiera.jpg')
fig.show()


    

# Silhouette score against the number of clusters, one figure per metric and
# one line per linkage method, built from the records collected in res2.
for metric_name, title, out_path in (
    ('euclidean', 'Metric: Euclidean',
     'D:\\Dropbox\\Scuola\\Pisa\\Anno2\\Data Mining\\Esame\\Pratica\\python\\Clustering\\Hierarchical\\hiera-cluster-plot.jpg'),
    ('manhattan', 'Metric: Manhattan',
     'D:\\Dropbox\\Scuola\\Pisa\\Anno2\\Data Mining\\Esame\\Pratica\\python\\Clustering\\Hierarchical\\hiera-manhattan-plot.jpg'),
):
    fig, ax = plt.subplots()
    for link_name in ('single', 'ward', 'complete', 'average'):
        records = [a for a in res2
                   if a['link'] == link_name and a['metric'] == metric_name]
        if not records:
            # No results for this combination (e.g. ward with manhattan):
            # skip it instead of failing on an empty list.
            continue
        # The x values come from the records themselves, so they always match
        # the range of k that was actually evaluated.
        ax.plot([a['n_clusters'] for a in records],
                [a['sil'] for a in records],
                label=link_name)
    ax.set_title(title)
    plt.legend(fontsize=10)
    plt.savefig(out_path)
    plt.show()


## (OPTIONAL) Alternative clustering algorithm

in the library: https://github.com/annoviko/pyclustering/

## Final Considerations

############################################# END ######################################################

###### FUTURE CONSIDERATIONS
