# TP Apprentissage non-supervisé
## Clustering
## Laure FEUILLET - Maël PLANTEC

## I - Jeux de données

In [None]:
from scipy.io import arff
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the smile dataset. loadarff returns a (data, meta) pair; the context
# manager makes sure the file handle is closed once parsing is done.
with open('smile1.arff', 'rt') as smile_file:
    smile_data = arff.loadarff(smile_file)
# Keep only the record array. Indexing the pair directly avoids wrapping it in
# np.array first, which builds a ragged object array (recent NumPy releases
# may reject that construction).
smile = smile_data[0]

In [None]:
# Ground-truth view: smile points in the (a0, a1) plane, coloured by 'class'.
fig, ax = plt.subplots()
ax.scatter(smile['a0'], smile['a1'], c=smile['class'], marker='x')
plt.show()

In [None]:
# Load the square dataset. loadarff returns a (data, meta) pair; the context
# manager makes sure the file handle is closed once parsing is done.
with open('square5.arff', 'rt') as square_file:
    square_data = arff.loadarff(square_file)
# Keep only the record array. Indexing the pair directly avoids wrapping it in
# np.array first, which builds a ragged object array (recent NumPy releases
# may reject that construction).
square = square_data[0]

In [None]:
# Ground-truth view: square points in the (a0, a1) plane, coloured by 'class'.
fig, ax = plt.subplots()
ax.scatter(square['a0'], square['a1'], c=square['class'], marker='x')
plt.show()

In [None]:
# Same points without any colouring, i.e. what a clustering algorithm sees.
fig, ax = plt.subplots()
ax.scatter(square['a0'], square['a1'], marker='x')
plt.show()

## II - Clustering k-Means
Le dataset `smile` possède des composantes connexes bien identifiées, avec des densités variables : les yeux sont plus denses que la bouche par exemple. 

In [None]:
from sklearn import cluster
from sklearn import metrics

In [None]:
# Feature matrix as a list of (a0, a1) pairs; the 'class' column is left out.
square_train = [(a0, a1) for a0, a1 in zip(square['a0'], square['a1'])]

In [None]:
# Fit k-means with as many clusters as there are distinct ground-truth classes.
# random_state is fixed so that a Restart & Run All reproduces the same
# partition (k-means++ seeding is random otherwise).
kmeans_square = cluster.KMeans(
    n_clusters=len(np.unique(square['class'])),
    init='k-means++',
    random_state=42,
)
kmeans_square.fit(square_train)

In [None]:
# Davies-Bouldin index of the fitted partition (displayed as the cell output).
metrics.davies_bouldin_score(X=square_train, labels=kmeans_square.labels_)

In [None]:
# Square points coloured by the cluster label k-means assigned to each of them.
fig, ax = plt.subplots()
ax.scatter(square['a0'], square['a1'], c=kmeans_square.labels_, marker='x')
plt.show()

In [None]:
# Sweep the number of clusters on the square dataset and record, for each
# value, the labels and two internal quality scores.
range_clusters = range(2, 11)
kmeans_nb_clusters = []
for nb_clusters in range_clusters:
    # Fixed seed: without it every run yields different partitions and
    # therefore different metric curves.
    kmeans = cluster.KMeans(n_clusters=nb_clusters, init='k-means++', random_state=42)
    kmeans.fit(square_train)
    # Davies-Bouldin index (lower is better)
    db = metrics.davies_bouldin_score(square_train, kmeans.labels_)
    # Silhouette coefficient (higher is better)
    silhouette = metrics.silhouette_score(square_train, kmeans.labels_)
    # Entry layout: (number of clusters, labels, Davies-Bouldin, silhouette)
    kmeans_nb_clusters.append((nb_clusters, kmeans.labels_, db, silhouette))

In [None]:
# One panel per tested cluster count, laid out on a 3x3 grid.
plt.figure(figsize=(16, 9))

# Each entry is (number of clusters, labels, Davies-Bouldin, silhouette);
# only the first two fields are needed to draw a panel.
for panel, (k, k_labels, _db, _sil) in enumerate(kmeans_nb_clusters, start=1):
    plt.subplot(3, 3, panel)
    plt.scatter(square['a0'], square['a1'], c=k_labels, marker='x', cmap='tab10')
    plt.title(f"{k} clusters")

plt.show()

In [None]:
# Transpose the per-k result tuples into one sequence per field.
kmeans_nb_clusters_zip = list(zip(*kmeans_nb_clusters))
cluster_counts, _labels, db_scores, silhouette_scores = kmeans_nb_clusters_zip

# Both scores on the same axes, as a function of the number of clusters.
fig, ax = plt.subplots(figsize=(16, 9))
ax.plot(cluster_counts, db_scores, 'o-', label='Indice de Davies Bouldin')
ax.plot(cluster_counts, silhouette_scores, 'x-', label='Coefficient de Silhouette', color='orange')
ax.set_title("Evaluation de k-means en fonction du nombre de clusters")
ax.set_xlabel('Nombre de clusters')
ax.set_ylabel('Evaluation')
ax.legend()
plt.show()

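On remarque que... *(TODO : compléter l'analyse à partir des courbes ci-dessus. Rappel de lecture : un bon partitionnement minimise l'indice de Davies-Bouldin et maximise le coefficient de silhouette.)*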

In [None]:
# k-means estimator with as many clusters as there are distinct ground-truth
# classes in smile. random_state is fixed so that a Restart & Run All
# reproduces the same partition (k-means++ seeding is random otherwise).
kmeans_smile = cluster.KMeans(
    n_clusters=len(np.unique(smile['class'])),
    init='k-means++',
    random_state=42,
)

In [None]:
# Feature matrix as a list of (a0, a1) pairs; the 'class' column is left out.
smile_train = [(a0, a1) for a0, a1 in zip(smile['a0'], smile['a1'])]

In [None]:
# Fit the estimator on the smile coordinates (the fitted estimator is the cell output).
kmeans_smile.fit(X=smile_train)

In [None]:
# Smile points coloured by the cluster label k-means assigned to each of them.
fig, ax = plt.subplots()
ax.scatter(smile['a0'], smile['a1'], c=kmeans_smile.labels_, marker='x')
plt.show()

In [None]:
# Sweep the number of clusters on the smile dataset and record, for each
# value, the labels and two internal quality scores.
range_clusters = range(2, 11)
kmeans_nb_clusters = []
for nb_clusters in range_clusters:
    # Fixed seed: without it every run yields different partitions and
    # therefore different metric curves.
    kmeans = cluster.KMeans(n_clusters=nb_clusters, init='k-means++', random_state=42)
    kmeans.fit(smile_train)
    # Davies-Bouldin index (lower is better)
    db = metrics.davies_bouldin_score(smile_train, kmeans.labels_)
    # Silhouette coefficient (higher is better)
    silhouette = metrics.silhouette_score(smile_train, kmeans.labels_)
    # Entry layout: (number of clusters, labels, Davies-Bouldin, silhouette)
    kmeans_nb_clusters.append((nb_clusters, kmeans.labels_, db, silhouette))

In [None]:
# One panel per tested cluster count, laid out on a 3x3 grid.
plt.figure(figsize=(16, 9))

# Each entry is (number of clusters, labels, Davies-Bouldin, silhouette);
# only the first two fields are needed to draw a panel.
for panel, (k, k_labels, _db, _sil) in enumerate(kmeans_nb_clusters, start=1):
    plt.subplot(3, 3, panel)
    plt.scatter(smile['a0'], smile['a1'], c=k_labels, marker='x', cmap='tab10')
    plt.title(f"{k} clusters")

plt.show()

In [None]:
# Transpose the per-k result tuples into one sequence per field.
kmeans_nb_clusters_zip = list(zip(*kmeans_nb_clusters))
cluster_counts, _labels, db_scores, silhouette_scores = kmeans_nb_clusters_zip

# Both scores on the same axes, as a function of the number of clusters.
fig, ax = plt.subplots(figsize=(16, 9))
ax.plot(cluster_counts, db_scores, 'o-', label='Indice de Davies Bouldin')
ax.plot(cluster_counts, silhouette_scores, 'x-', label='Coefficient de Silhouette', color='orange')
ax.set_title("Evaluation de k-means en fonction du nombre de clusters")
ax.set_xlabel('Nombre de clusters')
ax.set_ylabel('Evaluation')
ax.legend()
plt.show()

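Non, nous n'arrivons pas à retrouver le résultat précédent (4 clusters). Cela est tout à fait normal : k-means suppose des clusters convexes, regroupés autour de leur centre, alors que certains clusters du dataset `smile` (la bouche par exemple) ne sont pas convexes. Le dataset n'est donc pas adapté à k-means.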