# 4.3 - Evaluación

### Silhouette Score

$$S(v)=\frac{b(v)-a(v)}{\max\{a(v),\,b(v)\}}$$

donde:

+ v:= vector de datos
+ a(v):=distancia media desde v al resto de vectores del mismo cluster
+ b(v):=distancia media desde v a los vectores del cluster más cercano



S pertenece al intervalo [-1,1], de tal manera que:

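+ S=1  => buena clusterización (clusters compactos y bien separados)
+ S=0  => solapamiento (overlapping) entre clusters
+ S=-1 => mala clusterización (puntos asignados al cluster equivocado)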

In [None]:
from sklearn import cluster, datasets

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
%matplotlib inline

from mpl_toolkits.mplot3d import Axes3D


import numpy as np

In [None]:
# Two interleaving half-moons; the seed is fixed so every re-run yields the same sample.
X, y = datasets.make_moons(n_samples=1500, noise=.05, random_state=42)

In [None]:
# Standardize the features with StandardScaler; X is overwritten with the scaled array.
X=StandardScaler().fit_transform(X)

In [None]:
# Agglomerative (hierarchical) clustering with single linkage, two clusters.
single = cluster.AgglomerativeClustering(linkage='single', n_clusters=2).fit(X)

# Cluster index assigned to each sample.
y1_pred = single.labels_

In [None]:
# KMeans with k=2. n_init is given explicitly (its default changed across
# scikit-learn releases) and the seed is fixed so the labels are reproducible.
kmeans = cluster.KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X)

y2_pred = kmeans.predict(X)

**plots**

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 8))

colors = np.array(['blue', 'red'])

# Proxy handles: one legend entry per cluster colour.
legend_e = [Line2D([0], [0], color=c, marker='o', linestyle='',
                   label='Clase {}'.format(i)) for i, c in enumerate(colors)]

# Same data in both panels; each title says which model produced the colouring.
ax[0].scatter(X[:, 0], X[:, 1], color=colors[y1_pred])
ax[0].set_title('Hierarchical (single linkage)')
ax[0].legend(handles=legend_e, loc='upper right')

ax[1].scatter(X[:, 0], X[:, 1], color=colors[y2_pred])
ax[1].set_title('KMeans')
ax[1].legend(handles=legend_e, loc='upper right');

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Silhouette score of the single-linkage labeling.
silhouette_score(X, y1_pred)

In [None]:
# Report the silhouette score obtained by each of the two models.
print(f'Modelo 1 - Hierarchical : Silhouette Score:{silhouette_score(X, y1_pred)}')
print(f'Modelo 2 - KMeans       : Silhouette Score:{silhouette_score(X, y2_pred)}')

In [None]:
from sklearn.metrics import silhouette_samples


# Per-sample silhouette values for the single-linkage labels; only the first ten are shown.
silhouette_samples(X, y1_pred)[:10]

https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

# Elbow

**nº óptimo de clusters**


![elbow](images/elbow.png)

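$$WSS=Inercia=\sum_{i=1}^{n} \min_{\mu_j \in C} \lVert x_i - \mu_j \rVert^{2}$$

donde $C$ es el conjunto de centroides y $\mu_j$ el centroide más cercano a $x_i$.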

In [None]:
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below — consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
# Two Gaussian blobs in three dimensions; fixed seed for reproducible inertia values.
X, y = datasets.make_blobs(n_features=3, centers=2, random_state=42)

In [None]:
# 3D scatter of the blobs, coloured by their generating label.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y);

In [None]:
# Inertia (within-cluster sum of squares) for k=4; explicit n_init and fixed seed keep the value stable.
cluster.KMeans(n_clusters=4, n_init=10, random_state=42).fit(X).inertia_

In [None]:
# Inertia for k = 1..10 (explicit n_init and fixed seed for reproducible values).
[cluster.KMeans(n_clusters=k, n_init=10, random_state=42).fit(X).inertia_ for k in range(1, 11)]

In [None]:
# Pairs (k, inertia) for k = 1..9, unzipped into two parallel tuples for plotting.
n_clusters, inercia = zip(*[
    (k, cluster.KMeans(n_clusters=k, n_init=10, random_state=42).fit(X).inertia_)
    for k in range(1, 10)
])

In [None]:
# Display the cluster counts next to their inertias.
n_clusters, inercia

In [None]:
# Elbow curve: inertia against the number of clusters.
fig, ax = plt.subplots()

ax.plot(n_clusters, inercia)
ax.set_xlabel('N clusters')
ax.set_ylabel('Inercia');

In [None]:
# Installs yellowbrick into the kernel's environment.
# NOTE(review): the version is not pinned, so the installed release may differ between runs.
%pip install yellowbrick

In [None]:
from yellowbrick.cluster import KElbowVisualizer

In [None]:
# Base estimator for the elbow search; explicit n_init and fixed seed for reproducible scores.
modelo = cluster.KMeans(n_init=10, random_state=42)

# k=(1, 15) is handed to yellowbrick as the range of cluster counts to evaluate.
visual = KElbowVisualizer(modelo, k=(1, 15))

visual.fit(X)

# show() is the current name of the rendering call; poof() is deprecated.
visual.show();

In [None]:
# Standardized two-moons data; fixed seed so the elbow curves below are reproducible.
X, y = datasets.make_moons(n_samples=1500, noise=.05, random_state=42)
X = StandardScaler().fit_transform(X)

In [None]:
# Inertia of KMeans for k = 1..9 on the moons data.
inercias = []

for i in range(1, 10):

    # Explicit n_init and fixed seed so each inertia is reproducible.
    kmeans = cluster.KMeans(n_clusters=i, n_init=10, random_state=42).fit(X)

    inercias.append(kmeans.inertia_)

In [None]:
# Inertia as a function of the number of clusters (1 to 9).
plt.plot(range(1, 10), inercias);

In [None]:
# Elbow search on the moons data, reusing the `modelo` estimator defined earlier.
visual = KElbowVisualizer(modelo, k=(1, 15))
visual.fit(X)

# show() is the current name of the rendering call; poof() is deprecated.
visual.show();

# Rand Score

$$R=\frac{(a+b)}{(a+b+c+d)}$$


R pertenece al intervalo [0,1]

Sea $S=\{s_1, \ldots, s_n\}$ un conjunto de n elementos, y sean $X=\{X_1, \ldots, X_r\}$ e $Y=\{Y_1, \ldots, Y_s\}$ dos particiones de S; entonces:

+ a:=nº de pares de elementos en S que están en el mismo subconjunto de X y en el mismo subconjunto de Y
+ b:=nº de pares de elementos en S que están en diferentes subconjuntos de X y en diferentes subconjuntos de Y
+ c:=nº de pares de elementos en S que están en el mismo subconjunto de X y en diferente subconjunto de Y
+ d:=nº de pares de elementos en S que están en diferente subconjunto de X y en el mismo subconjunto de Y



Intuitivamente, _a+b_ es la correspondencia entre X e Y, c+d es la discordancia.


https://es.wikipedia.org/wiki/Partici%C3%B3n_de_un_conjunto

http://i3campus.co/CONTENIDOS/wikipedia/content/a/partici%25c3%25b3n_(matem%25c3%25a1tica).html#:~:text=En%20matem%C3%A1ticas%2C%20una%20partici%C3%B3n%20de,en%20subconjuntos%20disjuntos%20no%20vac%C3%ADos.

**Adjusted Rand Score**


$$AR=\frac{R - \mathrm{E}[R]}{\max(R) - \mathrm{E}[R]}$$

In [None]:
# Standardized two-moons data with true labels y; fixed seed so the Rand scores are reproducible.
X, y = datasets.make_moons(n_samples=1500, noise=.05, random_state=42)
X = StandardScaler().fit_transform(X)

In [None]:
# Single-linkage agglomerative clustering into two groups.
single = cluster.AgglomerativeClustering(linkage='single', n_clusters=2).fit(X)

# Cluster index assigned to each sample.
y1_pred = single.labels_

In [None]:
# KMeans with k=2; explicit n_init and fixed seed so the score below is reproducible.
kmeans = cluster.KMeans(n_clusters=2, n_init=10, random_state=42)
kmeans.fit(X)

y2_pred = kmeans.predict(X)

In [None]:
from sklearn.metrics import adjusted_rand_score as ars

In [None]:
# Adjusted Rand score of the single-linkage labels against the true labels y.
ars(y, y1_pred)      # hierarchical

In [None]:
# Adjusted Rand score of the KMeans labels against the true labels y.
ars(y, y2_pred)      # kmeans

# Visualización de Clusters

In [None]:
# Four overlapping blobs; fixed seed so the plots below are reproducible.
X, _ = datasets.make_blobs(n_samples=1500, centers=4, cluster_std=1.5, random_state=42)

In [None]:
# Shape of the generated blob data.
X.shape

In [None]:
# KMeans with k=4; explicit n_init and fixed seed so cluster indices are stable between runs.
kmeans = cluster.KMeans(n_clusters=4, n_init=10, random_state=42).fit(X)

In [None]:
# Cluster index assigned to each sample by the fitted model.
y_pred=kmeans.predict(X)

# Preview of the first ten assignments.
y_pred[:10]

In [None]:
# Blobs coloured by their assigned KMeans cluster.
plt.figure(figsize=(12, 6))
plt.scatter(X[:, 0], X[:, 1], c=y_pred);

In [None]:
# Centroid coordinates of the fitted KMeans model.
centers=kmeans.cluster_centers_

In [None]:
plt.figure(figsize=(12, 6))

# Data points coloured by cluster, with the centroids overlaid in red.
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.scatter(centers[:, 0], centers[:, 1], s=50, c='red', alpha=0.75);

**otro plot en 3D**

In [None]:
from scipy.spatial.transform import Rotation as R

In [None]:
# Four 2D blobs; fixed seed so the 3D and PCA plots below are reproducible.
X, _ = datasets.make_blobs(n_samples=1500, centers=4, cluster_std=1., random_state=42)

# Append a third Gaussian coordinate scaled by 0.5, drawn from a seeded generator.
rng = np.random.default_rng(42)
X = np.c_[X, 0.5 * rng.standard_normal((X.shape[0], 1))]

# Rotate 45 degrees about the y axis so the added coordinate mixes with the first one.
rot = R.from_euler('y', 45, degrees=True)

X = rot.apply(X)

In [None]:
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')

ax.scatter(X[:, 0], X[:, 1], X[:, 2])

# Same fixed range on all three axes, then a fixed camera angle.
ax.set(xlim=(-7, 7), ylim=(-7, 7), zlim=(-7, 7))
ax.view_init(elev=25, azim=45)
plt.show();

**reducción de dimensiones-viz**

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Shape of the data that will be fed to PCA.
X.shape

In [None]:
# Fit PCA with its default settings (no n_components given).
pca=PCA().fit(X)

# Project the data with the fitted PCA.
pca_x=pca.transform(X)

pca_x.shape

In [None]:
fig = plt.figure(figsize=(12, 8))

# Tall panel for the first two components, thin strip below for the third.
ax = [plt.subplot2grid((6, 1), (0, 0), rowspan=4),
      plt.subplot2grid((6, 1), (5, 0))]

ax[0].scatter(pca_x[:, 0], pca_x[:, 1])

ax[0].set_xlabel('PCA 1')
ax[0].set_ylabel('PCA 2')

ax[1].set_title('PCA 3', pad=-5)
ax[1].hlines(1, -7, 20)

# Constant height so the third component is drawn as ticks along the line y=1.
y = np.ones(X.shape[0])

# Marker size is passed as a number rather than a string.
ax[1].plot(pca_x[:, 2], y, '|', ms=20)
ax[1].axis('off')

plt.show();

**t-SNE**

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Iris dataset returned as a (features, labels) pair.
X,y = datasets.load_iris(return_X_y=True)

In [None]:
fig, ax = plt.subplots(1, 4, figsize=(15, 7))

# One panel per perplexity value. t-SNE is stochastic, so the seed is fixed
# to make the four embeddings reproducible between runs.
for i, p in enumerate([5, 30, 50, 100]):

    tsne = TSNE(perplexity=p, random_state=42)

    x_emb = tsne.fit_transform(X)

    ax[i].scatter(x_emb[:, 0], x_emb[:, 1], c=y)
    ax[i].set_title('Perplexity={}'.format(p))

plt.show();

**UMAP - Digits (dígitos 8x8 de sklearn, estilo MNIST)**

In [None]:
# scikit-learn's bundled digits dataset.
digitos=datasets.load_digits()

# Shape of the feature matrix.
digitos.data.shape

In [None]:
# Sample 300 viewed as an 8x8 grid of values.
digitos.data[300].reshape(8, 8)

In [None]:
# Target value of sample 300.
digitos.target[300]

In [None]:
# Render sample 300 as an 8x8 image.
plt.imshow(digitos.data[300].reshape(8, 8));

In [None]:
import pandas as pd

import warnings
# Ignore all warnings from here on.
# NOTE(review): a blanket filter also hides deprecation notices — consider a narrower one.
warnings.simplefilter('ignore')

from umap import UMAP

In [None]:
def get_umap(data, n, **umap_kwargs):
    """Embed ``data`` into ``n`` UMAP components and return them as a DataFrame.

    Parameters
    ----------
    data : array-like
        Samples to embed; passed unchanged to ``UMAP.fit_transform``.
    n : int
        Number of embedding dimensions (``n_components``).
    **umap_kwargs
        Extra keyword arguments forwarded to the ``UMAP`` constructor
        (for example ``random_state`` or ``n_neighbors``). When omitted,
        the call is identical to ``UMAP(n_components=n)``.

    Returns
    -------
    pandas.DataFrame
        One row per sample, with columns ``emb_1`` ... ``emb_n``.
    """
    reducer = UMAP(n_components=n, **umap_kwargs)
    emb = reducer.fit_transform(data)

    return pd.DataFrame(emb, columns=[f'emb_{i+1}' for i in range(n)])

In [None]:
# Two-component UMAP embedding of the digits features.
umap_df = get_umap(digitos.data, 2)

umap_df.head()

In [None]:
# Shape of the original feature matrix, for comparison with the embedding.
digitos.data.shape

In [None]:
# Shape of the embedding DataFrame.
umap_df.shape

In [None]:
# Embedding coloured by digit label.
plt.scatter(umap_df.emb_1, umap_df.emb_2, c=digitos.target, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')

# Discrete colour bar: one band and one tick per digit 0-9.
cbar = plt.colorbar(boundaries=np.arange(11) - 0.5)
cbar.set_ticks(np.arange(10))

plt.title('UMAP Projection');

**umap supervisado**

In [None]:
# Supervised UMAP: the digit labels are passed as y when fitting the embedding.
emb = UMAP().fit_transform(digitos.data, y=digitos.target)
umap_df = pd.DataFrame(emb, columns=['emb_1', 'emb_2'])

plt.scatter(
    umap_df.emb_1,
    umap_df.emb_2,
    c=digitos.target,
    cmap='Spectral',
    s=5,
)
plt.gca().set_aspect('equal', 'datalim')

# Discrete colour bar: one band and one tick per digit 0-9.
cbar = plt.colorbar(boundaries=np.arange(11) - 0.5)
cbar.set_ticks(np.arange(10))

plt.title('UMAP Projection');


In [None]:
# Rows whose first embedding coordinate is below -14.
# NOTE(review): UMAP is not seeded here, so this threshold may select nothing on another run — confirm against the plot.
umap_df[umap_df.emb_1<-14].head()

In [None]:
# `emb` is rebound to the result of UMAP().fit (not fit_transform), fitted with the digit labels as y.
emb = UMAP().fit(digitos.data, y=digitos.target)

emb

In [None]:
# Map the embedding-space point (3, -3) back to input space and draw the result as an 8x8 image.
inverso = emb.inverse_transform([[3, -3]])

plt.imshow(inverso.reshape(8, 8));

**umap en moons y en blobs**

In [None]:
# Smaller two-moons sample; the seed is fixed so the input data is the same on every run.
X, y = datasets.make_moons(n_samples=500, shuffle=True, noise=.05, random_state=42)

plt.scatter(X[:, 0], X[:, 1]);

In [None]:
# Two-component UMAP embedding of the moons data.
umap_df=get_umap(X, 2)

plt.scatter(umap_df.emb_1, umap_df.emb_2);

In [None]:
umap_df = get_umap(X, 3)

# 3D scatter of the three embedding coordinates.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(umap_df['emb_1'], umap_df['emb_2'], umap_df['emb_3']);

In [None]:
# Blobs: 5 centres in 8 dimensions (fixed seed so the sample is reproducible).

X, y = datasets.make_blobs(n_features=8, centers=5, random_state=42)

# Only two of the eight features are shown in this flat scatter.
plt.scatter(X[:, 2], X[:, 1]);

In [None]:
# Shape of the 8-feature blob data.
X.shape

In [None]:
# Two-component UMAP embedding of the blob data.
umap_df=get_umap(X, 2)

plt.scatter(umap_df.emb_1, umap_df.emb_2);

In [None]:
umap_df = get_umap(X, 3)

# 3D scatter of the three embedding coordinates.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(umap_df['emb_1'], umap_df['emb_2'], umap_df['emb_3']);

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# DBSCAN with default parameters, fitted on the UMAP embedding.
dbscan = DBSCAN().fit(umap_df)
pred = dbscan.labels_

In [None]:
# DBSCAN labels for every sample.
pred

In [None]:
# 3D embedding coloured by the DBSCAN label of each point.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(umap_df['emb_1'], umap_df['emb_2'], umap_df['emb_3'], c=pred);

In [None]:
# Adjusted Rand score of the DBSCAN labels against the blob labels y.
ars(y, pred)

https://towardsdatascience.com/how-to-evaluate-unsupervised-learning-models-3aa85bd98aa2


https://towardsdatascience.com/evaluating-goodness-of-clustering-for-unsupervised-learning-case-ccebcfd1d4f1