# Algoritmo KMEANS

## Imports

In [1]:
# importamos las librerias y bibliotecas que utilizaremos para el algoritmo KMEANS
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.gridspec as gridspec
from collections import Counter
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from pandas import DataFrame
from sklearn.pipeline import Pipeline
from sklearn import metrics
import numpy as np

## Funciones auxiliares

In [2]:
# Original code: https://bit.ly/2TNHBZ5
def plot_data(X, y):
    """Plot the first two columns of X as small dots, colored by binary label.

    Samples where ``y == 0`` are drawn in black and samples where ``y == 1``
    are drawn in red, on the current matplotlib axes.
    """
    for label, fmt in ((0, 'k.'), (1, 'r.')):
        selected = y == label
        plt.plot(X[:, 0][selected], X[:, 1][selected], fmt, markersize=2)

def plot_centroids(centroids, weights=None, circle_color='w', cross_color='k'):
    """Draw each centroid as a filled circle with a cross on top.

    When ``weights`` is given, only centroids whose weight exceeds one tenth
    of the largest weight are drawn.
    """
    if weights is not None:
        heavy_enough = weights > weights.max() / 10
        centroids = centroids[heavy_enough]

    xs = centroids[:, 0]
    ys = centroids[:, 1]

    # Background circle first, then the cross above it (higher zorder).
    plt.scatter(xs, ys, marker='o', s=30, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(xs, ys, marker='x', s=50, linewidths=50,
                color=cross_color, zorder=11, alpha=1)

def plot_decision_boundaries(clusterer, X, y, resolution=1000, show_centroids=True):
    """Shade the regions a fitted clusterer assigns to each cluster, in 2-D.

    A ``resolution`` x ``resolution`` grid spanning the range of the first two
    columns of ``X`` (padded by 0.1 on each side) is classified with
    ``clusterer.predict``. The predicted regions are drawn as filled contours
    with black outlines, the samples are overlaid via ``plot_data(X, y)``, and
    the cluster centers are optionally drawn via ``plot_centroids``.
    """
    lower = X.min(axis=0) - 0.1
    upper = X.max(axis=0) + 0.1

    grid_x = np.linspace(lower[0], upper[0], resolution)
    grid_y = np.linspace(lower[1], upper[1], resolution)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict a cluster for every grid point, then fold back into grid shape.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    regions = clusterer.predict(grid_points).reshape(xx.shape)

    extent = (lower[0], upper[0], lower[1], upper[1])
    plt.contourf(regions, extent=extent, cmap="Pastel2")
    plt.contour(regions, extent=extent, linewidths=1, colors='k')

    plot_data(X, y)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)

In [3]:
def purity_score(y_true, y_pred):
    """Return the purity of a clustering with respect to the true labels.

    Builds the contingency matrix of ``y_true`` against ``y_pred``, takes the
    largest count in each column (one column per predicted cluster), and
    divides the sum of those maxima by the total number of samples.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    dominant_per_cluster = table.max(axis=0)
    return dominant_per_cluster.sum() / table.sum()

## Lectura del conjunto de datos

In [4]:
# Load the dataset from a CSV file located in the working directory
df_ini = pd.read_csv('dataset-equilibrado-1.csv')

## Visualización del conjunto de datos

In [5]:
# Display the loaded DataFrame
df_ini 

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,9046,67.00,0,1,228.69,36.6,1
1,51676,61.00,0,0,202.21,,1
2,31112,80.00,0,1,105.92,32.5,1
3,60182,49.00,0,0,171.23,34.4,1
4,1665,79.00,1,0,174.12,24.0,1
...,...,...,...,...,...,...,...
1778,45554,1.24,0,0,62.40,22.1,0
1779,32884,80.00,1,0,210.96,31.8,0
1780,55744,2.00,0,0,76.25,20.1,0
1781,28414,50.00,0,0,103.48,29.1,0


In [None]:
# Check how many samples the dataset has for each value of the "stroke" label
df_ini["stroke"].value_counts()

## Preparación del conjunto de datos

In [None]:
# Drop columns that are irrelevant and could distort the algorithm's behavior.
# "id" is removed as before (presumably a record identifier - confirm).
df_ini = df_ini.drop(["id"], axis=1)
# "Unnamed: 0" is the row index that was written into the CSV. In the preview
# of the data the rows are ordered by class (stroke == 1 first, stroke == 0
# last), so keeping it as a feature would leak the label into the clustering
# and into the Random Forest feature selection. errors="ignore" keeps this a
# no-op for CSV files that do not contain that column.
df_ini = df_ini.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# One-hot encode the categorical columns so that every feature is numeric.
# Only the columns that actually exist in df_ini are encoded: asking
# pd.get_dummies for a column that is not in the frame raises a KeyError, and
# the preview of this CSV does not show any of these columns.
categorical_columns = ['gender','ever_married','work_type','Residence_type','smoking_status']
present_columns = [name for name in categorical_columns if name in df_ini.columns]
missing_columns = [name for name in categorical_columns if name not in df_ini.columns]
if missing_columns:
    # Make the skip visible instead of silently ignoring a possible typo.
    print("Skipping categorical columns not found in the dataset:", missing_columns)
# With nothing to encode, keep an independent copy so df never aliases df_ini.
df = pd.get_dummies(df_ini, columns=present_columns) if present_columns else df_ini.copy()

In [None]:
# Display the encoded DataFrame to check that the changes were applied correctly
df

In [None]:
# Plot every feature: one subplot per column, overlaying the distribution of
# the stroke samples (drawn first) and of the non-stroke samples.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; consider
# migrating to sns.histplot / sns.kdeplot when the plots are next revisited.
features = df.drop("stroke", axis=1)

plt.figure(figsize=(12,32))
gs = gridspec.GridSpec(8, 4)  # 8 x 4 grid: room for up to 32 features
gs.update(hspace=0.8)

for i, f in enumerate(features.columns):
    ax = plt.subplot(gs[i])
    for stroke_value in (1, 0):
        sns.distplot(df.loc[df["stroke"] == stroke_value, f])
    ax.set_xlabel('')
    ax.set_title('feature: ' + str(f))

plt.show()

In [None]:
# Scatter plot of two features (age vs. average glucose level):
# non-stroke samples in yellow, stroke samples in red on top.
plt.figure(figsize=(12, 6))
for stroke_value, color in ((0, "y"), (1, "r")):
    subset = df[df['stroke'] == stroke_value]
    plt.scatter(subset["age"], subset["avg_glucose_level"], c=color, marker=".")
plt.xlabel("age", fontsize=14)
plt.ylabel("avg_glucose_level", fontsize=14)
plt.show()

## Kmeans con un conjunto de datos de dos dimensiones

In [None]:
# Keep only two features for the 2-D experiment, and the stroke label
# separately (the label is used only to inspect the clusters afterwards)
X_2 = df[["age", "avg_glucose_level"]].copy()
y = df["stroke"].copy()
X_2

In [None]:
# Build a pipeline that fills missing values with the column median and then
# scales the data with RobustScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('rbst_scaler', RobustScaler()),
    ])

In [None]:
# Fit the pipeline on the two selected features and apply it
# (median imputation followed by robust scaling)
X = num_pipeline.fit_transform(X_2)

In [None]:
# Convert the transformed array back into a pandas DataFrame.
# The row index comes from X_2, the frame that was just transformed, so the
# rows stay aligned with the original samples. (y_df is not defined yet at
# this point of the notebook, so it cannot be used here.)
X = pd.DataFrame(X, columns=X_2.columns, index=X_2.index)

In [None]:
# Display the transformed DataFrame to check that the changes were applied correctly
X

In [None]:
# Generate the clusters for the unlabeled two-feature data:
# fit KMeans with 5 clusters and get the cluster index assigned to each sample
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X)

In [None]:
# Draw the KMeans decision regions over the two scaled features, with the
# samples colored by their true stroke label
plt.figure(figsize=(12, 6))
plot_decision_boundaries(kmeans, X.values, df["stroke"].values)
plt.xlabel("age", fontsize=14)
plt.ylabel("avg_glucose_level", fontsize=14)
plt.show()

In [None]:
# Report, for every cluster, its size and how many of its samples are stroke cases
counter = Counter(clusters.tolist())
bad_counter = Counter(clusters[df['stroke'] == 1].tolist())

report_line = "Label {0} has {1} samples - {2} are stroke samples"
for key in sorted(counter):
    print(report_line.format(key, counter[key], bad_counter[key]))

## Kmeans con un conjunto de datos multidimensional

In [None]:
# Use every column except the label as features; keep the stroke label
# separately to inspect and evaluate the clusters
X_df = df.drop("stroke", axis=1)
y_df = df["stroke"].copy()

In [None]:
# Build a pipeline that fills missing values with the column median and then
# scales the data with RobustScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('rbst_scaler', RobustScaler()),
    ])

In [None]:
# Fit the pipeline on the full feature set and apply it
# (median imputation followed by robust scaling)
X = num_pipeline.fit_transform(X_df)

In [None]:
# Convert the transformed array back into a pandas DataFrame, restoring the
# column names of X_df and the row index of the labels
X = pd.DataFrame(X, columns=X_df.columns, index=y_df.index)

In [None]:
# Display the transformed DataFrame to check that the changes were applied correctly
X

In [None]:
# Fit KMeans with 5 clusters on the full scaled feature set and get the
# cluster index assigned to each sample
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X)

In [None]:
# Inspect the clusters that were formed: size of each one and how many of its
# samples are stroke cases
counter = Counter(clusters.tolist())
bad_counter = Counter(clusters[y_df == 1].tolist())

report_line = "Label {0} has {1} samples - {2} are stroke samples"
for key in sorted(counter):
    print(report_line.format(key, counter[key], bad_counter[key]))

## Reducción del número de características

### Aplicamos selección de características con _Random Forest_


In [None]:
# Use a Random Forest for feature selection: fit it on the scaled features
# with the stroke label as target so its feature importances can be read
from sklearn.ensemble import RandomForestClassifier

clf_rnd = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
clf_rnd.fit(X, y_df)

In [None]:
# Map each feature name to its Random Forest importance and rank the
# features from most to least important
feature_importances = dict(zip(X.columns, clf_rnd.feature_importances_))
importance_series = pd.Series(feature_importances)
feature_importances_sorted = importance_series.sort_values(ascending=False)

In [None]:
# Reduce the dataset to the 5 most important features
# (the top 5 entries of the sorted importance ranking)
X_reduced = X[list(feature_importances_sorted.head(5).index)].copy()

In [None]:
# Display the reduced feature set
X_reduced

### Entrenamiento de KMEANS con el conjunto de datos reducido

In [None]:
# Fit KMeans with 5 clusters on the reduced feature set and get the cluster
# index assigned to each sample
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X_reduced)

In [None]:
# Inspect the clusters that were formed: size of each one and how many of its
# samples are stroke cases
counter = Counter(clusters.tolist())
bad_counter = Counter(clusters[y_df == 1].tolist())

report_line = "Label {0} has {1} samples - {2} are stroke samples"
for key in sorted(counter):
    print(report_line.format(key, counter[key], bad_counter[key]))

## Evaluación de los resultados

In [None]:
# Compute the purity score; note that this metric receives the true labels
print("Purity Score:", purity_score(y_df, clusters))

In [None]:
# Compute the silhouette coefficient; note that the true labels are not
# passed, only the features and the cluster assignments
print("Shiloutte: ", metrics.silhouette_score(X_reduced, clusters, sample_size=10000))

In [None]:
# Compute the Calinski-Harabasz score; note that the true labels are not
# passed, only the features and the cluster assignments
print("Calinski harabasz: ", metrics.calinski_harabasz_score(X_reduced, clusters))