# **HANDS ON Clusterização**

---

clustering com sklearn

Neste Hands On vamos trabalhar com 2 modelos de clustering diferente
 * K-means
 * Affinity Propagation
 
---

Métodos amplamente utilizados no dia a dia quando se trata deste tipo de tarefa.
Base utilizada é a cms.csv que é uma base que desceve o problema de predizer o método contracepitivo escolhido por um conjunto específico de mulheres do 1987 National Indonesia contraceptive.

In [46]:
#Vamos importar as bibliotecas primeiramente
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [47]:
#Agora vamos ler nosso dataset
df_cms = pd.read_csv(r'C:\Users\Ademir\Desktop\BootCamp IGTI\ML\Módulo 3 - Seleção de Modelos de ML\cmc.csv')

In [48]:
df_cms.head() #conferindo nosso dataset

Unnamed: 0,Wifes_age,Wifes_education,Husbands_education,Number_of_children_ever_born,Wifes_religion,Wifes_now_working%3F,Husbands_occupation,Standard-of-living_index,Media_exposure,Contraceptive_method_used
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


*   Wifes_age: numerical 
*   Wifes_education: categorical 1=low, 2, 3, 4=high
*   Husbands_education: categorical 1=low, 2, 3, 4=high
*   Number_of_children_ever_born: numerical
*   Wifes_religion: binary 0=Non-Islam, 1=Islam
*   Wifes_now_working: binary 0=Yes, 1=No
*   Husbands_occupation: categorical 1, 2, 3, 4
*   Standard-of-living_index: categorical 1=low, 2, 3, 4=high
*   Media_exposure: binary 0=Good, 1=Not good
*   Contraceptive_method_used: (class attribute) 1=No-use 2=Long-term 3=Short-term

In [49]:
df_cms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   Wifes_age                     1473 non-null   int64
 1   Wifes_education               1473 non-null   int64
 2   Husbands_education            1473 non-null   int64
 3   Number_of_children_ever_born  1473 non-null   int64
 4   Wifes_religion                1473 non-null   int64
 5   Wifes_now_working%3F          1473 non-null   int64
 6   Husbands_occupation           1473 non-null   int64
 7   Standard-of-living_index      1473 non-null   int64
 8   Media_exposure                1473 non-null   int64
 9   Contraceptive_method_used     1473 non-null   int64
dtypes: int64(10)
memory usage: 115.2 KB


In [50]:
#Aparantemente os nossos dados estão completos e sem problemas de valores nulos
#Vamos avaliar os valores dos parâmetros
df_cms.describe()

Unnamed: 0,Wifes_age,Wifes_education,Husbands_education,Number_of_children_ever_born,Wifes_religion,Wifes_now_working%3F,Husbands_occupation,Standard-of-living_index,Media_exposure,Contraceptive_method_used
count,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0
mean,32.538357,2.958588,3.429735,3.261371,0.850645,0.749491,2.137814,3.133741,0.073999,1.919891
std,8.227245,1.014994,0.816349,2.358549,0.356559,0.433453,0.864857,0.976161,0.261858,0.876376
min,16.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
25%,26.0,2.0,3.0,1.0,1.0,0.0,1.0,3.0,0.0,1.0
50%,32.0,3.0,4.0,3.0,1.0,1.0,2.0,3.0,0.0,2.0
75%,39.0,4.0,4.0,4.0,1.0,1.0,3.0,4.0,0.0,3.0
max,49.0,4.0,4.0,16.0,1.0,1.0,4.0,4.0,1.0,3.0


## Vamos fazer o pré processamento

In [51]:
#Sabemos com as informações contidas no OpenML que na coluna de metodo contraceptivo usado
#temos 3 métodos e vamos passa-los mapeando-os
name_to_class = {
    1: 0,
    2: 1,
    3: 2
}

df_cms['Contraceptive_method_used'] = df_cms['Contraceptive_method_used'].map(name_to_class)
df_cms.head() #checando

Unnamed: 0,Wifes_age,Wifes_education,Husbands_education,Number_of_children_ever_born,Wifes_religion,Wifes_now_working%3F,Husbands_occupation,Standard-of-living_index,Media_exposure,Contraceptive_method_used
0,24,2,3,3,1,1,2,3,0,0
1,45,1,3,10,1,1,3,4,0,0
2,43,2,3,7,1,1,3,4,0,0
3,42,3,2,9,1,1,3,3,0,0
4,36,3,3,8,1,1,3,2,0,0


In [52]:
#Ela usa o get_dummies para transformar as colunas de dados categóricos, vamos usar também né

df2 = pd.get_dummies(df_cms, columns=['Wifes_education', 'Husbands_education', 'Wifes_religion', 
                                      'Wifes_now_working%3F', 'Husbands_occupation', 
                                      'Standard-of-living_index', 'Media_exposure'])

#Vamos vizualziar como ficou
df2.head()

Unnamed: 0,Wifes_age,Number_of_children_ever_born,Contraceptive_method_used,Wifes_education_1,Wifes_education_2,Wifes_education_3,Wifes_education_4,Husbands_education_1,Husbands_education_2,Husbands_education_3,...,Husbands_occupation_1,Husbands_occupation_2,Husbands_occupation_3,Husbands_occupation_4,Standard-of-living_index_1,Standard-of-living_index_2,Standard-of-living_index_3,Standard-of-living_index_4,Media_exposure_0,Media_exposure_1
0,24,3,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,45,10,0,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
2,43,7,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
3,42,9,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0
4,36,8,0,0,0,1,0,0,0,1,...,0,0,1,0,0,1,0,0,1,0


In [53]:
#Vamos analisar novamente os dados após o one hot encoding
df2.describe()

Unnamed: 0,Wifes_age,Number_of_children_ever_born,Contraceptive_method_used,Wifes_education_1,Wifes_education_2,Wifes_education_3,Wifes_education_4,Husbands_education_1,Husbands_education_2,Husbands_education_3,...,Husbands_occupation_1,Husbands_occupation_2,Husbands_occupation_3,Husbands_occupation_4,Standard-of-living_index_1,Standard-of-living_index_2,Standard-of-living_index_3,Standard-of-living_index_4,Media_exposure_0,Media_exposure_1
count,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,...,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0
mean,32.538357,3.261371,0.919891,0.103191,0.226748,0.278344,0.391718,0.029871,0.120842,0.238968,...,0.295995,0.288527,0.397149,0.01833,0.087576,0.155465,0.2926,0.464358,0.926001,0.073999
std,8.227245,2.358549,0.876376,0.304311,0.418871,0.448336,0.4883,0.170289,0.326054,0.426598,...,0.456644,0.453231,0.489473,0.134187,0.282774,0.36247,0.455111,0.498897,0.261858,0.261858
min,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,26.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,32.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,39.0,4.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
max,49.0,16.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [54]:
#Agora vamos passar nosso dataframe para os dados da foram que queremos para treinar nosso modelo
lista_df = list(df2)
y_label = df2['Contraceptive_method_used']
df2 = df2.drop('Contraceptive_method_used', axis = 1)

In [55]:
#Agora que fizemos isso, vamos importar a biblioteca de separação de modelos
from sklearn.model_selection import train_test_split

In [56]:
#Agora vamos fazer a separação dos nossos dados

xtrain, xtest, ytrain, ytest = train_test_split(df2,y_label, test_size = 0.25, random_state = 42)


## Baseline

---

vamos montar um baseline aleatório para comparar com os outros modelos

In [105]:
#Criando o baseline
baseline = np.random.choice([0, 1, 2], size = len(ytest))

print(baseline)

[1 2 2 0 2 1 1 0 0 0 0 1 2 1 1 2 1 0 0 2 0 2 2 1 0 2 2 0 0 0 2 2 2 0 2 2 0
 1 0 2 2 2 1 0 1 1 0 1 1 2 1 0 2 0 2 0 0 2 2 2 1 1 0 0 2 2 0 0 2 2 1 1 1 2
 1 1 1 2 2 0 1 0 2 0 1 2 1 1 1 0 1 1 0 2 2 0 0 0 1 1 1 2 2 1 1 1 0 2 2 1 1
 2 2 0 1 1 1 0 2 1 1 2 0 2 0 0 2 0 1 2 1 1 0 0 1 0 1 1 1 0 0 2 0 0 1 1 2 2
 0 1 2 2 1 2 0 2 1 2 1 1 2 1 0 1 2 0 0 0 2 2 1 2 1 2 0 2 1 2 2 1 2 1 1 0 2
 2 0 2 2 2 1 1 1 0 2 0 1 1 1 0 1 2 2 2 0 2 0 0 0 1 0 0 2 2 2 0 2 2 1 1 0 1
 0 0 2 0 2 0 2 0 1 1 2 1 0 0 0 1 2 2 0 2 1 1 2 2 2 2 2 1 2 2 2 0 2 2 0 0 2
 0 0 2 1 0 2 1 2 1 0 1 0 2 1 1 2 2 0 2 1 0 1 0 0 2 0 2 0 2 1 1 1 0 1 2 2 1
 2 0 2 0 1 0 0 2 0 0 2 0 2 1 2 1 2 0 2 2 2 2 2 0 0 2 0 1 2 0 2 2 2 1 0 2 1
 2 2 1 0 1 0 1 1 0 0 2 2 2 0 2 0 1 1 1 1 2 2 0 1 1 1 0 2 2 2 0 1 2 0 0 2]


In [106]:
#Agora que o baseline foi criado, vamos importar as métricas e avaliar 
from sklearn import metrics
from sklearn.metrics import cluster

In [120]:
#Aplicando as métricas de avaliação
baseline2 = np.array(baseline).reshape(-1,1)
print('Coeficiente de Silhueta\n', metrics.silhouette_score(ytest, baseline))
print('\nDavies-Bouldin Score\n', metrics.davies_bouldin_score(ytest, baseline))

#Outras métricas
print('\nMatriz de Contingência\n', metrics.cluster.contingency_matrix(ytest, baseline))
print('\nMutual Information\n', metrics.mutual_info_score(ytest, baseline))

ValueError: Expected 2D array, got 1D array instead:
array=[2 0 1 0 0 1 1 0 0 0 2 2 1 0 2 2 2 2 0 0 0 1 1 0 0 0 2 1 0 0 1 0 1 1 2 2 2
 2 0 2 0 0 2 1 0 0 2 1 1 2 1 0 0 2 2 2 0 0 1 0 1 2 0 1 2 0 0 0 0 0 2 0 1 0
 2 0 0 1 0 2 2 1 0 1 2 0 2 1 1 0 0 0 2 0 0 0 0 0 0 1 0 2 1 0 2 1 0 2 0 0 0
 0 2 2 1 0 0 0 0 0 0 1 2 2 1 1 2 2 1 0 0 1 1 2 2 0 0 1 0 2 2 1 2 0 2 0 0 2
 0 0 0 0 0 0 2 0 2 0 2 0 2 1 1 0 0 2 0 0 1 0 0 2 2 0 0 2 0 0 1 0 1 0 0 0 0
 0 0 0 2 0 2 0 2 2 2 1 2 1 0 0 1 0 2 1 1 0 0 0 2 0 2 1 2 2 2 1 0 1 0 2 0 1
 0 0 1 2 1 0 2 2 1 1 2 2 0 1 2 2 2 1 0 2 1 2 1 2 1 2 0 2 2 1 0 0 0 0 0 2 2
 0 1 2 0 0 2 2 0 1 0 1 0 2 1 0 1 2 2 2 1 1 1 0 2 0 0 2 2 1 0 1 1 2 1 2 2 2
 0 0 2 2 2 1 0 2 0 1 2 1 1 1 2 0 0 2 1 0 2 2 0 2 2 0 2 1 0 0 1 0 2 0 2 1 2
 2 0 0 2 2 0 0 2 0 0 2 0 0 0 2 0 2 1 0 2 1 0 1 2 0 0 0 2 1 1 1 1 0 2 2 2].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.