### 1. Loading Libraries and Data

In [228]:
#Basic needed libraries
import pandas as pd

#Preprocessing
from sklearn.preprocessing import LabelEncoder

#Metrics and models
from sklearn.metrics import silhouette_score
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes

In [160]:
data = pd.DataFrame(data={
    'nivel_cargo': ['junior', 'senior', 'junior', 'pleno', 'pleno', 'senior'],
    'tempo_carreira': [1, 7, 2, 5, 4, 10],
    'salario': [2000, 10500, 1500, 6800, 5500, 12000]
})

### 2. Data Preparation

In [210]:
data_modes = data.copy()

In [211]:
data_modes

Unnamed: 0,nivel_cargo,tempo_carreira,salario
0,junior,1,2000
1,senior,7,10500
2,junior,2,1500
3,pleno,5,6800
4,pleno,4,5500
5,senior,10,12000


In [216]:
data_modes.describe()

Unnamed: 0,tempo_carreira,salario
count,6.0,6.0
mean,4.833333,6383.333333
std,3.311596,4301.356375
min,1.0,1500.0
25%,2.5,2875.0
50%,4.5,6150.0
75%,6.5,9575.0
max,10.0,12000.0


In [217]:
data_modes['tempo_carreira'] = data_modes.apply(lambda x: 'Iniciante' if x['tempo_carreira']<5 else 'Experiente', axis=1)
data_modes['salario'] = data_modes.apply(lambda x: 'Remuneração baixa' if x['salario']<5000 else 'Remuneração alta', axis=1)

In [218]:
data_modes

Unnamed: 0,nivel_cargo,tempo_carreira,salario
0,junior,Iniciante,Remuneração baixa
1,senior,Experiente,Remuneração alta
2,junior,Iniciante,Remuneração baixa
3,pleno,Experiente,Remuneração alta
4,pleno,Iniciante,Remuneração alta
5,senior,Experiente,Remuneração alta


### 3. Model

#### 3.1. KPrototype model

In [162]:
categorical_indices = [0]
numerical_indices = [1, 2]

In [221]:
kp = KPrototypes(n_clusters=3, init='Cao', n_init=1, verbose=True)
kp_clusters = kp.fit_predict(data.values, categorical=categorical_indices)

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 0, ncost: 2095005.5


#### 3.2. KModes model

In [220]:
km = KModes(n_clusters=3, init='Cao', n_init=1, verbose=True)
km_clusters = km.fit_predict(data_modes)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 1.0


### 4. Avaliação dos clusters

#### 4.1. Visualizando os clusters

In [222]:
print(kp_clusters)
print(km_clusters)

[0 1 0 2 2 1]
[1 0 1 0 2 0]


In [223]:
data

Unnamed: 0,nivel_cargo,tempo_carreira,salario
0,junior,1,2000
1,senior,7,10500
2,junior,2,1500
3,pleno,5,6800
4,pleno,4,5500
5,senior,10,12000


#### 4.2. Cálculo do índice de silhueta

In [230]:
#Transformando variaveis categoricas em numericas para o cáculo do índice Silhouette
for col in ['nivel_cargo']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

In [231]:
data

Unnamed: 0,nivel_cargo,tempo_carreira,salario
0,0,1,2000
1,2,7,10500
2,0,2,1500
3,1,5,6800
4,1,4,5500
5,2,10,12000


In [236]:
kp_silhouette = silhouette_score(data.values, kp_clusters)
kp_silhouette

0.7553250749421779

In [237]:
km_silhouette = silhouette_score(data.values, km_clusters)
km_silhouette

0.3314820227607887

Conclusão: De acordo com o índice de Silhouette, com a base de dados utilizada e os tratamentos feitos, o modelo KPrototype teve melhor desempenho do que o modelo KModes.