In [39]:
import pandas as pd
import numpy as np
from utils import do_cv_knn, do_cv_svm, imprimir_estatisticas, selecionar_melhor_k_knn
from tqdm.notebook import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
df = pd.read_csv("employee.csv")
df.drop(df.columns[:-4], axis=1, inplace=True)

In [3]:
y = df['left'].values
X = df.drop('left', axis=1)
df

Unnamed: 0,promotion_last_5years,department,salary,left
0,0,sales,low,1
1,0,sales,medium,1
2,0,sales,medium,1
3,0,sales,low,1
4,0,sales,low,1
...,...,...,...,...
14994,0,support,low,1
14995,0,support,low,1
14996,0,support,low,1
14997,0,support,low,1


In [4]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [5]:
transformers = [
    ('oe_department', OrdinalEncoder(), ['department'] ),
    ('oe_salary', OrdinalEncoder(
                    categories=[['low', 'medium', 'high']]), ['salary'])
]

In [8]:
ct = ColumnTransformer(transformers, remainder='passthrough')
X_oe = ct.fit_transform(X)
X_oe

array([[7., 0., 0.],
       [7., 1., 0.],
       [7., 1., 0.],
       ...,
       [8., 0., 0.],
       [8., 0., 0.],
       [8., 0., 0.]])

In [9]:
accs = do_cv_knn(X_oe, y, 10, range(1,20,2))

Folds avaliados:   0%|          | 0/10 [00:00<?, ?it/s]

In [10]:
imprimir_estatisticas(accs)

Resultados: 0.75 +- 0.02, min: 0.70, max: 0.76


In [9]:
accs_svm = do_cv_svm(X_oe, y, 10, [1, 10, 100, 1000], ['auto', 'scale'])

Folds avaliados:   0%|          | 0/10 [00:00<?, ?it/s]

In [10]:
imprimir_estatisticas(accs_svm)

Resultados: 0.76 +- 0.00, min: 0.76, max: 0.76


In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
transformers = [
    ('oe_department', OneHotEncoder(), ['department'] ),
    ('oe_salary', OneHotEncoder(), ['salary'])
]

In [13]:
ct_oh = ColumnTransformer(transformers, remainder='passthrough')
X_oh = ct_oh.fit_transform(X).todense()
X_oh, X_oh.shape

(matrix([[0., 0., 0., ..., 1., 0., 0.],
         [0., 0., 0., ..., 0., 1., 0.],
         [0., 0., 0., ..., 0., 1., 0.],
         ...,
         [0., 0., 0., ..., 1., 0., 0.],
         [0., 0., 0., ..., 1., 0., 0.],
         [0., 0., 0., ..., 1., 0., 0.]]),
 (14999, 14))

In [16]:
ct_oh.get_feature_names()



['oe_department__x0_IT',
 'oe_department__x0_RandD',
 'oe_department__x0_accounting',
 'oe_department__x0_hr',
 'oe_department__x0_management',
 'oe_department__x0_marketing',
 'oe_department__x0_product_mng',
 'oe_department__x0_sales',
 'oe_department__x0_support',
 'oe_department__x0_technical',
 'oe_salary__x0_high',
 'oe_salary__x0_low',
 'oe_salary__x0_medium',
 'promotion_last_5years']

In [17]:
accs = do_cv_knn(X_oh, y, 10, range(1,20,2))
imprimir_estatisticas(accs)

Folds avaliados:   0%|          | 0/10 [00:00<?, ?it/s]





Resultados: 0.75 +- 0.01, min: 0.73, max: 0.76


In [18]:
transformers = [
    ('oe_department', OneHotEncoder(), ['department'] ),
    ('oe_salary', OrdinalEncoder(
                    categories=[['low', 'medium', 'high']]), ['salary'])
]
ct_ohoe = ColumnTransformer(transformers, remainder='passthrough')
X_ohoe = ct_ohoe.fit_transform(X).todense()
X_ohoe, X_ohoe.shape

(matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 1., 0.],
         [0., 0., 0., ..., 0., 1., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
 (14999, 12))

In [19]:
accs = do_cv_knn(X_ohoe, y, 10, range(1,20,2))
imprimir_estatisticas(accs)

Folds avaliados:   0%|          | 0/10 [00:00<?, ?it/s]





Resultados: 0.75 +- 0.01, min: 0.73, max: 0.76


# Valores Faltantes

In [20]:
df = pd.read_csv("employee_nans.csv")
df.drop(df.columns[:-4], axis=1, inplace=True)
y = df['left'].values
X = df.drop('left', axis=1)
df

Unnamed: 0,promotion_last_5years,department,salary,left
0,0.0,sales,low,1
1,0.0,sales,medium,1
2,0.0,sales,medium,1
3,0.0,sales,low,1
4,0.0,sales,low,1
...,...,...,...,...
14994,0.0,support,low,1
14995,0.0,support,low,1
14996,0.0,support,low,1
14997,0.0,support,low,1


In [22]:
df.isnull().sum()

promotion_last_5years    737
department               765
salary                   777
left                       0
dtype: int64

In [23]:
df.isnull().sum().sum()

2279

## Excluir as instâncias com valores faltantes

In [24]:
df.dropna(inplace=True)
df.shape

(12815, 4)

In [25]:
df

Unnamed: 0,promotion_last_5years,department,salary,left
0,0.0,sales,low,1
1,0.0,sales,medium,1
2,0.0,sales,medium,1
3,0.0,sales,low,1
4,0.0,sales,low,1
...,...,...,...,...
14994,0.0,support,low,1
14995,0.0,support,low,1
14996,0.0,support,low,1
14997,0.0,support,low,1


In [26]:
y = df['left'].values
X = df.drop('left', axis=1)

In [27]:
transformers = [
    ('oe_department', OneHotEncoder(), ['department'] ),
    ('oe_salary', OrdinalEncoder(
                    categories=[['low', 'medium', 'high']]), ['salary'])
]
ct_ohoe = ColumnTransformer(transformers, remainder='passthrough')
X_ohoe = ct_ohoe.fit_transform(X).todense()
X_ohoe, X_ohoe.shape

(matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 1., 0.],
         [0., 0., 0., ..., 0., 1., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
 (12815, 12))

In [57]:
X_ohoe.shape

(12815, 12)

In [28]:
accs = do_cv_knn(X_ohoe, y, 10, range(1,20,2))
imprimir_estatisticas(accs)

Folds avaliados:   0%|          | 0/10 [00:00<?, ?it/s]





Resultados: 0.76 +- 0.01, min: 0.73, max: 0.76


## Substituição de valores faltantes (*imputação*)

In [29]:
df = pd.read_csv("employee_nans.csv")
df.drop(df.columns[:-4], axis=1, inplace=True)
y = df['left'].values
X = df.drop('left', axis=1)
df

Unnamed: 0,promotion_last_5years,department,salary,left
0,0.0,sales,low,1
1,0.0,sales,medium,1
2,0.0,sales,medium,1
3,0.0,sales,low,1
4,0.0,sales,low,1
...,...,...,...,...
14994,0.0,support,low,1
14995,0.0,support,low,1
14996,0.0,support,low,1
14997,0.0,support,low,1


In [30]:
from sklearn.impute import SimpleImputer

In [31]:
atrib_numericos = ['promotion_last_5years']
atrib_categoricos = ['department', 'salary']

transformers = [
    ('imp_num', SimpleImputer(strategy='most_frequent'), atrib_numericos),
    ('imp_cat', SimpleImputer(strategy='constant', 
                              fill_value='desconhecido'),
                              atrib_categoricos)
]

ct_imp = ColumnTransformer(
    transformers
)
X_imp_values = ct_imp.fit_transform(X)

In [33]:
X_imp_values

array([[0.0, 'sales', 'low'],
       [0.0, 'sales', 'medium'],
       [0.0, 'sales', 'medium'],
       ...,
       [0.0, 'support', 'low'],
       [0.0, 'support', 'low'],
       [0.0, 'support', 'low']], dtype=object)

In [34]:
X_imputed = pd.DataFrame(X_imp_values, 
                         columns=[*atrib_numericos, *atrib_categoricos])

In [35]:
X_imputed

Unnamed: 0,promotion_last_5years,department,salary
0,0.0,sales,low
1,0.0,sales,medium
2,0.0,sales,medium
3,0.0,sales,low
4,0.0,sales,low
...,...,...,...
14994,0.0,support,low
14995,0.0,support,low
14996,0.0,support,low
14997,0.0,support,low


In [36]:
X_imputed.isnull().sum().sum()

0

In [37]:
transformers = [
    ('oe_department', OneHotEncoder(), ['department'] ),
    ('oe_salary', OrdinalEncoder(), ['salary'])
]
ct_ohoe = ColumnTransformer(transformers, remainder='passthrough')
X_ohoe = ct_ohoe.fit_transform(X_imputed).todense()
X_ohoe, X_ohoe.shape

(matrix([[0., 0., 0., ..., 0., 2., 0.],
         [0., 0., 0., ..., 0., 3., 0.],
         [0., 0., 0., ..., 0., 3., 0.],
         ...,
         [0., 0., 0., ..., 0., 2., 0.],
         [0., 0., 0., ..., 0., 2., 0.],
         [0., 0., 0., ..., 0., 2., 0.]]),
 (14999, 13))

In [77]:
X_ohoe.shape

(14999, 13)

In [38]:
accs = do_cv_knn(X_ohoe, y, 10, range(1,20,2))
imprimir_estatisticas(accs)

Folds avaliados:   0%|          | 0/10 [00:00<?, ?it/s]





Resultados: 0.76 +- 0.00, min: 0.75, max: 0.76


In [None]:
pd.re

In [43]:
def do_cv_knn(X, y, cv_splits, ks, imputer=None, encoder=None):

    skf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=1)

    acuracias = []
    classification_reports = []
    
    pgb = tqdm(total=cv_splits, desc='Folds avaliados')
    
    for treino_idx, teste_idx in skf.split(X, y):

        if isinstance(X, pd.DataFrame):
            X_treino = X.iloc[treino_idx]
            X_teste = X.iloc[teste_idx]
            
            if imputer is not None:
                X_treino, X_teste = imputer(X_treino, X_teste)
                
            if encoder is not None:
                X_treino, X_teste = encoder(X_treino, X_teste)
                
            X_treino = X_treino.values
            X_teste = X_teste.values
            
        else:
            X_treino = X[treino_idx]
            X_teste = X[teste_idx]
        
        y_treino = y[treino_idx]
        y_teste = y[teste_idx]

        X_treino, X_val, y_treino, y_val = train_test_split(X_treino, y_treino, stratify=y_treino, test_size=0.2, random_state=1)

        ss = StandardScaler()
        ss.fit(X_treino)
        X_treino = ss.transform(X_treino)
        X_teste = ss.transform(X_teste)
        X_val = ss.transform(X_val)

        knn, _, _ = selecionar_melhor_k_knn(ks, X_treino, X_val, y_treino, y_val)
        pred = knn.predict(X_teste)

        acuracias.append(accuracy_score(y_teste, pred))
        
        pgb.update(1)
        
    pgb.close()
    
    return acuracias

In [41]:
def imputar_dados(X_treino, X_teste):
    atrib_numericos = ['promotion_last_5years']
    atrib_categoricos = ['department', 'salary']

    transformers = [
        ('imp_num', SimpleImputer(strategy='most_frequent'), atrib_numericos),
        ('imp_cat', SimpleImputer(strategy='constant', 
                                  fill_value='desconhecido'),
                                  atrib_categoricos)
    ]

    ct_imp = ColumnTransformer(
        transformers
    )
    X_treino = ct_imp.fit_transform(X_treino)
    X_teste = ct_imp.transform(X_teste)
    
    df_x_treino = pd.DataFrame(X_treino, columns=[*atrib_numericos, *atrib_categoricos])
    df_x_teste = pd.DataFrame(X_teste, columns=[*atrib_numericos, *atrib_categoricos])
    
    return df_x_treino, df_x_teste

In [49]:
def codificar_colunas(X_treino, X_teste):
    
    transformers = [
        ('oe_department', OneHotEncoder(), ['department'] ),
        ('oe_salary', OrdinalEncoder(), ['salary'])
    ]
    
    ct_ohoe = ColumnTransformer(transformers, remainder='passthrough')
    X_treino = ct_ohoe.fit_transform(X_treino).todense()
    X_teste = ct_ohoe.transform(X_teste).todense()
    
    df_x_treino = pd.DataFrame(X_treino, columns=[*ct_ohoe.transformers_[0][1].get_feature_names(), 
                                                  'salary', 
                                                  'promotion_last_5years'])
    df_x_teste = pd.DataFrame(X_teste, columns=[*ct_ohoe.transformers_[0][1].get_feature_names(), 
                                                  'salary', 
                                                  'promotion_last_5years'])    
    
    return df_x_treino, df_x_teste

In [51]:
accs_knn = do_cv_knn(X, y, 10, range(1,20,2), imputar_dados, codificar_colunas)
imprimir_estatisticas(accs_knn)

Folds avaliados:   0%|          | 0/10 [00:00<?, ?it/s]



Resultados: 0.76 +- 0.00, min: 0.75, max: 0.76
