# Imputers

Para trabalhar com valores nulos, utilizaremos os Imputers.

Há dois tipos de imputers:

- univariados: analisam apenas um atributo (coluna/campo) para decidir como preencher os valores nulos;
- multivariados: analisam todo o conjunto de dados para definir como preencher os valores nulos.

In [1]:
# Importando biblioteca
import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
# Build the sample data: a single 'idade' (age) column with one missing entry
df = pd.Series(
    [14, 25, 23, 31, None, 18, 20, 15, 17, 60, 31],
    name='idade',
).to_frame()

In [3]:
df

Unnamed: 0,idade
0,14.0
1,25.0
2,23.0
3,31.0
4,
5,18.0
6,20.0
7,15.0
8,17.0
9,60.0
10,31.0


## Constant Imputer

In [4]:
# Create an imputer that replaces missing values with the constant -1
constant_imputer = SimpleImputer(strategy='constant', fill_value=-1)
# Bare expression so the notebook displays the estimator
constant_imputer

In [5]:
# Fit constant_imputer on df
constant_imputer.fit(df)

In [6]:
# With the imputer fitted, transform the data.
# The result is returned as an array; df itself is not assigned to here.
constant_imputer.transform(df)

array([[14.],
       [25.],
       [23.],
       [31.],
       [-1.],
       [18.],
       [20.],
       [15.],
       [17.],
       [60.],
       [31.]])

In [7]:
# Store the imputed values in a new 'constant' column of df.
# Only the 'idade' column is handed to transform: the imputer was fitted on
# that single column, so passing the whole df would fail as soon as df has
# gained extra columns (for example, when this cell is run a second time).
# transform returns an (n, 1) array; ravel flattens it to 1-D for the column.
df['constant'] = constant_imputer.transform(df[['idade']]).ravel()
df

Unnamed: 0,idade,constant
0,14.0,14.0
1,25.0,25.0
2,23.0,23.0
3,31.0,31.0
4,,-1.0
5,18.0,18.0
6,20.0,20.0
7,15.0,15.0
8,17.0,17.0
9,60.0,60.0
10,31.0,31.0


In [8]:
# Alternative: perform the same replacement with plain pandas.
# Start from an independent copy of the original 'idade' column.
df_pandas = df[['idade']].copy()

df_pandas

Unnamed: 0,idade
0,14.0
1,25.0
2,23.0
3,31.0
4,
5,18.0
6,20.0
7,15.0
8,17.0
9,60.0
10,31.0


In [9]:
# Replace every missing entry of df_pandas with the sentinel value, in place
constant = -1
df_pandas.fillna(constant, inplace=True)
df_pandas

Unnamed: 0,idade
0,14.0
1,25.0
2,23.0
3,31.0
4,-1.0
5,18.0
6,20.0
7,15.0
8,17.0
9,60.0
10,31.0


## Mean Imputer

In [10]:
# Work on an independent single-column copy of 'idade', since df has
# already gained extra columns by this point
df2 = df[['idade']].copy()

df2

Unnamed: 0,idade
0,14.0
1,25.0
2,23.0
3,31.0
4,
5,18.0
6,20.0
7,15.0
8,17.0
9,60.0
10,31.0


In [11]:
# Create an imputer that uses the 'mean' strategy to fill missing values
mean_imputer = SimpleImputer(strategy='mean')
# Bare expression so the notebook displays the estimator
mean_imputer

In [12]:
# Fit mean_imputer on df2 (the single-column copy of 'idade')
mean_imputer.fit(df2)

In [13]:
# Transform df2; the result is returned as an array with the gap filled in
mean_imputer.transform(df2)

array([[14. ],
       [25. ],
       [23. ],
       [31. ],
       [25.4],
       [18. ],
       [20. ],
       [15. ],
       [17. ],
       [60. ],
       [31. ]])

In [14]:
# Add the mean-imputed values to df as a new 'mean' column
df['mean'] = mean_imputer.transform(df2)
df

Unnamed: 0,idade,constant,mean
0,14.0,14.0,14.0
1,25.0,25.0,25.0
2,23.0,23.0,23.0
3,31.0,31.0,31.0
4,,-1.0,25.4
5,18.0,18.0,18.0
6,20.0,20.0,20.0
7,15.0,15.0,15.0
8,17.0,17.0,17.0
9,60.0,60.0,60.0
10,31.0,31.0,31.0


## Median Imputer

In [15]:
# Create an imputer that uses the 'median' strategy to fill missing values
median_imputer = SimpleImputer(strategy='median')
# Bare expression so the notebook displays the estimator
median_imputer

In [16]:
# Fit median_imputer on df2 (the single-column copy of 'idade')
median_imputer.fit(df2)
# Bare expression so the notebook displays the estimator
median_imputer

In [17]:
# Transform df2; the result is returned as an array with the gap filled in
median_imputer.transform(df2)

array([[14. ],
       [25. ],
       [23. ],
       [31. ],
       [21.5],
       [18. ],
       [20. ],
       [15. ],
       [17. ],
       [60. ],
       [31. ]])

In [18]:
# Add the median-imputed values to df as a new 'median' column
df['median'] = median_imputer.transform(df2)
df

Unnamed: 0,idade,constant,mean,median
0,14.0,14.0,14.0,14.0
1,25.0,25.0,25.0,25.0
2,23.0,23.0,23.0,23.0
3,31.0,31.0,31.0,31.0
4,,-1.0,25.4,21.5
5,18.0,18.0,18.0,18.0
6,20.0,20.0,20.0,20.0
7,15.0,15.0,15.0,15.0
8,17.0,17.0,17.0,17.0
9,60.0,60.0,60.0,60.0
10,31.0,31.0,31.0,31.0


## Mode Imputer

In [19]:
# Create an imputer that uses the 'most_frequent' strategy (the mode)
mode_imputer = SimpleImputer(strategy='most_frequent')
# Bare expression so the notebook displays the estimator
mode_imputer

In [20]:
# Fit mode_imputer on df2 (the single-column copy of 'idade')
mode_imputer.fit(df2)

In [21]:
# Transform df2; the result is returned as an array with the gap filled in
mode_imputer.transform(df2)

array([[14.],
       [25.],
       [23.],
       [31.],
       [31.],
       [18.],
       [20.],
       [15.],
       [17.],
       [60.],
       [31.]])

In [22]:
# Add the mode-imputed values to df as a new 'mode' column
df['mode'] = mode_imputer.transform(df2)
df

Unnamed: 0,idade,constant,mean,median,mode
0,14.0,14.0,14.0,14.0,14.0
1,25.0,25.0,25.0,25.0,25.0
2,23.0,23.0,23.0,23.0,23.0
3,31.0,31.0,31.0,31.0,31.0
4,,-1.0,25.4,21.5,31.0
5,18.0,18.0,18.0,18.0,18.0
6,20.0,20.0,20.0,20.0,20.0
7,15.0,15.0,15.0,15.0,15.0
8,17.0,17.0,17.0,17.0,17.0
9,60.0,60.0,60.0,60.0,60.0
10,31.0,31.0,31.0,31.0,31.0
