# 🩹📄 Tratamento de Dados Faltantes

In [1]:
import pandas as pd

In [2]:
# Load the dataset from train.csv (the path is relative to the working directory).
dados = pd.read_csv('train.csv')

In [3]:
# Make three independent copies of the dataset, one for each technique
# demonstrated in this notebook, so the original `dados` is never modified.
dados1, dados2, dados3 = (dados.copy() for _ in range(3))

In [4]:
# Summary of the DataFrame: column names, non-null counts and dtypes.
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# Count the missing values in each column
dados.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# (number of rows, number of columns) of the original dataset
dados.shape

(891, 12)

In [7]:
# Percentage of missing values per column: the mean of a boolean mask is the
# fraction of True entries, so no explicit division by the row count is needed.
dados.isnull().mean() * 100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [8]:
# Example: rows whose 'Age' value is missing
dados.loc[dados['Age'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7225.0000,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7225.0000,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [9]:
# Example: rows whose 'Cabin' value is missing
dados.loc[dados['Cabin'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7925.0000,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21075.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29125.0000,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


- O método ``dropna(inplace=True)`` remove todas as linhas que possuem pelo menos um valor nulo, modificando o próprio DataFrame.
- __OBSERVAÇÃO:__ cuidado ao utilizar esse método, pois pode ocorrer de perder dados importantes!

In [10]:
# Use dropna(inplace=True) with care: it modifies the DataFrame itself.
# dados1 is the copy reserved for demonstrating this method.
# axis=0 / how='any' are the defaults, spelled out here: drop every ROW that
# has at least one missing value.
dados1.dropna(axis=0, how='any', inplace=True)

In [11]:
# Now notice how many rows were lost by using this method:
dados1.shape

(183, 12)

In [12]:
# Confirm that no missing values remain in dados1
dados1.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

- Agora conheceremos o método ``fillna()``.

In [13]:
# mean(): arithmetic mean of the 'Age' column (NaN entries are skipped by default)
media_idade = dados2['Age'].mean()

print(media_idade)

29.69911764705882


In [14]:
# Replace the missing ages with the mean computed above, rounded to the
# nearest whole year (29.7 -> 30), instead of hard-coding the literal 30.
dados2.fillna({'Age': round(media_idade)}, inplace=True)

In [15]:
# 'Age' should now show zero missing values; the other columns are unchanged
dados2.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

- Vamos aprender o método ``ffill()``.
- Esse método atribui o valor do último dado válido para o dado faltante.

In [16]:
# Forward fill: each missing 'Embarked' value receives the last valid value found above it.
dados3['Embarked'] = dados3['Embarked'].ffill()

In [17]:
# 'Embarked' should now show zero missing values
dados3.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

- Vamos agora conhecer o método ``drop()``.

In [18]:
# Remove the whole 'Cabin' column. columns=[...] is shorthand for
# labels=[...], axis=1 (axis=0 would target rows instead).
dados3.drop(columns=['Cabin'], inplace=True)

In [19]:
# 'Cabin' is gone; only 'Age' still has missing values
dados3.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         0
dtype: int64

In [20]:
# Fill the missing ages with the column mean rounded to the nearest whole year
# (30 for this dataset) rather than repeating a hard-coded literal.
dados3.fillna({'Age': round(dados3['Age'].mean())}, inplace=True)

In [21]:
# Final check: no column of dados3 should have missing values
dados3.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [22]:
# All rows were kept; only the 'Cabin' column was removed
dados3.shape

(891, 11)