# Pandas data cleaning

In [32]:
import pandas as pd
import numpy as np

In [33]:
# Toy dataset for the cleaning examples: six people with deliberately
# missing values, written both as None and as np.nan.
# Rows 1 and 5 have no name, surname or email; "Job" is missing for everyone.
people = dict(
    Name=['Juan', np.nan, 'Antonio', 'Maria', 'Lucia', np.nan],
    Surname=['Rodrigez', np.nan, 'Perez', 'Rivera', 'Grande', np.nan],
    Age=[23, 32, 36, None, 32, np.nan],
    Email=[
        'JuanR@mail.com',
        None,
        'A.Delfin@mail.com',
        'MariaRivera@mail.com',
        'Lucia.G@mail.com',
        np.nan,
    ],
    Job=[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
)

# One column per key, one row per list position.
df = pd.DataFrame(data=people)

In [34]:
# Show the whole frame; it only has six rows, so no .head() is needed.
df

Unnamed: 0,Name,Surname,Age,Email,Job
0,Juan,Rodrigez,23.0,JuanR@mail.com,
1,,,32.0,,
2,Antonio,Perez,36.0,A.Delfin@mail.com,
3,Maria,Rivera,,MariaRivera@mail.com,
4,Lucia,Grande,32.0,Lucia.G@mail.com,
5,,,,,


In [35]:
# With no arguments, dropna() removes every row that has at least one NA.
# The Job column is entirely NA, so every row is dropped and the result is empty.
df.dropna()

Unnamed: 0,Name,Surname,Age,Email,Job


In [36]:
# The defaults of dropna() spelled out explicitly:
#   axis='index' -> drop rows (not columns)
#   how='any'    -> drop a row as soon as one of its values is NA
# The result is therefore the same empty frame as the bare dropna() call.
df.dropna(axis='index', how='any')

Unnamed: 0,Name,Surname,Age,Email,Job


In [37]:
# how='all' only drops rows in which every value is NA.
# Here that removes just row 5, the only fully empty row.
df.dropna(axis='index', how='all')

Unnamed: 0,Name,Surname,Age,Email,Job
0,Juan,Rodrigez,23.0,JuanR@mail.com,
1,,,32.0,,
2,Antonio,Perez,36.0,A.Delfin@mail.com,
3,Maria,Rivera,,MariaRivera@mail.com,
4,Lucia,Grande,32.0,Lucia.G@mail.com,


In [39]:
# Same idea along the other axis: drop columns in which every value is NA.
# Only Job is removed; all six rows are kept.
df.dropna(axis='columns', how='all')

Unnamed: 0,Name,Surname,Age,Email
0,Juan,Rodrigez,23.0,JuanR@mail.com
1,,,32.0,
2,Antonio,Perez,36.0,A.Delfin@mail.com
3,Maria,Rivera,,MariaRivera@mail.com
4,Lucia,Grande,32.0,Lucia.G@mail.com
5,,,,


In [40]:
# Drop only the rows whose Email is missing; NAs in other columns are ignored.
# `subset` is given as a list of labels: older pandas releases require a
# list-like here, while a list works on every version.
# With a single column in the subset, how='all' and how='any' are equivalent.
df.dropna(axis='index', how='all', subset=['Email'])

Unnamed: 0,Name,Surname,Age,Email,Job
0,Juan,Rodrigez,23.0,JuanR@mail.com,
2,Antonio,Perez,36.0,A.Delfin@mail.com,
3,Maria,Rivera,,MariaRivera@mail.com,
4,Lucia,Grande,32.0,Lucia.G@mail.com,


In [41]:
# Boolean mask with the same shape as df: True where a value is considered NA
# (both None and np.nan count), False otherwise.
df.isna()

Unnamed: 0,Name,Surname,Age,Email,Job
0,False,False,False,False,True
1,True,True,False,True,True
2,False,False,False,False,True
3,False,False,True,False,True
4,False,False,False,False,True
5,True,True,True,True,True


In [42]:
# Replace every NA with the string "MISSING" in the displayed result.
# Note that this also puts a string into the numeric Age column (rows 3 and 5),
# so the filled Age column mixes numbers and text.
df.fillna("MISSING")

Unnamed: 0,Name,Surname,Age,Email,Job
0,Juan,Rodrigez,23.0,JuanR@mail.com,MISSING
1,MISSING,MISSING,32.0,MISSING,MISSING
2,Antonio,Perez,36.0,A.Delfin@mail.com,MISSING
3,Maria,Rivera,MISSING,MariaRivera@mail.com,MISSING
4,Lucia,Grande,32.0,Lucia.G@mail.com,MISSING
5,MISSING,MISSING,MISSING,MISSING,MISSING


In [43]:
# Column dtypes of the original frame.
# Age was entered as integers but holds missing values, and Job holds nothing
# but NaN; both columns come out as float64.
df.dtypes

Name        object
Surname     object
Age        float64
Email       object
Job        float64
dtype: object

In [44]:
# Casting Age to a plain int requires the column to contain no missing values.
# Casting to str does work, but each missing value becomes the literal
# string 'nan' instead of staying a real NA.
# (pandas' nullable 'Int64' dtype is the usual way to keep integers alongside
# missing values.)
df['Age'].astype(str)

0    23.0
1    32.0
2    36.0
3     nan
4    32.0
5     nan
Name: Age, dtype: object