# Cleaning Data - Casting Datatypes and Handling Missing Values 

In [5]:
import pandas as pd
import numpy as np

In [8]:
# Sample records that mix several "missing" markers on purpose:
#   * np.nan and None -> recognised by pandas as missing values
#   * 'NA' / 'Missing' -> ordinary strings, so dropna() keeps them
# Ages are stored as strings, not numbers.
people = {
    'first': [
        'Corey',
        'Jane',
        'John',
        'Chris',
        np.nan,
        None,
        'NA',
    ],
    'last': [
        'Schafer',
        'Doe',
        'Doe',
        'Schafer',
        np.nan,
        np.nan,
        'Missing',
    ],
    'email': [
        'coreyMSchafer@gmail.com',
        'JaneDoe@gmail.com',
        'JohnDoe@gmail.com',
        None,
        np.nan,
        'Anonymous@gmail.com',
        'NA',
    ],
    'age': [
        '33',
        '55',
        '63',
        '36',
        None,
        None,
        'Missing',
    ],
}

In [9]:
# Build the DataFrame: each key of `people` becomes a column
# (first, last, email, age) and each list position becomes a row.
df = pd.DataFrame(people)

In [10]:
# Show the whole frame so the missing entries in rows 3-6 are visible
# before any rows or columns are dropped.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@gmail.com,
6,,Missing,,Missing


In [11]:
# With no arguments, dropna() removes every row that has at least one
# missing value. Row 6 is kept because 'NA' and 'Missing' are plain strings,
# not real missing values. The result is only displayed; df is not reassigned.
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
6,,Missing,,Missing


In [None]:
# Same result as the plain df.dropna() call, with the arguments spelled out:
#   axis='index' -> rows are the thing being dropped
#   how='any'    -> a row is dropped if any of its values is missing
#                   ('any' is the default criterion)
df.dropna(axis='index', how='any')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
6,,Missing,,Missing


In [None]:
# how='all' drops a row only when every value in that row is missing,
# so just row 4 disappears here.
# Passing axis='columns' instead of axis='index' applies the same rule to
# columns rather than rows.
df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
5,,,Anonymous@gmail.com,
6,,Missing,,Missing


In [None]:
# No column consists entirely of missing values, so with how='all'
# nothing is removed and the frame comes back unchanged.
df.dropna(axis='columns', how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@gmail.com,
6,,Missing,,Missing


In [16]:
# Every column contains at least one missing value, so how='any' drops
# all four columns and only the row index (0-6) is left.
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6


In [19]:
# Drop only the people who have no email address: `subset` limits the
# missing-value check to the listed columns, so gaps in other columns are
# ignored. Row 6 stays because its email, 'NA', is a string.
df.dropna(axis='index', how='any', subset=['email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
5,,,Anonymous@gmail.com,
6,,Missing,,Missing


In [21]:
# With how='all' and a two-column subset, a row is dropped only when BOTH
# 'last' and 'email' are missing. Only row 4 meets that condition; rows 3
# and 5 are kept because each still has one of the two values.
df.dropna(axis='index', how='all', subset=['last', 'email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,coreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@gmail.com,55
2,John,Doe,JohnDoe@gmail.com,63
3,Chris,Schafer,,36
5,,,Anonymous@gmail.com,
6,,Missing,,Missing
