# Working with Missing Data in Pandas

In [1]:
import numpy as np
import pandas as pd

from pandas import DataFrame

## Filling missing values using fillna(), replace() and interpolate()

In [2]:
# Build a small ranking table, then blank out some cells so the rest of the
# notebook has missing data to work with.
data = {
    'names': ['steve', 'john', 'richard', 'sarah', 'randy', 'micheal', 'julie'],
    'age': [20, 22, 20, 21, 24, 23, 22],
    'gender': ['male', 'male', 'male', 'female', 'male', 'male', 'female'],
    'rank': [2, 1, 4, 5, 3, 7, 6],
}

# NaN cannot be stored in an int64 column. Cast the integer columns to float
# explicitly instead of relying on pandas to upcast them implicitly while the
# NaN values are being assigned below.
ranking_df = DataFrame(data).astype({'age': 'float64', 'rank': 'float64'})

ranking_df.iloc[2:5, 1] = np.nan   # 'age' missing for rows 2-4
ranking_df.iloc[3:6, 3] = np.nan   # 'rank' missing for rows 3-5
ranking_df.iloc[3, :] = np.nan     # row 3 missing in every column
ranking_df

Unnamed: 0,names,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,richard,,male,4.0
3,,,,
4,randy,,male,
5,micheal,23.0,male,
6,julie,22.0,female,6.0


In [4]:
# First step: check whether the dataset has any missing values.
# isna()/isnull() flag missing cells with True; notna()/notnull() do the opposite.
ranking_df.isna()


Unnamed: 0,names,age,gender,rank
0,False,False,False,False
1,False,False,False,False
2,False,True,False,False
3,True,True,True,True
4,False,True,False,True
5,False,False,False,True
6,False,False,False,False


In [5]:
# Inverse view: True marks cells that hold a value, False marks missing ones.
ranking_df.notna()

Unnamed: 0,names,age,gender,rank
0,True,True,True,True
1,True,True,True,True
2,True,False,True,True
3,False,False,False,False
4,True,False,True,False
5,True,True,True,False
6,True,True,True,True


In [6]:
# Boolean masking: keep only the rows whose 'age' is missing.
bool_series = ranking_df['age'].isnull()
ranking_df.loc[bool_series]

Unnamed: 0,names,age,gender,rank
2,richard,,male,4.0
3,,,,
4,randy,,male,


In [7]:
# Replace every missing cell, in every column, with the single constant 0.
ranking_df.fillna(value=0)

Unnamed: 0,names,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,richard,0.0,male,4.0
3,0,0.0,0,0.0
4,randy,0.0,male,0.0
5,micheal,23.0,male,0.0
6,julie,22.0,female,6.0


In [10]:
# Forward fill: each missing cell takes the last valid value above it in the
# same column. ffill() is used because the `method=` argument of fillna()
# is deprecated.
ranking_df.ffill()

  ranking_df.fillna(method='pad')


Unnamed: 0,names,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,richard,22.0,male,4.0
3,richard,22.0,male,4.0
4,randy,22.0,male,4.0
5,micheal,23.0,male,4.0
6,julie,22.0,female,6.0


In [13]:
# Backward fill: each missing cell takes the next valid value below it in the
# same column. bfill() is used because the `method=` argument of fillna()
# is deprecated.
ranking_df.bfill()

  ranking_df.fillna(method='bfill')


Unnamed: 0,names,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,richard,23.0,male,4.0
3,randy,23.0,male,6.0
4,randy,23.0,male,6.0
5,micheal,23.0,male,6.0
6,julie,22.0,female,6.0


In [14]:
# Linear interpolation fills each missing number with a value on the straight
# line between the nearest non-missing values above and below it.
# Only numeric columns can be interpolated, so the text columns are left
# untouched; calling interpolate() on a frame that still contains object
# columns is deprecated.
interpolated_df = ranking_df.copy()
numeric_cols = interpolated_df.select_dtypes(include='number').columns
interpolated_df[numeric_cols] = interpolated_df[numeric_cols].interpolate(method='linear')
interpolated_df

  ranking_df.interpolate(method='linear')


Unnamed: 0,names,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,richard,22.25,male,4.0
3,,22.5,,4.5
4,randy,22.75,male,5.0
5,micheal,23.0,male,5.5
6,julie,22.0,female,6.0


In [16]:
# Drop every row that has at least one missing value.
ranking_df.dropna(how='any')

Unnamed: 0,names,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
6,julie,22.0,female,6.0


In [18]:
# Drop a row only when every value in it is missing (row 3 here).
ranking_df.dropna(axis=0, how='all')

Unnamed: 0,names,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
2,richard,,male,4.0
4,randy,,male,
5,micheal,23.0,male,
6,julie,22.0,female,6.0


In [20]:
# Drop every column that has at least one missing value.
# Row 3 is missing in all columns, so no column survives and only the index is left.
ranking_df.dropna(axis='columns')

0
1
2
3
4
5
6


In [21]:
# Same row-wise drop as the default dropna(), with the axis named explicitly.
ranking_df.dropna(axis='index')

Unnamed: 0,names,age,gender,rank
0,steve,20.0,male,2.0
1,john,22.0,male,1.0
6,julie,22.0,female,6.0
