# Handling missing data with pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Example Series: object dtype, mixing an int, a string and two missing (None) entries.
df = pd.Series(
    data=[1, None, 'x', None, 200],
)
df

0       1
1    None
2       x
3    None
4     200
dtype: object

In [3]:
# Return a boolean Series of the same length, where True marks a missing value.
df.isnull()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [4]:
# Count the missing values: summing the boolean mask counts each True as 1.
df.isnull().sum()

2

In [5]:
# Equivalent to df[df.notnull()]. dropna() returns a new Series (not a view) and leaves
# `df` unchanged; pass inplace=True to modify the original object instead.
df.dropna()

0      1
2      x
4    200
dtype: object

### Only drop rows where every cell is NaN

In [6]:
# Build a two-column DataFrame whose rows have varying amounts of missing data:
# column 0 is the original Series, column 1 is the same Series doubled
# (missing entries stay missing).
df2 = df * 2
df3 = pd.DataFrame([df, df2]).T
# The .ix indexer was removed in pandas 1.0; .loc performs the same label-based
# assignment (row label 1, column label 1).
df3.loc[1, 1] = 'filler'
df3

Unnamed: 0,0,1
0,1,2
1,,filler
2,x,xx
3,,
4,200,400


In [7]:
# how='all' drops a row only when every cell in it is NaN, so only the fully
# empty row disappears below. Columns can be dropped the same way by passing
# axis=1 to dropna.
df3.dropna(how='all')

Unnamed: 0,0,1
0,1,2
1,,filler
2,x,xx
4,200,400


### Dropping rows that contain fewer than a certain number of non-NaN cells

In [8]:
# 5x4 DataFrame of standard-normal draws. No seed is set, so the values change on every run.
df = pd.DataFrame(np.random.randn(5, 4))
# Blank out rows 0-3 of column 1 and rows 0-2 of column 2.
# The .ix indexer was removed in pandas 1.0; .loc slices by label and includes the
# end label, which is what .ix did on this integer index.
df.loc[:3, 1] = np.nan
# None is also accepted as a missing-value marker; in a float column it is stored as NaN.
df.loc[:2, 2] = None
df

Unnamed: 0,0,1,2,3
0,1.175061,,,1.567869
1,-1.55528,,,0.443558
2,-0.384153,,,0.261936
3,0.681745,,-2.08523,1.355308
4,-1.073848,-2.330917,0.301831,-0.478177


In [9]:
# Keep only the rows that have at least `thresh` (here 3) non-NaN values.
# dropna() returns a new DataFrame; the underlying `df` is unchanged.
df.dropna(thresh=3)

Unnamed: 0,0,1,2,3
3,0.681745,,-2.08523,1.355308
4,-1.073848,-2.330917,0.301831,-0.478177


In [10]:
# Same idea by column (axis=1): keep only the columns with at least 2 non-NaN values.
# Again a new DataFrame is returned and `df` is unchanged.
df.dropna(thresh=2, axis=1)

Unnamed: 0,0,2,3
0,1.175061,,1.567869
1,-1.55528,,0.443558
2,-0.384153,,0.261936
3,0.681745,-2.08523,1.355308
4,-1.073848,0.301831,-0.478177


### Filling missing data

In [11]:
# fillna() hands back a copy of the DataFrame in which every NaN (not a number)
# or NaT (not a time) value is replaced by the chosen value.
df.fillna(value=0)

Unnamed: 0,0,1,2,3
0,1.175061,0.0,0.0,1.567869
1,-1.55528,0.0,0.0,0.443558
2,-0.384153,0.0,0.0,0.261936
3,0.681745,0.0,-2.08523,1.355308
4,-1.073848,-2.330917,0.301831,-0.478177


In [12]:
# A dict maps column label -> fill value, so each column is treated individually.
# (Filling row by row with a dictionary is not currently supported.)
column_fills = {0: 1, 1: 'x', 2: 'y'}
df.fillna(value=column_fills)

Unnamed: 0,0,1,2,3
0,1.175061,x,y,1.567869
1,-1.55528,x,y,0.443558
2,-0.384153,x,y,0.261936
3,0.681745,x,-2.08523,1.355308
4,-1.073848,-2.33092,0.301831,-0.478177


In [13]:
# bfill (backfill) replaces each NaN with the next non-NaN value further down the
# same column; ffill() is the forward-filling counterpart.
# fillna(method='bfill') is deprecated in recent pandas, so call bfill() directly.
df.bfill()

Unnamed: 0,0,1,2,3
0,1.175061,-2.330917,-2.08523,1.567869
1,-1.55528,-2.330917,-2.08523,0.443558
2,-0.384153,-2.330917,-2.08523,0.261936
3,0.681745,-2.330917,-2.08523,1.355308
4,-1.073848,-2.330917,0.301831,-0.478177


In [14]:
# Fill across rows: with axis=1 each NaN takes the next non-NaN value to its right.
# bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill(axis=1)

Unnamed: 0,0,1,2,3
0,1.175061,1.567869,1.567869,1.567869
1,-1.55528,0.443558,0.443558,0.443558
2,-0.384153,0.261936,0.261936,0.261936
3,0.681745,-2.08523,-2.08523,1.355308
4,-1.073848,-2.330917,0.301831,-0.478177


In [15]:
# Restrict how many consecutive NaN values are filled: with limit=3, a run of four
# NaNs keeps its first entry as NaN.
# bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill(limit=3)

Unnamed: 0,0,1,2,3
0,1.175061,,-2.08523,1.567869
1,-1.55528,-2.330917,-2.08523,0.443558
2,-0.384153,-2.330917,-2.08523,0.261936
3,0.681745,-2.330917,-2.08523,1.355308
4,-1.073848,-2.330917,0.301831,-0.478177


In [16]:
# df.mean() gives one mean per column; passing that Series to fillna fills each
# column's NaNs with that column's own mean.
column_means = df.mean()
df.fillna(column_means)


Unnamed: 0,0,1,2,3
0,1.175061,-2.330917,-0.8917,1.567869
1,-1.55528,-2.330917,-0.8917,0.443558
2,-0.384153,-2.330917,-0.8917,0.261936
3,0.681745,-2.330917,-2.08523,1.355308
4,-1.073848,-2.330917,0.301831,-0.478177


In [17]:
# Per-column means, for comparison with the filled values above (NaN cells are skipped).
df.mean()

0   -0.231295
1   -2.330917
2   -0.891700
3    0.630099
dtype: float64

In [18]:
# stack() collapses the DataFrame into one long Series, so a single mean()
# covers every cell of the frame at once.
stacked = df.stack()
stacked.mean()

-0.16309970584446087

In [19]:
# Fill every NaN with one number: the mean over all cells of the DataFrame.
all_cells = df.stack()
fill = all_cells.mean()
df.fillna(value=fill)

Unnamed: 0,0,1,2,3
0,1.175061,-0.1631,-0.1631,1.567869
1,-1.55528,-0.1631,-0.1631,0.443558
2,-0.384153,-0.1631,-0.1631,0.261936
3,0.681745,-0.1631,-2.08523,1.355308
4,-1.073848,-2.330917,0.301831,-0.478177
