# Handling missing data

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

- Sentinel value

In [2]:
float_data = pd.Series([1.2,-3.5,np.nan, 0])
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [3]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
string_data = pd.Series(['a', np.nan, None, 'b'])
string_data

0       a
1     NaN
2    None
3       b
dtype: object

In [5]:
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [6]:
float_data = pd.Series([1,2,None], dtype='float64')
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [7]:
float_data.isna()

0    False
1    False
2     True
dtype: bool

- Filtering out missing data

In [8]:
data = pd.Series([1,np.nan,3.5, np.nan, 7])

In [9]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

> This is simalar as doing `data[data.notna()]`

In [10]:
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data = pd.DataFrame([[1.,6.5,3.],[1.,np.nan, np.nan,], [np.nan, np.nan,np.nan], [np.nan, 6.5,3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [13]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [15]:
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


If we want to keep only rows containing at most a certain number of missing observations, we can indicate this with the `thresh` argument.

In [18]:
df = pd.DataFrame(np.random.standard_normal((7,3)))
df

Unnamed: 0,0,1,2
0,0.4529,-0.321811,-0.244244
1,2.834513,-0.195621,-2.185212
2,0.710244,1.8122,-0.354772
3,-0.481513,-0.161528,0.039036
4,1.420123,0.647147,0.308608
5,0.732598,0.851325,-0.600215
6,0.46454,0.054202,-0.973393


In [21]:
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.4529,,
1,2.834513,,
2,0.710244,,-0.354772
3,-0.481513,,0.039036
4,1.420123,0.647147,0.308608
5,0.732598,0.851325,-0.600215
6,0.46454,0.054202,-0.973393


In [25]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.710244,,-0.354772
3,-0.481513,,0.039036
4,1.420123,0.647147,0.308608
5,0.732598,0.851325,-0.600215
6,0.46454,0.054202,-0.973393


## Filling in Missing Data

In [26]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.4529,0.0,0.0
1,2.834513,0.0,0.0
2,0.710244,0.0,-0.354772
3,-0.481513,0.0,0.039036
4,1.420123,0.647147,0.308608
5,0.732598,0.851325,-0.600215
6,0.46454,0.054202,-0.973393


Calling `fillna` with a dictionary, we can use a different fill value for each column

In [27]:
df.fillna({1:0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.4529,0.5,0.0
1,2.834513,0.5,0.0
2,0.710244,0.5,-0.354772
3,-0.481513,0.5,0.039036
4,1.420123,0.647147,0.308608
5,0.732598,0.851325,-0.600215
6,0.46454,0.054202,-0.973393


The same interpolation methods available for reindexing can be used with `fillna`

In [30]:
df = pd.DataFrame(np.random.standard_normal((6,3 )))
df.iloc[2:,1] =np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,1.915151,0.603864,-1.001472
1,0.245587,1.088531,0.350177
2,-0.841001,,-0.670099
3,0.772837,,1.069092
4,0.701028,,
5,0.807389,,


In [31]:
df.fillna(method='ffill')

  df.fillna(method='ffill')


Unnamed: 0,0,1,2
0,1.915151,0.603864,-1.001472
1,0.245587,1.088531,0.350177
2,-0.841001,1.088531,-0.670099
3,0.772837,1.088531,1.069092
4,0.701028,1.088531,1.069092
5,0.807389,1.088531,1.069092


In [32]:
df.fillna(method='ffill', limit=2)

  df.fillna(method='ffill', limit=2)


Unnamed: 0,0,1,2
0,1.915151,0.603864,-1.001472
1,0.245587,1.088531,0.350177
2,-0.841001,1.088531,-0.670099
3,0.772837,1.088531,1.069092
4,0.701028,,1.069092
5,0.807389,,1.069092


With `fillna` we can do lots of other things such as simple data imputation using the median or mean statistics.

In [33]:
data = pd.Series([1.,np.nan, 3.5, np.nan, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64