# Missing Data

Let's look at a few convenient methods for dealing with missing data in pandas:

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small demo frame: column A has one missing value, B has two, C has none.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)

In [3]:
# Display the frame; the np.nan entries in columns A and B are the missing values.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [4]:
# Drop every row that contains at least one missing value (the defaults, spelled out).
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [5]:
# Same idea along the other axis: drop every column that contains a missing value.
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [6]:
# Replace every missing entry with a fixed placeholder; the filled columns
# then hold a mix of numbers and strings.
df.fillna('FILL VALUE')

Unnamed: 0,A,B,C
0,1,5,1
1,2,FILL VALUE,2
2,FILL VALUE,FILL VALUE,3


In [7]:
# Fill the gap in column A with the mean of that column's non-missing values.
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

## Handling Missing Data

In [8]:
# A Series of strings with one NaN; isnull() returns a boolean mask that is
# True exactly where a value is missing.
# (Only the last expression of a cell is displayed, so the cell ends on the mask.)
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [9]:
# Python's built-in None is treated as missing as well, so position 0 is
# now flagged alongside the original NaN.
string_data[0] = None
pd.isnull(string_data)

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

In [13]:
from numpy import nan as NA

# Numeric Series with two gaps; dropna() returns only the non-missing
# entries and keeps their original index labels.
data = pd.Series(
    [1, NA, 3.5, NA, 7]
)
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [20]:
# Equivalent to dropna(): select with a boolean mask of the non-missing entries.
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [25]:
# Four rows with different gaps: row 0 is complete, row 2 is entirely
# missing, rows 1 and 3 are partially missing.
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
# With its defaults, dropna() discards every row containing any NaN, so only
# the complete row survives. The cell ends on `cleaned` because only the last
# expression of a cell is displayed.
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [26]:
# how='all' drops only the rows in which every value is missing.
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [27]:
# Add a column labelled 4 whose values are all missing, then show the frame.
# This gives the column-wise dropna below something to remove.
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [28]:
# Drop only the columns that are missing in every row.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### Filling In Missing Data

In [38]:
# Build the frame this section works on: 7x3 random normals with gaps at the
# top of columns 1 (rows 0-3) and 2 (rows 0-1). Without this, `df` would still
# be the small A/B/C frame from the start of the notebook on a fresh
# Restart & Run All, which does not match the output shown for this section.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

# Replace every missing value with a constant; returns a new frame.
df.fillna(0)

Unnamed: 0,0,1,2
0,0.304925,0.0,0.0
1,-1.738484,0.0,0.0
2,0.190502,0.0,1.501574
3,-0.31419,0.0,-0.328648
4,-0.377795,1.510117,-1.111623
5,-0.377861,0.181097,0.416388
6,0.250496,-1.295044,-0.725308


In [39]:
# A dict gives a different fill value per column: keys are column labels,
# so gaps in column 1 become 0.5 and gaps in column 2 become 0.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.304925,0.5,0.0
1,-1.738484,0.5,0.0
2,0.190502,0.5,1.501574
3,-0.31419,0.5,-0.328648
4,-0.377795,1.510117,-1.111623
5,-0.377861,0.181097,0.416388
6,0.250496,-1.295044,-0.725308


In [40]:
# inplace=True modifies df itself instead of returning a new frame, so the
# frame is shown again on its own line to see the result.
df.fillna(value=0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.304925,0.0,0.0
1,-1.738484,0.0,0.0
2,0.190502,0.0,1.501574
3,-0.31419,0.0,-0.328648
4,-0.377795,1.510117,-1.111623
5,-0.377861,0.181097,0.416388
6,0.250496,-1.295044,-0.725308


In [41]:
# Fresh 6x3 frame of standard-normal draws, with missing values at the
# bottom of column 2 (rows 4-5) and column 1 (rows 2-5).
df = pd.DataFrame(np.random.standard_normal((6, 3)))
df.iloc[4:, 2] = NA
df.iloc[2:, 1] = NA
df

Unnamed: 0,0,1,2
0,0.956182,-0.757367,0.377274
1,-0.465264,0.436048,0.274437
2,1.280614,,-0.228193
3,1.011925,,1.654091
4,1.651795,,
5,0.416894,,


In [42]:
# Forward-fill: each gap takes the last valid value above it in the same column.
# ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated
# in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,0.956182,-0.757367,0.377274
1,-0.465264,0.436048,0.274437
2,1.280614,0.436048,-0.228193
3,1.011925,0.436048,1.654091
4,1.651795,0.436048,1.654091
5,0.416894,0.436048,1.654091


In [43]:
# Forward-fill, but carry a value across at most two consecutive gaps; any
# further gaps in the same run stay missing.
# ffill(limit=...) replaces the deprecated fillna(method='ffill', limit=...).
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.956182,-0.757367,0.377274
1,-0.465264,0.436048,0.274437
2,1.280614,0.436048,-0.228193
3,1.011925,0.436048,1.654091
4,1.651795,,1.654091
5,0.416894,,1.654091


## Data Transformation

### Removing Duplicates

In [45]:
# Demo frame in which the last row repeats the row before it.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [46]:
# Boolean mask: True for each row that repeats an earlier row in full.
data.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [47]:
# Return the frame without the repeated rows, keeping the first occurrence of each.
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [48]:
# Add a column of distinct values, then de-duplicate on k1 alone: only the
# first row for each k1 value is kept.
data['v1'] = range(7)
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [49]:
# De-duplicate on the (k1, k2) pair; the v1 column is ignored when comparing rows.
data.drop_duplicates(subset=['k1', 'k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
