## Handling Missing Data

In [3]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [4]:
# Small Series of strings whose third entry is a missing value (NaN).
string_data = pd.Series(
    ["aardvark", "artichoke", np.nan, "avocado"],
)

In [5]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [6]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
string_data[0]

'aardvark'

In [8]:
# Assigning None also marks the entry as missing: the isnull() call in the
# next cell reports True for index 0.
string_data[0] = None

In [9]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

In [10]:
from numpy import nan as NA

In [11]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [12]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
# Four rows: one complete, two partially missing, and one entirely missing.
data = pd.DataFrame(
    [
        [1, 6.5, 3],
        [1, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3],
    ]
)

In [15]:
# Default dropna behaviour, spelled out: drop every row (axis=0) that
# contains any missing value. `data` itself is left unchanged.
cleaned = data.dropna(axis=0, how="any")

In [16]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [18]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [19]:
data[4] = NA

In [20]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [21]:
# Drop only the columns in which every value is missing (here, column 4).
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [22]:
# 7x3 frame of standard-normal draws. No seed is set, so the exact values
# shown in the outputs below will differ on a fresh run.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))

In [23]:
df.iloc[:4, 1] = NA

In [24]:
df.iloc[:2, 2] = NA

In [25]:
df

Unnamed: 0,0,1,2
0,0.215566,,
1,-1.366444,,
2,1.003365,,0.636395
3,-0.736199,,-0.720275
4,-1.137064,1.357403,-2.956059
5,0.483299,1.04796,1.673634
6,-1.653439,-0.698233,0.897256


In [26]:
df.dropna()

Unnamed: 0,0,1,2
4,-1.137064,1.357403,-2.956059
5,0.483299,1.04796,1.673634
6,-1.653439,-0.698233,0.897256


In [27]:
# Keep only the rows that have at least 2 non-missing values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,1.003365,,0.636395
3,-0.736199,,-0.720275
4,-1.137064,1.357403,-2.956059
5,0.483299,1.04796,1.673634
6,-1.653439,-0.698233,0.897256


### Filling In Missing Data

In [29]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.215566,0.0,0.0
1,-1.366444,0.0,0.0
2,1.003365,0.0,0.636395
3,-0.736199,0.0,-0.720275
4,-1.137064,1.357403,-2.956059
5,0.483299,1.04796,1.673634
6,-1.653439,-0.698233,0.897256


Fill a different value for each column by passing a dict to `fillna`, keyed by column label.

In [30]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.215566,0.5,0.0
1,-1.366444,0.5,0.0
2,1.003365,0.5,0.636395
3,-0.736199,0.5,-0.720275
4,-1.137064,1.357403,-2.956059
5,0.483299,1.04796,1.673634
6,-1.653439,-0.698233,0.897256


In [31]:
# inplace=True modifies df itself (the next cell shows the filled values);
# the call's return value is assigned to `_` so the cell displays no output.
_ = df.fillna(0, inplace=True)

In [32]:
df

Unnamed: 0,0,1,2
0,0.215566,0.0,0.0
1,-1.366444,0.0,0.0
2,1.003365,0.0,0.636395
3,-0.736199,0.0,-0.720275
4,-1.137064,1.357403,-2.956059
5,0.483299,1.04796,1.673634
6,-1.653439,-0.698233,0.897256


In [33]:
# Fresh 6x3 frame of standard-normal draws for the forward-fill examples.
# No seed is set, so the exact values shown below will differ on a fresh run.
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))

In [34]:
df.iloc[2:, 1] = NA

In [35]:
df.iloc[4:, 2] = NA

In [36]:
df

Unnamed: 0,0,1,2
0,0.305949,0.978144,-0.736454
1,0.705077,-1.740715,0.47878
2,-1.626643,,-0.713281
3,2.449266,,0.787246
4,-0.714598,,
5,0.339834,,


In [37]:
# Forward-fill: propagate the last valid observation down each column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword
# is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,0.305949,0.978144,-0.736454
1,0.705077,-1.740715,0.47878
2,-1.626643,-1.740715,-0.713281
3,2.449266,-1.740715,0.787246
4,-0.714598,-1.740715,0.787246
5,0.339834,-1.740715,0.787246


In [38]:
# Forward-fill, but fill at most 2 consecutive missing values per gap; any
# further missing values in the same run stay NaN.
# DataFrame.ffill(limit=...) replaces the deprecated fillna(method='ffill').
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.305949,0.978144,-0.736454
1,0.705077,-1.740715,0.47878
2,-1.626643,-1.740715,-0.713281
3,2.449266,-1.740715,0.787246
4,-0.714598,,0.787246
5,0.339834,,0.787246


In [39]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [40]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64