# Data Preparation Basics

## 2. Treating missing values

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

### Figuring out what data is missing

In [3]:
# Sentinel used throughout the notebook to mark a missing entry.
missing = np.nan

# Nine entries; positions 2, 5 and 7 are deliberately left missing.
series_obj = Series(
    [
        'row 1', 'row 2', missing,
        'row 4', 'row 5', missing,
        'row 7', missing, 'row 9',
    ]
)
series_obj

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5      NaN
6    row 7
7      NaN
8    row 9
dtype: object

In [4]:
# Boolean mask: True wherever the series holds a missing value (NaN).
pd.isnull(series_obj)

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7     True
8    False
dtype: bool

### Filling in for missing values

In [5]:
# Seed NumPy's global generator so the random table below is reproducible.
np.random.seed(24)

# 6x6 frame of uniform random draws, requested directly in its final shape.
DF_obj = DataFrame(np.random.rand(6, 6))
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.960017,0.699512,0.999867,0.220067,0.361056,0.739841
1,0.996456,0.316347,0.136545,0.38398,0.320519,0.366415
2,0.709652,0.900142,0.534115,0.247294,0.671807,0.561729
3,0.54256,0.893448,0.84278,0.306013,0.63117,0.680239
4,0.970428,0.893567,0.942426,0.642225,0.614648,0.227683
5,0.486032,0.807219,0.84422,0.534681,0.757798,0.499677


In [8]:
# Inject missing values by row label:
#   column 0 -> rows 3, 4 and 5
#   column 5 -> rows 1, 2, 3 and 4
DF_obj.loc[[3, 4, 5], 0] = missing
DF_obj.loc[[1, 2, 3, 4], 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.960017,0.699512,0.999867,0.220067,0.361056,0.739841
1,0.996456,0.316347,0.136545,0.38398,0.320519,
2,0.709652,0.900142,0.534115,0.247294,0.671807,
3,,0.893448,0.84278,0.306013,0.63117,
4,,0.893567,0.942426,0.642225,0.614648,
5,,0.807219,0.84422,0.534681,0.757798,0.499677


In [9]:
# Replace every NaN in the frame with the constant 1.
# fillna returns a new frame, so DF_obj keeps its missing values.
filled_DF = DF_obj.fillna(value=1)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.960017,0.699512,0.999867,0.220067,0.361056,0.739841
1,0.996456,0.316347,0.136545,0.38398,0.320519,1.0
2,0.709652,0.900142,0.534115,0.247294,0.671807,1.0
3,1.0,0.893448,0.84278,0.306013,0.63117,1.0
4,1.0,0.893567,0.942426,0.642225,0.614648,1.0
5,1.0,0.807219,0.84422,0.534681,0.757798,0.499677


In [12]:
# Per-column replacement: the mapping is {column label: fill value}.
# Columns not listed in the mapping are left as they are.
filled_DF = DF_obj.fillna(
    value={
        0: 0.1,
        5: 1.25,
    }
)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.960017,0.699512,0.999867,0.220067,0.361056,0.739841
1,0.996456,0.316347,0.136545,0.38398,0.320519,1.25
2,0.709652,0.900142,0.534115,0.247294,0.671807,1.25
3,0.1,0.893448,0.84278,0.306013,0.63117,1.25
4,0.1,0.893567,0.942426,0.642225,0.614648,1.25
5,0.1,0.807219,0.84422,0.534681,0.757798,0.499677


In [14]:
# Forward-fill: each NaN takes the last non-null value above it in the same
# column. A NaN with no valid value above it would remain NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated since pandas 2.1.
fill_DF = DF_obj.ffill()
fill_DF

Unnamed: 0,0,1,2,3,4,5
0,0.960017,0.699512,0.999867,0.220067,0.361056,0.739841
1,0.996456,0.316347,0.136545,0.38398,0.320519,0.739841
2,0.709652,0.900142,0.534115,0.247294,0.671807,0.739841
3,0.709652,0.893448,0.84278,0.306013,0.63117,0.739841
4,0.709652,0.893567,0.942426,0.642225,0.614648,0.739841
5,0.709652,0.807219,0.84422,0.534681,0.757798,0.499677


### Counting missing values

In [15]:
# Rebuild the same seeded 6x6 frame so this section starts from a known state.
np.random.seed(24)
DF_obj = DataFrame(np.random.rand(6, 6))

# Re-inject the missing values by row label:
#   column 0 -> rows 3, 4 and 5
#   column 5 -> rows 1, 2, 3 and 4
DF_obj.loc[[3, 4, 5], 0] = missing
DF_obj.loc[[1, 2, 3, 4], 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.960017,0.699512,0.999867,0.220067,0.361056,0.739841
1,0.996456,0.316347,0.136545,0.38398,0.320519,
2,0.709652,0.900142,0.534115,0.247294,0.671807,
3,,0.893448,0.84278,0.306013,0.63117,
4,,0.893567,0.942426,0.642225,0.614648,
5,,0.807219,0.84422,0.534681,0.757798,0.499677


In [17]:
# Count NaNs per column: the boolean mask is summed down the rows (True == 1).
DF_obj.isnull().sum(axis=0)

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

### Filtering out missing values

In [19]:
# Drop every row that contains at least one NaN (dropna's defaults, spelled out).
DF_no_NaN = DF_obj.dropna(axis=0, how='any')
DF_no_NaN

Unnamed: 0,0,1,2,3,4,5
0,0.960017,0.699512,0.999867,0.220067,0.361056,0.739841


In [20]:
# Drop every column that contains at least one NaN.
DF_no_NaN = DF_obj.dropna(axis='columns', how='any')
DF_no_NaN

Unnamed: 0,1,2,3,4
0,0.699512,0.999867,0.220067,0.361056
1,0.316347,0.136545,0.38398,0.320519
2,0.900142,0.534115,0.247294,0.671807
3,0.893448,0.84278,0.306013,0.63117
4,0.893567,0.942426,0.642225,0.614648
5,0.807219,0.84422,0.534681,0.757798
