# Dealing with missing data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy dataset: four float columns of three values each.
# Columns C and D each contain a single NaN to practise on.
data = dict(
    A=[1.0, 5.0, 10.0],
    B=[2.0, 6.0, 11.0],
    C=[3.0, np.nan, 12.0],
    D=[4.0, 8.0, np.nan],
)

In [4]:
# Build the working frame from the dict: each key becomes a column label.
df = pd.DataFrame(data=data)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [7]:
# Count the missing (NaN) entries in each column.
df.isnull().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

In [9]:
# Drop every row that contains at least one NaN (returns a new frame; df is untouched).
df.dropna(axis="index", how="any")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [11]:
# Append a row labelled 3 whose every column is NaN (assigning through .loc
# to a label that does not exist yet enlarges the frame). This mutates df in
# place, so the cells below operate on the four-row frame.
df.loc[3] = np.nan
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
3,,,,


In [12]:
# Drop only the rows in which every column is NaN.
df.dropna(axis=0, how="all")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [21]:
# Keep only the rows holding at least one non-NaN value
# (thresh = minimum number of non-missing entries a row needs to survive).
df.dropna(axis=0, thresh=1)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [22]:
# Drop the rows whose value in column C is NaN; the other columns are not inspected.
df.dropna(axis=0, subset=["C"])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [23]:
# Drop the rows whose value in column A is NaN; the other columns are not inspected.
df.dropna(axis=0, subset=["A"])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


# Impute

In [28]:
from sklearn.impute import SimpleImputer

# Mean imputation: entries equal to NaN are treated as missing and are
# filled with the mean of the column they belong to.
imp = SimpleImputer(
    strategy="mean",
    missing_values=np.nan,
)
imp

In [35]:
# Fit the imputer on df and fill each NaN using the strategy configured on imp.
# fit_transform hands back an unlabeled array by default, so the labels are
# re-attached from df itself instead of a hard-coded column list; the result
# then stays aligned with df if its columns or row index ever change.
# NOTE(review): `re` shadows the name of the standard-library `re` module; it
# is kept only so that any later cell reading `re` keeps working.
re = pd.DataFrame(imp.fit_transform(df), columns=df.columns, index=df.index)
re.round(2)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0
3,5.33,6.33,7.5,6.0
