In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

## Create Example Dataset

In [3]:
# Toy dataset of six records; Age and Salary each contain two missing entries.
country = ['Taiwan', 'Australia', 'Ireland', 'Australia', 'Ireland', 'Taiwan']
age = [25, 30, np.nan, np.nan, 22, 36]
salary = [20000, np.nan, 59000, np.nan, 43000, 52000]

dic = dict(Country=country, Age=age, Salary=salary)
df = pd.DataFrame(data=dic)

# Work on an independent copy so the original frame `df` stays untouched.
df1 = df.copy()
df1

Unnamed: 0,Country,Age,Salary
0,Taiwan,25.0,20000.0
1,Australia,30.0,
2,Ireland,,59000.0
3,Australia,,
4,Ireland,22.0,43000.0
5,Taiwan,36.0,52000.0


## Count Missing Values

In [11]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,Country,Age,Salary
0,False,False,False
1,False,False,True
2,False,True,False
3,False,True,True
4,False,False,False
5,False,False,False


In [12]:
# True counts as 1 when summed, so this gives the number of missing values per column.
df.isna().sum(axis=0)

Country    0
Age        2
Salary     2
dtype: int64

In [13]:
# Pitfall: the mask returned by isnull() holds only True/False (never NaN),
# so count() on it returns the total number of rows in each column --
# not the number of missing values. Use .sum() to count the missing ones.
df.isnull().count()

Country    6
Age        6
Salary     6
dtype: int64

In [14]:
# Fraction (0 to 1) of missing values in each column;
# multiply by 100 to read it as a percentage.
null_mask = df.isna()
missing_cnt = null_mask.sum()
total_cnt = null_mask.count()

missing_cnt.div(total_cnt)

Country    0.000000
Age        0.333333
Salary     0.333333
dtype: float64

## Deal with Missing Values

### by `sklearn.impute.SimpleImputer`
- Works on both NumPy arrays and pandas DataFrames
- Supports only these strategies: ['mean', 'median', 'most_frequent', 'constant']

#### DataFrame

In [4]:
# Mean-impute the numeric columns of the DataFrame copy `df1`.
# Columns are selected by name rather than by position, so the cell keeps
# working if the column order changes.
numeric_cols = ['Age', 'Salary']

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns each column's mean and fills the NaNs in one step;
# the returned array is written back into the same columns of df1.
df1[numeric_cols] = imputer.fit_transform(df1[numeric_cols])

In [5]:
# Show df1 after imputation: the previously missing Age / Salary cells
# now hold the corresponding column mean.
df1

Unnamed: 0,Country,Age,Salary
0,Taiwan,25.0,20000.0
1,Australia,30.0,43500.0
2,Ireland,28.25,59000.0
3,Australia,28.25,43500.0
4,Ireland,22.0,43000.0
5,Taiwan,36.0,52000.0


#### numpy array

In [6]:
# Convert the original (un-imputed) frame to a NumPy array.
# The string and float columns are mixed, so the array has dtype=object.
df_array = df.to_numpy()
df_array

array([['Taiwan', 25.0, 20000.0],
       ['Australia', 30.0, nan],
       ['Ireland', nan, 59000.0],
       ['Australia', nan, nan],
       ['Ireland', 22.0, 43000.0],
       ['Taiwan', 36.0, 52000.0]], dtype=object)

In [8]:
# Columns 1 and 2 of the array hold Age and Salary.
numeric_part = np.s_[:, 1:3]

# Learn the per-column means and overwrite the NaNs in place in df_array.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df_array[numeric_part] = imputer.fit_transform(df_array[numeric_part])

In [9]:
# Show the array after imputation: the NaN entries in columns 1-2
# have been replaced by the column means.
df_array

array([['Taiwan', 25.0, 20000.0],
       ['Australia', 30.0, 43500.0],
       ['Ireland', 28.25, 59000.0],
       ['Australia', 28.25, 43500.0],
       ['Ireland', 22.0, 43000.0],
       ['Taiwan', 36.0, 52000.0]], dtype=object)

### by `fillna`

#### Replace all NaN values with a number

In [16]:
# Fresh copy of the original frame (still containing NaNs) for the fillna examples.
df2 = df.copy(deep=True)
df2

Unnamed: 0,Country,Age,Salary
0,Taiwan,25.0,20000.0
1,Australia,30.0,
2,Ireland,,59000.0
3,Australia,,
4,Ireland,22.0,43000.0
5,Taiwan,36.0,52000.0


In [17]:
# Replace every NaN with 0.
# The result is only displayed, not assigned, so df2 itself is left unchanged.
df2.fillna(value=0)

Unnamed: 0,Country,Age,Salary
0,Taiwan,25.0,20000.0
1,Australia,30.0,0.0
2,Ireland,0.0,59000.0
3,Australia,0.0,0.0
4,Ireland,22.0,43000.0
5,Taiwan,36.0,52000.0


In [18]:
df3 = df.copy()

# Back-fill: replace each NaN with the next valid value below it in the same
# column, then set any NaN that is still left (e.g. at the bottom of a
# column, where nothing follows) to 0.
# `.bfill()` is used instead of `fillna(method='bfill')`, whose `method`
# argument is deprecated in recent pandas releases.
df3.bfill(axis=0).fillna(0)

Unnamed: 0,Country,Age,Salary
0,Taiwan,25.0,20000.0
1,Australia,30.0,59000.0
2,Ireland,22.0,59000.0
3,Australia,22.0,43000.0
4,Ireland,22.0,43000.0
5,Taiwan,36.0,52000.0
