In [1]:
import pandas as pd
import numpy as np

### Handling missing values

In [2]:
# Toy dataset: two numeric features with NaNs in the same three rows
# (index 1, 2 and 7) plus a categorical label column.
df = pd.DataFrame(
    {
        'some_column': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3],
        'another_column': [10, np.nan, np.nan, 20, 30, 10, 30, np.nan, 30],
        'target': list('AABBBBCCC'),
    }
)

In [3]:
# Show the whole toy frame (9 rows) so the missing entries are visible.
df

Unnamed: 0,some_column,another_column,target
0,1.0,10.0,A
1,,,A
2,,,B
3,2.0,20.0,B
4,3.0,30.0,B
5,1.0,10.0,B
6,3.0,30.0,C
7,,,C
8,3.0,30.0,C


### 1. Drop all missing values

In [5]:
# Drop every row (axis=0) that holds at least one missing value (how='any').
# Returns a new frame; df itself is left untouched.
df.dropna(axis=0, how='any')

Unnamed: 0,some_column,another_column,target
0,1.0,10.0,A
3,2.0,20.0,B
4,3.0,30.0,B
5,1.0,10.0,B
6,3.0,30.0,C
8,3.0,30.0,C


Dropping every row that contains a missing value would cost us one third of the data (3 of 9 rows).

### 2. Fill NAs with certain values (static)

In [6]:
# Replace every NaN with the constant 99; the result is a new frame,
# df keeps its missing values.
df.fillna(99)

Unnamed: 0,some_column,another_column,target
0,1.0,10.0,A
1,99.0,99.0,A
2,99.0,99.0,B
3,2.0,20.0,B
4,3.0,30.0,B
5,1.0,10.0,B
6,3.0,30.0,C
7,99.0,99.0,C
8,3.0,30.0,C


### 3. We can use some average to fill the NAs

In [9]:
# For an individual column: replace its NaNs with that column's mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['some_column'] selection. The in-place call
# on a selected column is chained assignment: recent pandas warns about it,
# and with copy-on-write enabled it no longer updates df at all.
df['some_column'] = df['some_column'].fillna(value=df['some_column'].mean())

In [10]:
# Inspect df after the previous step: 'some_column' has been filled,
# 'another_column' still contains its NaNs.
df

Unnamed: 0,some_column,another_column,target
0,1.0,10.0,A
1,2.166667,,A
2,2.166667,,B
3,2.0,20.0,B
4,3.0,30.0,B
5,1.0,10.0,B
6,3.0,30.0,C
7,2.166667,,C
8,3.0,30.0,C


The same approach works for the entire DataFrame at once:

In [11]:
# Rebuild the original frame: the previous step overwrote 'some_column',
# so start again from data that still has all of its NaNs.
df = pd.DataFrame(
    {
        'some_column': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3],
        'another_column': [10, np.nan, np.nan, 20, 30, 10, 30, np.nan, 30],
        'target': list('AABBBBCCC'),
    }
)

In [12]:
# Fill the gaps of each numeric column with that column's own mean.
# numeric_only=True keeps the string column 'target' out of the mean:
# pandas >= 2.0 no longer drops non-numeric columns silently and raises a
# TypeError instead. The displayed result is unchanged.
df.fillna(value=df.mean(numeric_only=True))

Unnamed: 0,some_column,another_column,target
0,1.0,10.0,A
1,2.166667,21.666667,A
2,2.166667,21.666667,B
3,2.0,20.0,B
4,3.0,30.0,B
5,1.0,10.0,B
6,3.0,30.0,C
7,2.166667,21.666667,C
8,3.0,30.0,C


### 4. Use a group-wise average based on the target variable

In [13]:
# First attempt at group-wise imputation (kept as a counter-example).
# The fill value is still the mean of the WHOLE frame, so every target group
# receives the same global column mean rather than its own group mean.
# numeric_only=True keeps the string column 'target' out of the mean, which
# would otherwise raise a TypeError on pandas >= 2.0.
# NOTE(review): GroupBy.fillna looks to be deprecated in recent pandas —
# confirm against the pandas version this notebook is pinned to.
df.groupby('target').fillna(value=df.mean(numeric_only=True))

Unnamed: 0_level_0,Unnamed: 1_level_0,some_column,another_column
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,1.0,10.0
A,1,2.166667,21.666667
B,2,2.166667,21.666667
B,3,2.0,20.0
B,4,3.0,30.0
B,5,1.0,10.0
C,6,3.0,30.0
C,7,2.166667,21.666667
C,8,3.0,30.0


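This does not work yet: every gap is filled with the global column mean (2.17 and 21.67) instead of the mean of its own target group.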

In [15]:
# Group-wise imputation: within each target group, fill the NaNs with the
# mean computed from that group only.
# Each group frame passed to the function still contains the string column
# 'target' (it shows up in the result), so the mean is restricted to numeric
# columns; without numeric_only=True pandas >= 2.0 raises a TypeError here.
df.groupby('target').apply(lambda group: group.fillna(group.mean(numeric_only=True)))

Unnamed: 0_level_0,Unnamed: 1_level_0,some_column,another_column,target
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0,1.0,10.0,A
A,1,1.0,10.0,A
B,2,2.0,20.0,B
B,3,2.0,20.0,B
B,4,3.0,30.0,B
B,5,1.0,10.0,B
C,6,3.0,30.0,C
C,7,3.0,30.0,C
C,8,3.0,30.0,C


In [16]:
# Same group-wise mean imputation via transform: the result keeps the
# original row index and does not contain the grouping column 'target'.
df.groupby(by='target').transform(lambda values: values.fillna(values.mean()))

Unnamed: 0,some_column,another_column
0,1.0,10.0
1,1.0,10.0
2,2.0,20.0
3,2.0,20.0
4,3.0,30.0
5,1.0,10.0
6,3.0,30.0
7,3.0,30.0
8,3.0,30.0


In [17]:
# Write the group-wise imputed values back into the two numeric columns.
# transform returns one row per input row in the original order, so the
# result lines up with df and can be assigned directly.
df[['some_column', 'another_column']] = (
    df.groupby(by='target')
      .transform(lambda values: values.fillna(values.mean()))
)

In [18]:
# Check that the group-wise means were written back: no NaNs remain in df.
df

Unnamed: 0,some_column,another_column,target
0,1.0,10.0,A
1,1.0,10.0,A
2,2.0,20.0,B
3,2.0,20.0,B
4,3.0,30.0,B
5,1.0,10.0,B
6,3.0,30.0,C
7,3.0,30.0,C
8,3.0,30.0,C


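`transform` returns an object with the same index (and length) as the input — here a DataFrame without the grouping column — so the result can be assigned straight back to `df`. The calculations are the same as with `apply`.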

### 5. Replace missing values with the last available value (e.g. for time series)

In [20]:
# Start again from the raw frame: df was overwritten with the group-wise
# imputed values above, so the NaNs have to be restored for this section.
df = pd.DataFrame(
    {
        'some_column': [1, np.nan, np.nan, 2, 3, 1, 3, np.nan, 3],
        'another_column': [10, np.nan, np.nan, 20, 30, 10, 30, np.nan, 30],
        'target': list('AABBBBCCC'),
    }
)

In [22]:
# Forward fill: carry the last valid observation down into each following gap.
# df.ffill() replaces fillna(method='ffill'); the `method` argument of fillna
# is deprecated in recent pandas. The result is identical.
# Note that the fill runs straight down the column and ignores 'target', so
# row 2 (group B) inherits the values from group A above it.
df.ffill()

Unnamed: 0,some_column,another_column,target
0,1.0,10.0,A
1,1.0,10.0,A
2,1.0,10.0,B
3,2.0,20.0,B
4,3.0,30.0,B
5,1.0,10.0,B
6,3.0,30.0,C
7,3.0,30.0,C
8,3.0,30.0,C
