In [1]:
import numpy as np
import pandas as pd

## There are only 3 options for handling missing data
- Keep the missing data (NaN) if the forecasting method can handle it
- Drop the missing data (The entire row including the timestamp)
- Fill the missing data with some value (best estimated guess)

In [2]:
# Small demo frame: column A has one missing value, B has two, C has none.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [4, np.nan, np.nan],
        'C': [7, 8, 9],
    }
)

In [3]:
# Display the demo frame: A has 1 NaN, B has 2 NaN, C has none.
df

Unnamed: 0,A,B,C
0,1.0,4.0,7
1,2.0,,8
2,,,9


### Drop NaN using the dropna() function of pandas
    DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
    axis=0 -> Drop row
    axis=1 -> Drop column
    thresh= Minimum number of non-NaN values a row/column must have to be kept (anything with fewer is dropped)

In [4]:
# Drop every row that contains at least one NaN (defaults: axis=0, how='any').
# Returns a new DataFrame; df itself is left unchanged (inplace=False).

df.dropna() 

Unnamed: 0,A,B,C
0,1.0,4.0,7


In [5]:
# Drop every column that contains at least one NaN (axis=1 -> columns).
# Only C survives here because A and B each hold a missing value.

df.dropna(axis=1) 

Unnamed: 0,C
0,7
1,8
2,9


In [6]:
# thresh=2: keep only rows that have at least 2 non-NaN values; rows with fewer are dropped.
# Row 2 holds a single non-NaN value (C), so it is the only row removed.

df.dropna(thresh=2) 

Unnamed: 0,A,B,C
0,1.0,4.0,7
1,2.0,,8


In [7]:
# axis=1, thresh=2: keep only columns that have at least 2 non-NaN values.
# Column B holds a single non-NaN value, so it is the only column removed.

df.dropna(axis=1,thresh=2) 

Unnamed: 0,A,C
0,1.0,7
1,2.0,8
2,,9


In [8]:
# Drop rows that have a NaN in column A or C; NaN in other columns (B) is ignored,
# which is why row 1 is kept even though its B value is missing.

df.dropna(subset=['A', 'C']) 

Unnamed: 0,A,B,C
0,1.0,4.0,7
1,2.0,,8


## Filling the NaN
- DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None)
    - method: The method used for filling:
        - 'ffill': Forward fill (fills NaN with the last valid observation).
        - 'bfill': Backward fill (fills NaN with the next valid observation).
        - Note: the `method` argument is deprecated since pandas 2.1; prefer `DataFrame.ffill()` / `DataFrame.bfill()`.

In [9]:
# Show the frame again before filling: the NaN values are still present.
df

Unnamed: 0,A,B,C
0,1.0,4.0,7
1,2.0,,8
2,,,9


In [10]:
# Replace every NaN in the frame with the constant -100 (returns a new DataFrame).
df.fillna(value=-100)

Unnamed: 0,A,B,C
0,1.0,4.0,7
1,2.0,-100.0,8
2,-100.0,-100.0,9


In [11]:
# Column-wise mean; NaN values are skipped, e.g. A -> (1 + 2) / 2 = 1.5.
df.mean()

A    1.5
B    4.0
C    8.0
dtype: float64

In [12]:
# Fill the NaN of each column with that column's own mean:
# df.mean() is a Series indexed by column name, so each value is matched to its column.

df.fillna(df.mean()) 

Unnamed: 0,A,B,C
0,1.0,4.0,7
1,2.0,4.0,8
2,1.5,4.0,9


In [13]:
# Fill the NaN of a single column (B) with -200.
# The result is a new Series; it is not assigned back, so df is not modified.

df['B'].fillna(value=-200) 

0      4.0
1   -200.0
2   -200.0
Name: B, dtype: float64

In [14]:
# Fill the NaN of several columns (A and B) with -500.
# The result is a new two-column DataFrame; it is not assigned back, so df is not modified.


df[['A','B']].fillna(value=-500) 

Unnamed: 0,A,B
0,1.0,4.0
1,2.0,-500.0
2,-500.0,-500.0


In [15]:
# df still contains its NaN values: none of the fillna results were assigned back.
df

Unnamed: 0,A,B,C
0,1.0,4.0,7
1,2.0,,8
2,,,9
