# Missing Data

Let's look at a few convenient methods for dealing with missing data in pandas:

In [1]:
import numpy as np
import pandas as pd

In [15]:
# Build the example frame from a dict: each key is a column label and each list
# holds that column's values, so labels and data live in one structure (building
# from plain lists would need the headers passed separately from the values).
# np.nan marks the missing entries.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    },
    index=['R1', 'R2', 'R3'],  # explicit row labels
)

In [16]:
# Label the (single-level) row index so it shows up as 'ID' in the output.
df.index.name = 'ID'

In [17]:
# Display the frame: rows are labelled by the 'ID' index, and NaN marks the
# missing entries in columns A and B.
df

Unnamed: 0_level_0,A,B,C
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R1,1.0,5.0,1
R2,2.0,,2
R3,,,3


In [18]:
# Drop every ROW that contains at least one NaN. These are dropna's defaults
# (axis=0, how='any'), written out here for clarity.
df.dropna(axis=0, how='any')

Unnamed: 0_level_0,A,B,C
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R1,1.0,5.0,1


In [19]:
# Drop every COLUMN that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0_level_0,C
ID,Unnamed: 1_level_1
R1,1
R2,2
R3,3


In [23]:
# Keep only the rows that have at least 2 non-NaN values; rows with fewer
# are dropped.
df.dropna(axis='index', thresh=2)

Unnamed: 0_level_0,A,B,C
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R1,1.0,5.0,1
R2,2.0,,2


In [24]:
# Keep only the columns that have at least 2 non-NaN values; columns with
# fewer are dropped.
df.dropna(thresh=2, axis='columns')

Unnamed: 0_level_0,A,C
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
R1,1.0,1
R2,2.0,2
R3,,3


In [25]:
# Replace every NaN in the frame with the given fill value (passed as
# fillna's `value` argument). Returns a new frame; df itself is unchanged.
df.fillna('FILL VALUE')

Unnamed: 0_level_0,A,B,C
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
R1,1,5,1
R2,2,FILL VALUE,2
R3,FILL VALUE,FILL VALUE,3


In [31]:
# Fill the NaNs in column A with that column's mean and return the result as a
# Series. mean() skips NaN by default, so here it is (1 + 2) / 2 = 1.5.
df['A'].fillna(df['A'].mean())

ID
R1    1.0
R2    2.0
R3    1.5
Name: A, dtype: float64

# Great Job!