In [1]:
# 02-Missing-Data-01

In [4]:
# Build a small demo frame that contains missing values:
# row 1 is missing in every column, row 2 is missing both test scores.
import numpy as np
import pandas as pd

column_order = ['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore']

data = dict(
    first_name=['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    age=[42, np.nan, 36, 24, 73],
    sex=['m', np.nan, 'f', 'm', 'f'],
    preTestScore=[4, np.nan, np.nan, 2, 3],
    postTestScore=[25, np.nan, np.nan, 62, 70],
)

# `columns=` pins the column order explicitly rather than relying on dict order.
df = pd.DataFrame(data, columns=column_order)
df


Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [6]:
# Keep only complete observations: any row with at least one missing value is removed.
# (axis=0 / how='any' are the dropna defaults, spelled out here for the reader.)
df_miss = df.dropna(axis=0, how='any')
df_miss

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [8]:
# Remove only the rows in which every cell is NA; partially filled rows are kept.
df_miss_allcells = df.dropna(axis=0, how='all')
df_miss_allcells

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [9]:
# Add a `location` column in which every value is missing (NaN).
df.loc[:, 'location'] = np.nan
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [10]:
# Drop rows that contain fewer than five non-missing values.
# The `thresh` parameter lets you specify the minimum number of non-null values
# a row (axis=0) or column (axis=1) must have in order to remain.
# NOTE: this does NOT drop all-NaN columns. To drop columns that contain only
# missing values (mostly useful for time series), use
# df.dropna(axis=1, how='all') instead.

df.dropna(axis=0, thresh=5)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [13]:
# Replace every missing value with 0. This applies to all columns, including
# the text ones, and returns a new frame; df itself is left untouched.
df.fillna(value=0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,0.0
1,0,0,0.0,0,0.0,0.0,0.0
2,Tina,Ali,36.0,f,0.0,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0,0.0
4,Amy,Cooze,73.0,f,3.0,70.0,0.0


In [14]:
# Fill in missing values in preTestScore with the mean value of preTestScore.

# The filled Series is assigned back to the column so the change is saved to df.
# Calling fillna(..., inplace=True) on df['preTestScore'] is a chained in-place
# operation on an intermediate Series: it raises a FutureWarning on pandas 2.x
# and does not modify df when Copy-on-Write is enabled.

df['preTestScore'] = df['preTestScore'].fillna(df['preTestScore'].mean())
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [15]:
# Fill in missing values in postTestScore with the mean postTestScore of the row's sex.
# transform('mean') returns one group mean per original row, aligned on df's index.
# Rows whose sex is missing belong to no group, so they get NaN and stay unfilled.
# The result is assigned back to the column instead of using a chained
# fillna(..., inplace=True), which does not update df under Copy-on-Write.
df['postTestScore'] = df['postTestScore'].fillna(
    df.groupby('sex')['postTestScore'].transform('mean')
)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,3.0,,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [19]:
# Keep only the rows in which both age and sex are present.
has_age_and_sex = df[['age', 'sex']].notna().all(axis=1)
df_1 = df.loc[has_age_and_sex]
df_1

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
2,Tina,Ali,36.0,f,3.0,70.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,
