In [2]:
import pandas as pd
import numpy as np

# Toy roster with deliberately missing values. Every list holds six entries;
# the last row is entirely NaN, and the column labelled NaN is empty in every
# row, so both row-wise and column-wise dropna calls have something to remove.
raw_data = {
    'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy', np.nan],
    'last_name': ['Miller', 'Jacobson', np.nan, 'Milner', 'Cooze', np.nan],
    'age': [42, 52, 36, 24, 73, np.nan],
    'preTestScore': [4, 24, 31, np.nan, np.nan, np.nan],
    # Plain numbers (not "25,000"-style strings) so the column is numeric
    # instead of a mix of str and int objects.
    'postTestScore': [25000, 94000, 57, 62, 70, np.nan],
    # A single scalar NaN under a NaN label: yields the all-missing column.
    np.nan: np.nan,
}
df = pd.DataFrame(
    raw_data,
    columns=['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore', np.nan],
)

In [3]:
# Display the whole frame (only six rows) to see where the missing values sit.
df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,NaN
0,Jason,Miller,42.0,4.0,25000.0,
1,Molly,Jacobson,52.0,24.0,94000.0,
2,Tina,,36.0,31.0,57.0,
3,Jake,Milner,24.0,,62.0,
4,Amy,Cooze,73.0,,70.0,
5,,,,,,


In [4]:
# Boolean frame of the same shape: True where a value is present, False where it is NaN.
df.notna()

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,NaN
0,True,True,True,True,True,False
1,True,True,True,True,True,False
2,True,False,True,True,True,False
3,True,True,True,False,True,False
4,True,True,True,False,True,False
5,False,False,False,False,False,False


In [6]:
# Keep only the rows whose preTestScore is present.
has_pre_score = df['preTestScore'].notna()
df.loc[has_pre_score]

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,NaN
0,Jason,Miller,42.0,4.0,25000,
1,Molly,Jacobson,52.0,24.0,94000,
2,Tina,,36.0,31.0,57,


In [3]:
# Drop every row that contains at least one NaN.
# The result is empty here: the column labelled NaN is missing in every row.
df.dropna(axis=0, how='any')

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,NaN


In [6]:
# Drop only the rows in which every single value is NaN.
df.dropna(axis=0, how='all')

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,NaN
0,Jason,Miller,42.0,4.0,25000,
1,Molly,Jacobson,52.0,24.0,94000,
2,Tina,,36.0,31.0,57,
3,Jake,Milner,24.0,,62,
4,Amy,Cooze,73.0,,70,


In [8]:
# `cleaned` is a second name for the same object as `df` (plain assignment
# makes no copy), so the in-place drop below changes `df` as well.
cleaned = df

# inplace=True modifies the existing frame rather than returning a new one
cleaned.dropna(axis=0, how='all', inplace=True)
cleaned

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,NaN
0,Jason,Miller,42.0,4.0,25000,
1,Molly,Jacobson,52.0,24.0,94000,
2,Tina,,36.0,31.0,57,
3,Jake,Milner,24.0,,62,
4,Amy,Cooze,73.0,,70,


In [9]:
# Per-column defaults: a NaN in a listed column is replaced by that column's value;
# columns not listed are left untouched.
fill_defaults = {'preTestScore': 20, 'last_name': 'Doe'}
cleaned.fillna(value=fill_defaults, inplace=True)
cleaned

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,NaN
0,Jason,Miller,42.0,4.0,25000,
1,Molly,Jacobson,52.0,24.0,94000,
2,Tina,Doe,36.0,31.0,57,
3,Jake,Milner,24.0,20.0,62,
4,Amy,Cooze,73.0,20.0,70,


In [10]:
# Drop every column whose values are all NaN (axis selects columns instead of rows).
cleaned.dropna(axis='columns', how='all')


Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore
0,Jason,Miller,42.0,4.0,25000
1,Molly,Jacobson,52.0,24.0,94000
2,Tina,Doe,36.0,31.0,57
3,Jake,Milner,24.0,20.0,62
4,Amy,Cooze,73.0,20.0,70


In [11]:
# Work on an independent copy so the NaN injected below does not leak back into `df`.
dirty_df = df.copy()
# Put NaN into row 4, column first_name
dirty_df.at[4, 'first_name'] = np.nan
dirty_df

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,NaN
0,Jason,Miller,42.0,4.0,25000,
1,Molly,Jacobson,52.0,24.0,94000,
2,Tina,Doe,36.0,31.0,57,
3,Jake,Milner,24.0,20.0,62,
4,,Cooze,73.0,20.0,70,


In [12]:
# Keep only the rows holding at least this many real (non-NaN) values; drop the rest.
min_real_values = 5
dirty_df.dropna(axis=0, thresh=min_real_values)

Unnamed: 0,first_name,last_name,age,preTestScore,postTestScore,NaN
0,Jason,Miller,42.0,4.0,25000,
1,Molly,Jacobson,52.0,24.0,94000,
2,Tina,Doe,36.0,31.0,57,
3,Jake,Milner,24.0,20.0,62,
