![title](Header__0009_1.png "Header")
___
# Chapter 1 - Data Munging Basics
## Segment 2 - Treating missing values

In [10]:
import numpy as np
import pandas as pd 

from pandas import Series, DataFrame

### Figuring out what data is missing

In [11]:
# NumPy's NaN is the sentinel used for "no value here" throughout this notebook.
missing = np.nan

# Eight entries; positions 2 and 6 are deliberately left missing.
series_obj = Series([
    'row 1', 'row 2', missing, 'row 4',
    'row 5', 'row 6', missing, 'row 8',
])
series_obj

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object

In [12]:
# object_name.isnull()
# ♔┈♔┈♔┈( WHAT THIS DOES )┈♔┈♔┈♔
# Calling .isnull() on a Pandas object gives back an object of the same shape holding one Boolean 
# per element: True where the element is a null (missing) value, False everywhere else.
series_obj.isnull()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

### Filling in for missing values

In [13]:
# Fix the seed so the random values below are reproducible on every run.
np.random.seed(25)
# 36 standard-normal draws laid out as a 6 x 6 frame (filled row by row).
DF_obj = DataFrame(np.random.randn(6, 6))
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942
3,1.05661,-0.419678,2.294842,-2.594487,2.822756,0.680889
4,-1.577693,-1.976254,0.53334,-0.29087,-0.51352,1.982626
5,0.226001,-1.839905,1.607671,0.388292,0.399732,0.405477


In [14]:
# Blank out two ranges of cells to simulate missing data.
# .loc is label-based and its slices include BOTH endpoints, so this sets rows 3-5 of column 0 
# and rows 1-4 of column 5 to NaN. (.loc replaces the old .ix indexer, which was deprecated in 
# pandas 0.20 and removed in pandas 1.0.)
DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,
3,,-0.419678,2.294842,-2.594487,2.822756,
4,,-1.976254,0.53334,-0.29087,-0.51352,
5,,-1.839905,1.607671,0.388292,0.399732,0.405477


In [15]:
# object_name.fillna(numeric value)
# ♔┈♔┈♔┈( WHAT THIS DOES )┈♔┈♔┈♔
# The .fillna() method locates every missing value inside a Pandas object and substitutes the 
# numeric value you pass in. It returns a new object; DF_obj itself is left untouched.
filled_DF = DF_obj.fillna(value=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,0.0
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,0.0
3,0.0,-0.419678,2.294842,-2.594487,2.822756,0.0
4,0.0,-1.976254,0.53334,-0.29087,-0.51352,0.0
5,0.0,-1.839905,1.607671,0.388292,0.399732,0.405477


In [17]:
# object_name.fillna(dict)
# ♔┈♔┈♔┈( WHAT THIS DOES )┈♔┈♔┈♔
# .fillna() also accepts a dictionary. Each key names a column Series and each value is the 
# replacement used for the missing entries of that column only; columns not listed are left as is.
# Here column 0 is filled with 0.1 and column 5 with 1.25.
filled_DF = DF_obj.fillna(value={0: 0.1, 5: 1.25})
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,1.25
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,1.25
3,0.1,-0.419678,2.294842,-2.594487,2.822756,1.25
4,0.1,-1.976254,0.53334,-0.29087,-0.51352,1.25
5,0.1,-1.839905,1.607671,0.388292,0.399732,0.405477


In [18]:
# object_name.ffill()
# ♔┈♔┈♔┈( WHAT THIS DOES )┈♔┈♔┈♔
# The .ffill() method fills forward: every missing value is replaced with the last non-null 
# element above it in the same column Series. It produces the same result as 
# .fillna(method='ffill'), a spelling that recent pandas releases deprecate.
fill_DF = DF_obj.ffill()
fill_DF

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.222326
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-0.222326
3,2.152957,-0.419678,2.294842,-2.594487,2.822756,-0.222326
4,2.152957,-1.976254,0.53334,-0.29087,-0.51352,-0.222326
5,2.152957,-1.839905,1.607671,0.388292,0.399732,0.405477


### Counting missing values

In [21]:
# Rebuild the same seeded 6 x 6 frame so this section starts from a known state.
np.random.seed(25)
DF_obj = DataFrame(np.random.randn(36).reshape(6,6))
# Label-based, endpoint-inclusive assignment: rows 3-5 of column 0 and rows 1-4 of column 5 
# become NaN. (.loc replaces the .ix indexer, which was removed in pandas 1.0.)
DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,
3,,-0.419678,2.294842,-2.594487,2.822756,
4,,-1.976254,0.53334,-0.29087,-0.51352,
5,,-1.839905,1.607671,0.388292,0.399732,0.405477


In [22]:
# object_name.isnull().sum()
# ♔┈♔┈♔┈( WHAT THIS DOES )┈♔┈♔┈♔
# To count the missing values in each column of a DataFrame, first call .isnull() to get the 
# matrix of Boolean values, then call .sum() on that matrix. Summing down the rows (axis=0, the 
# default) counts every True as 1, giving one total per column.
DF_obj.isnull().sum(axis=0)

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

### Filtering out missing values

In [30]:
# object_name.dropna(axis=1)
# ♔┈♔┈♔┈( WHAT THIS DOES )┈♔┈♔┈♔
# Called with no arguments, .dropna() identifies and drops all ROWS of a DataFrame that contain 
# ANY missing values. Passing the axis=1 argument, as done here, makes the method search the 
# DataFrame by column instead, so every COLUMN that contains any missing value is dropped. 
# Columns 0 and 5 hold the NaNs in DF_obj, so only columns 1-4 remain in the result.
DF_no_NaN = DF_obj.dropna(axis=1)
DF_no_NaN

Unnamed: 0,1,2,3,4
0,1.02689,-0.839585,-0.591182,-0.956888
1,1.837905,-2.053231,0.868583,-0.920734
2,-1.334661,0.07638,-1.246089,1.202272
3,-0.419678,2.294842,-2.594487,2.822756
4,-1.976254,0.53334,-0.29087,-0.51352
5,-1.839905,1.607671,0.388292,0.399732


In [31]:
# object_name.dropna(how='all')
# ♔┈♔┈♔┈( WHAT THIS DOES )┈♔┈♔┈♔
# Passing how='all' to .dropna() drops only those rows in which EVERY value is missing; rows 
# with at least one non-null value are kept. No row of DF_obj is entirely NaN, so the frame 
# comes back unchanged here.
DF_obj.dropna(axis=0, how='all')

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,
3,,-0.419678,2.294842,-2.594487,2.822756,
4,,-1.976254,0.53334,-0.29087,-0.51352,
5,,-1.839905,1.607671,0.388292,0.399732,0.405477
