# Deal with missing values

In [1]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

### Generate Series with missing values

In [2]:
# NaN is the marker pandas treats as a missing entry.
missing = np.nan

# Five labels, with the third one (index 2) deliberately left missing.
series_values = ['row 1', 'row 2', missing, 'row 4', 'row 5']
series = Series(series_values)
print(series)

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
dtype: object


### Find missing values in a Series with `isnull()`

In [3]:
# Boolean mask: True where the Series entry is missing (NaN), False elsewhere.
compare_null = pd.isnull(series)
print(compare_null)

0    False
1    False
2     True
3    False
4    False
dtype: bool


### Generate a DataFrame of random values (missing values are added and filled in below)

In [4]:
# Seed NumPy's global RNG so the random values below are reproducible.
np.random.seed(25)

# 36 uniform random numbers arranged as a 6x6 grid with labelled rows and columns.
row_labels = ['row 1', 'row 2', 'row 3', 'row 4', 'row 5', 'row 6']
col_labels = ['col 1', 'col 2', 'col 3', 'col 4', 'col 5', 'col 6']
random_values = np.random.rand(36).reshape(6, 6)
df = DataFrame(random_values, index=row_labels, columns=col_labels)

print(df)

          col 1     col 2     col 3     col 4     col 5     col 6
row 1  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
row 2  0.684969  0.437611  0.556229  0.367080  0.402366  0.113041
row 3  0.447031  0.585445  0.161985  0.520719  0.326051  0.699186
row 4  0.366395  0.836375  0.481343  0.516502  0.383048  0.997541
row 5  0.514244  0.559053  0.034450  0.719930  0.421004  0.436935
row 6  0.281701  0.900274  0.669612  0.456069  0.289804  0.525819


### Replace some cells with missing values

In [5]:
# Overwrite two label-based row ranges (both endpoints inclusive) with the
# missing marker. This modifies `df` in place.
col3_rows = slice('row 2', 'row 5')
col6_rows = slice('row 3', 'row 6')
df.loc[col3_rows, 'col 3'] = missing
df.loc[col6_rows, 'col 6'] = missing
print(df)

          col 1     col 2     col 3     col 4     col 5     col 6
row 1  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
row 2  0.684969  0.437611       NaN  0.367080  0.402366  0.113041
row 3  0.447031  0.585445       NaN  0.520719  0.326051       NaN
row 4  0.366395  0.836375       NaN  0.516502  0.383048       NaN
row 5  0.514244  0.559053       NaN  0.719930  0.421004       NaN
row 6  0.281701  0.900274  0.669612  0.456069  0.289804       NaN


### Fill missing values (NaN) with value 0

In [6]:
# Replace every NaN with 0; `df` itself is left unchanged.
filled_df = df.fillna(value=0)
print(filled_df)

          col 1     col 2     col 3     col 4     col 5     col 6
row 1  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
row 2  0.684969  0.437611  0.000000  0.367080  0.402366  0.113041
row 3  0.447031  0.585445  0.000000  0.520719  0.326051  0.000000
row 4  0.366395  0.836375  0.000000  0.516502  0.383048  0.000000
row 5  0.514244  0.559053  0.000000  0.719930  0.421004  0.000000
row 6  0.281701  0.900274  0.669612  0.456069  0.289804  0.000000


### Fill missing values (NaN) with a specific value for each column

In [7]:
# Map each column label to its own replacement value; columns not listed
# in the mapping are not filled.
fill_values = {'col 3': 0.25, 'col 6': 0.33}
filled_df = df.fillna(value=fill_values)
print(filled_df)

          col 1     col 2     col 3     col 4     col 5     col 6
row 1  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
row 2  0.684969  0.437611  0.250000  0.367080  0.402366  0.113041
row 3  0.447031  0.585445  0.250000  0.520719  0.326051  0.330000
row 4  0.366395  0.836375  0.250000  0.516502  0.383048  0.330000
row 5  0.514244  0.559053  0.250000  0.719930  0.421004  0.330000
row 6  0.281701  0.900274  0.669612  0.456069  0.289804  0.330000


### Fill missing values (NaN) using a fill method (forward fill)

In [8]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# `DataFrame.ffill()` is used instead of `fillna(method='ffill')`, whose
# `method` argument is deprecated since pandas 2.1 and rejected by pandas 3.x.
forward_filled_df = df.ffill()
print(forward_filled_df)

          col 1     col 2     col 3     col 4     col 5     col 6
row 1  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
row 2  0.684969  0.437611  0.278839  0.367080  0.402366  0.113041
row 3  0.447031  0.585445  0.278839  0.520719  0.326051  0.113041
row 4  0.366395  0.836375  0.278839  0.516502  0.383048  0.113041
row 5  0.514244  0.559053  0.278839  0.719930  0.421004  0.113041
row 6  0.281701  0.900274  0.669612  0.456069  0.289804  0.113041


### Count missing values (NaN)

In [9]:
# Number of missing cells in each column.
count_missing = df.isna().sum()
print(count_missing)

# Grand total computed from scratch: sum per column, then sum those sums.
total_missing = df.isna().sum().sum()
print('Total number of missing values is ' + str(total_missing))

# Same grand total, reusing the per-column counts computed above.
total_missing_short = count_missing.sum()
print('Total missing ' + str(total_missing_short))

col 1    0
col 2    0
col 3    4
col 4    0
col 5    0
col 6    4
dtype: int64
Total number of missing values is 8
Total missing 8


### Drop rows that contain missing values (NaN)

In [10]:
# Keep only rows with no NaN at all (the defaults, spelled out explicitly).
no_null_df = df.dropna(axis=0, how='any')
print(no_null_df)

          col 1     col 2     col 3     col 4   col 5     col 6
row 1  0.870124  0.582277  0.278839  0.185911  0.4111  0.117376


### Drop columns that contain missing values (NaN)

In [11]:
# Keep only columns with no NaN at all.
no_null_df = df.dropna(axis='columns')
print(no_null_df)

          col 1     col 2     col 4     col 5
row 1  0.870124  0.582277  0.185911  0.411100
row 2  0.684969  0.437611  0.367080  0.402366
row 3  0.447031  0.585445  0.520719  0.326051
row 4  0.366395  0.836375  0.516502  0.383048
row 5  0.514244  0.559053  0.719930  0.421004
row 6  0.281701  0.900274  0.456069  0.289804


### Drop rows in which ALL values are missing (NaN)

In [12]:
# Drop a row only when every one of its cells is NaN. No row qualifies yet,
# so the printed frame matches `df`.
no_null_df = df.dropna(axis='index', how='all')
print(no_null_df)

          col 1     col 2     col 3     col 4     col 5     col 6
row 1  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
row 2  0.684969  0.437611       NaN  0.367080  0.402366  0.113041
row 3  0.447031  0.585445       NaN  0.520719  0.326051       NaN
row 4  0.366395  0.836375       NaN  0.516502  0.383048       NaN
row 5  0.514244  0.559053       NaN  0.719930  0.421004       NaN
row 6  0.281701  0.900274  0.669612  0.456069  0.289804       NaN


In [13]:
# Blank out 'row 2' from 'col 1' through 'col 6' (modifies `df` in place), so
# that one row consists entirely of NaN.
blanked_columns = slice('col 1', 'col 6')
df.loc['row 2', blanked_columns] = missing

# With how='all', only the fully-missing row is removed.
no_null_df = df.dropna(axis='index', how='all')
print(no_null_df)

          col 1     col 2     col 3     col 4     col 5     col 6
row 1  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
row 3  0.447031  0.585445       NaN  0.520719  0.326051       NaN
row 4  0.366395  0.836375       NaN  0.516502  0.383048       NaN
row 5  0.514244  0.559053       NaN  0.719930  0.421004       NaN
row 6  0.281701  0.900274  0.669612  0.456069  0.289804       NaN
