In [None]:
'''
Handling Missing Values

    - find missing values in a Series
    - find missing values in a DataFrame
    - drop missing values
    - fill in missing values
'''

In [1]:
# imports
import pandas as pd
import numpy as np

In [11]:
# read the file with missing-value detection disabled (na_filter=False),
# so pandas does not convert any entry to NaN while parsing
pd.read_csv(filepath_or_buffer='./data/drinks.csv', na_filter=False)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,AS
1,Albania,89,132,54,4.9,EU
2,Algeria,25,0,14,0.7,AF
3,Andorra,245,138,312,12.4,EU
4,Angola,217,57,45,5.9,AF
5,Antigua & Barbuda,102,128,45,4.9,
6,Argentina,193,25,221,8.3,SA
7,Armenia,21,179,11,3.8,EU
8,Australia,261,72,212,10.4,OC
9,Austria,279,75,191,9.7,EU


In [3]:
# load the working copy with pandas' default missing-value detection
# (na_filter=True) so the NaNs are kept (for demonstration purposes)
drinks = pd.read_csv('./data/drinks.csv', na_filter=True)

In [4]:
# blank out the three servings columns of row 192 to create extra NaNs
# (for demonstration purposes)
drinks.loc[192, ['beer_servings', 'spirit_servings', 'wine_servings']] = np.nan

In [9]:
# (number of rows, number of columns) of the DataFrame
drinks.shape

(193, 6)

In [12]:
# summary methods usually leave missing values out unless told otherwise
drinks.describe(include='all')                    # statistics are computed without the NaNs
drinks['continent'].value_counts(dropna=False)    # dropna=False gives NaN its own row (new in pandas 0.14.1)

AF     53
EU     45
AS     44
NaN    23
OC     16
SA     12
Name: continent, dtype: int64

In [13]:
# summary of every column (numeric and non-numeric); the 'count' row
# only counts the non-missing values in each column
drinks.describe(include='all')

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
count,193,192.0,192.0,192.0,193.0,170
unique,193,,,,,5
top,Mexico,,,,,AF
freq,1,,,,,53
mean,,106.380208,81.322917,49.6875,4.717098,
std,,101.361394,88.397069,79.8379,3.773298,
min,,0.0,0.0,0.0,0.0,
25%,,20.0,4.0,1.0,1.3,
50%,,76.0,56.5,8.5,4.2,
75%,,189.0,128.75,59.75,7.2,


In [19]:
# find missing values in a Series
drinks['continent'].isnull()                 # boolean mask: True where the value is NaN
drinks['continent'].notnull()                # inverse mask: True where a value is present
drinks.loc[drinks['continent'].notnull()]    # keep only the rows whose continent is not NaN
drinks['continent'].isnull().sum()           # number of missing values (each True counts as 1)

23

In [20]:
# find missing values in a DataFrame
drinks.isnull()                # same-shaped DataFrame of True/False flags
drinks.isnull().sum(axis=0)    # number of missing values in each column



country                          0
beer_servings                    1
spirit_servings                  1
wine_servings                    1
total_litres_of_pure_alcohol     0
continent                       23
dtype: int64

In [None]:
# drop missing values (each call returns a new DataFrame)
drinks.dropna(how='any')    # remove every row that contains at least one NaN
drinks.dropna(how='all')    # remove only the rows in which every value is NaN


In [21]:
# fill in missing values
drinks['continent'].fillna(value='NA')                          # returns a new Series; does not modify 'drinks'
# Assign the filled column back instead of calling fillna(inplace=True) on
# 'drinks.continent': that chained in-place call operates on an intermediate
# Series and no longer updates 'drinks' once pandas Copy-on-Write is active
# (it already emits a warning on pandas 2.2).
drinks['continent'] = drinks['continent'].fillna(value='NA')    # modifies 'drinks'
# numeric_only=True restricts the mean to the numeric columns; without it
# pandas >= 2.0 raises a TypeError on the text columns.
drinks.fillna(drinks.mean(numeric_only=True))                   # fill in missing values using each column's mean

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0.000000,0.000000,0.0000,0.0,AS
1,Albania,89.000000,132.000000,54.0000,4.9,EU
2,Algeria,25.000000,0.000000,14.0000,0.7,AF
3,Andorra,245.000000,138.000000,312.0000,12.4,EU
4,Angola,217.000000,57.000000,45.0000,5.9,AF
5,Antigua & Barbuda,102.000000,128.000000,45.0000,4.9,
6,Argentina,193.000000,25.000000,221.0000,8.3,SA
7,Armenia,21.000000,179.000000,11.0000,3.8,EU
8,Australia,261.000000,72.000000,212.0000,10.4,OC
9,Austria,279.000000,75.000000,191.0000,9.7,EU
