### Dealing with missing data

In [2]:
import pandas as pd
import numpy as np

#### Verifying if a value is null

In [None]:
# Scalar checks: the cell evaluates to a tuple with one boolean per value.
(
    pd.isnull(np.nan),  # True: NaN is treated as missing
    pd.isnull(None),    # True: None is treated as missing
)

In [None]:
# pd.isna gives the same answers as pd.isnull for the same inputs.
(
    pd.isna(np.nan),  # True: NaN is treated as missing
    pd.isna(None),    # True: None is treated as missing
)

In [14]:
# An ordinary string is a real value, so both checks report False.
(
    pd.isnull('a'),
    pd.isna('a'),
)

(False, False)

#### Verifying if a value is not null

In [None]:
# notnull is the logical inverse of isnull: missing values give False.
(
    pd.notnull(np.nan),  # False: NaN is missing
    pd.notnull(None),    # False: None is missing
)

In [None]:
# pd.notna gives the same answers as pd.notnull for the same inputs.
(
    pd.notna(np.nan),  # False: NaN is missing
    pd.notna(None),    # False: None is missing
)

In [None]:
# An ordinary string is a real value, so both checks report True.
(
    pd.notnull('a'),
    pd.notna('a'),
)

These functions also work with Series and DataFrames

In [None]:
# Applied to a Series, each function returns a boolean Series of the same
# length; here the mask is False, True, False for both.
(
    pd.isnull(pd.Series([10, np.nan, 1])),
    pd.isna(pd.Series([10, np.nan, 1])),
)

In [None]:
# Element-wise null check on a DataFrame: the result keeps the same shape and
# labels, with True wherever a cell holds None or NaN.
pd.isnull(
    pd.DataFrame(
        data={
            'A': [1, None, 'a'],
            'B': [np.nan, 0.3, 'b'],
            'C': [3, None, np.nan],
        },
        index=['W', 'X', 'Y'],
    )
)

In [None]:
# Same frame checked with pd.isna: the boolean mask is identical to the one
# produced by pd.isnull.
pd.isna(
    pd.DataFrame(
        data={
            'A': [1, None, 'a'],
            'B': [np.nan, 0.3, 'b'],
            'C': [3, None, np.nan],
        },
        index=['W', 'X', 'Y'],
    )
)

#### Pandas operations with missing data

Counting the non-null values

In [10]:
# count() skips NaN, so this returns 2 even though the Series holds three entries.
pd.Series([1, 2, np.nan]).count()

2

Summing the non-null values in the series

In [11]:
# None is stored as NaN in this numeric Series; skipna=True (the default)
# leaves it out of the total, giving 3.0.
pd.Series([1, 2, None]).sum(skipna=True)

3.0

Mean of the non-null values in the series (sum of non-null values / number of non-null values)

In [14]:
# skipna=True (the default) drops NaN from both the sum and the count,
# so the mean is (1 + 2) / 2 = 1.5.
pd.Series([1, 2, np.nan]).mean(skipna=True)

1.5

Getting the number of non-null values through sum()

In [17]:
# Mixed-type Series: two NaN and one None are missing, while the empty
# string '' is a real (non-null) value.
s = pd.Series(
    ['a', 3, np.nan, 1, np.nan, None, 'abc', 0.8, '']
)

# notnull yields a boolean mask; summing it counts the True entries.
print(pd.notnull(s).sum())

6


#### Filtering missing data

In [28]:
# Mixed-type Series with three missing entries (two NaN and one None).
s = pd.Series(
    data=['a', 3, np.nan, 1, np.nan, None, 'abc', 0.8, '#'],
)

In [None]:
# Boolean-mask selection: both expressions keep only the non-null entries of s.
(
    s.loc[pd.notnull(s)],
    s.loc[pd.notna(s)],
)

In [None]:
# Boolean-mask selection: both expressions keep only the missing entries of s.
(
    s.loc[pd.isnull(s)],
    s.loc[pd.isna(s)],
)

#### Dropping null values

In [None]:
# Returns a new Series without the three missing entries of s (positions 2, 4 and 5).
s.dropna()

You can also drop rows or columns that contain null values from a DataFrame

In [41]:
# Example frame for the dropna demonstrations: Column A has two NaN,
# Columns B and C have one each, and Column D is complete.
df = pd.DataFrame(
    data={
        'Column A': [1, np.nan, 30, np.nan],
        'Column B': [2, 8, 31, np.nan],
        'Column C': [np.nan, 9, 32, 100],
        'Column D': [5, 8, 34, 110],
    },
)
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [None]:
# Boolean frame with the same shape as df: True marks each missing cell.
pd.isnull(df)

Seeing how many null values we have per column

In [None]:
# True counts as 1 when summed, so this gives the number of nulls per column
# (for df: Column A 2, Column B 1, Column C 1, Column D 0).
pd.isnull(df).sum()

By default, dropna() drops every row that contains at least one null value

In [None]:
# Row 2 is the only row of df without a NaN, so it is the only row returned.
df.dropna()

If we want to drop columns with null values, we add the argument axis=1 or axis='columns'

In [None]:
# Column D is the only column of df without a NaN, so it is the only one kept.
df.dropna(axis='columns')

If you just want to drop rows where all the values are null, you can use the parameter how='all'

In [39]:
# No row of df is entirely NaN, so all four rows are returned unchanged.
df.dropna(how='all')

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


If you use how='any', you get the same result as a plain dropna(), since how='any' is the default of dropna()

In [40]:
# Any row with at least one NaN is dropped; only the complete row 2 remains.
df.dropna(how='any')

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


If you want to keep only the rows that have at least a given number of non-null values, use thresh=number

In [44]:
# Rows 0, 1 and 2 each have at least 3 non-null values; row 3 has only 2 and is dropped.
df.dropna(thresh=3)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


If you want to keep only the columns that have at least a given number of non-null values, use thresh=number together with axis=1 or axis='columns'

In [45]:
# Column A has only 2 non-null values and is dropped; Columns B, C and D
# each have at least 3 and are kept.
df.dropna(axis='columns', thresh=3)

Unnamed: 0,Column B,Column C,Column D
0,2.0,,5
1,8.0,9.0,8
2,31.0,32.0,34
3,,100.0,110
