In [1]:
# dependencies
import datetime as dt
import numpy as np
import pandas as pd

In [2]:
# support methods
def pretty_print(label, item):
    """Print ``label`` padded to a 20-character column, followed by ``item``.

    Args:
        label: Value for the left column; formatted with the spec ``'20'``
            (a string label is left-aligned and space-padded to width 20).
        item: Value printed directly after the label column, using its
            default formatting.
    """
    label_column = format(label, '20')
    item_text = format(item, '')
    print(label_column + item_text)

## checking for missingness in python

In [3]:
# Spelling note: NumPy's NaN is written "np.nan" here. The original author
# observed that "np.NaN" was accepted as an alternative spelling, whereas
# pandas only accepts the exact capitalisation "pd.NaT" ("pd.nat" is not accepted).
# NOTE(review): NumPy 2.0+ is understood to have removed the "np.NaN" alias --
# confirm against the installed version.
#
# nulls: the four null sentinels under test
# (None, float NaN, NumPy datetime NaT, pandas NaT).
nulls = [None, np.nan, np.datetime64('NaT'), pd.NaT]
# test_cases: the integers 1..5 interleaved with the same four kinds of null.
# The np.datetime64('NaT') here comes from a second constructor call, so it is
# a different object from the one stored in `nulls`.
test_cases = [1, None, 2, np.nan, 3, np.datetime64('NaT'), 4, pd.NaT, 5]

What are the available null values and corresponding types?

In [4]:
# Show each null sentinel next to the Python type that backs it.
for val in nulls:
    print('{} is {}'.format(val, type(val)))

None is <class 'NoneType'>
nan is <class 'float'>
NaT is <class 'numpy.datetime64'>
NaT is <class 'pandas._libs.tslibs.nattype.NaTType'>


In [5]:
# pure python approach
# Each entry is (header, underline, predicate). A test value is printed when
# its predicate is truthy, i.e. when that technique did NOT flag it as null.
# The predicates are deliberately the naive comparisons being evaluated.
null_checks = [
    ('By Boolean', '==========',
     lambda v: v),
    ('\nBy using !=', '===========',
     lambda v: (v != None) & (v != np.nan) & (v != np.datetime64('NaT')) & (v != pd.NaT)),
    ('\nBy using ==', '===========',
     lambda v: not ((v == None) | (v == np.nan) | (v == np.datetime64('NaT')) | (v == pd.NaT))),
    ('\nBy using is in', '==============',
     lambda v: v not in nulls),
]

for header, underline, is_kept in null_checks:
    print(header)
    print(underline)
    for val in test_cases:
        if is_kept(val):
            pretty_print('type:', type(val))
            pretty_print('val:', val)
            print()

By Boolean
type:               <class 'int'>
val:                1

type:               <class 'int'>
val:                2

type:               <class 'float'>
val:                nan

type:               <class 'int'>
val:                3

type:               <class 'numpy.datetime64'>
val:                NaT

type:               <class 'int'>
val:                4

type:               <class 'pandas._libs.tslibs.nattype.NaTType'>
val:                NaT

type:               <class 'int'>
val:                5


By using !=
type:               <class 'int'>
val:                1

type:               <class 'int'>
val:                2

type:               <class 'float'>
val:                nan

type:               <class 'int'>
val:                3

type:               <class 'numpy.datetime64'>
val:                NaT

type:               <class 'int'>
val:                4

type:               <class 'pandas._libs.tslibs.nattype.NaTType'>
val:                NaT

type:          

  if (val != None) & (val != np.nan) & (val != np.datetime64('NaT')) & (val != pd.NaT):
  if not ((val == None) | (val == np.nan) | (val == np.datetime64('NaT')) | (val == pd.NaT)):
  if val not in nulls:


**Takeaway:** 

Okay! So Python can catch its own `None` value with standard library code, and has mixed results with some soon-to-be-deprecated approaches. Hmmm...

In [6]:
# Why did so many of the != and == checks fail?
# Matching nulls: compare each kind of sentinel with itself using ==.
self_comparisons = (
    ('None == None:\t\t\t\t\t', None == None),
    ('np.nan == np.nan:\t\t\t\t', np.nan == np.nan),
    ('np.datetime64(NaT) == np.datetime64(NaT):\t', np.datetime64('NaT') == np.datetime64('NaT')),
    ('pd.NaT == pd.NaT:\t\t\t\t', pd.NaT == pd.NaT),
)
for label, outcome in self_comparisons:
    print(label, outcome)

None == None:					 True
np.nan == np.nan:				 False
np.datetime64(NaT) == np.datetime64(NaT):	 False
pd.NaT == pd.NaT:				 False


**Takeaway:** 

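Weird, but by design: NaN and NaT compare unequal to everything, including themselves, so `==` and `!=` can never detect them. Moving on to numpy methods!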

In [7]:
# numpy approach
# np.isnan() raises for some of the test values (None and pd.NaT in the
# recorded output), so each call is wrapped and the failure is reported
# instead of stopping the cell.
print('By using np.isnan()')
print('===================')
for val in test_cases:
    try:
        if not np.isnan(val):
            print(val)
    except Exception as exc:
        # Name the exception that was actually raised. Formatting from
        # `Exception.__class__` would use the metaclass `type` and therefore
        # print "TypeError" for every failure, whatever its real class.
        print(f'{val} of type {type(val)} raised a {type(exc).__name__}')

print('\nBy combining pure python and np.isnan()')
print('=========================================')
for val in test_cases:
    try:
        # The truthiness test short-circuits for None, so np.isnan() is only
        # reached for truthy values.
        if (val) and (not np.isnan(val)):
            print(val)
    except Exception as exc:
        print(f'{val} of type {type(val)} raised a {type(exc).__name__}')

By using np.isnan()
1
None of type <class 'NoneType'> raised a TypeError
2
3
4
NaT of type <class 'pandas._libs.tslibs.nattype.NaTType'> raised a TypeError
5

By combining pure python and np.isnan()
1
2
3
4
NaT of type <class 'pandas._libs.tslibs.nattype.NaTType'> raised a TypeError
5


**Takeaway:** 

Okay! So numpy can flag numpy's own null values and nothing else. Pairing it with a pure Python truthiness check takes care of `None`, but `pd.NaT` still raises a `TypeError`.

In [8]:
# pandas approach
print('By using pd.isnull()')
print('====================')
# Skip every value pd.isnull() flags; print whatever is left.
for val in test_cases:
    if pd.isnull(val):
        continue
    print(val)

By using pd.isnull()
1
2
3
4
5


**Takeaway:** 

Nice! `pd.isnull()` seems to cover all the bases in one move. Does it work the same with `df.isnull()` and `df.col.isnull()`?

In [9]:
# Put all four sentinels into one column named 'val'. The recorded output
# below shows every row rendered as NaT.
# NOTE(review): presumably pandas infers a single datetime-like dtype for the
# mixed nulls -- confirm with null_df.dtypes.
null_df = pd.DataFrame(nulls, columns=['val'])
# Print the raw Python list for comparison with the frame displayed below.
print(nulls)
null_df

[None, nan, numpy.datetime64('NaT'), NaT]


Unnamed: 0,val
0,NaT
1,NaT
2,NaT
3,NaT


**Takeaway:**

Uhhhh. Why.

In [10]:
# One-row frame holding only None; reused by the isna()/isnull() comparison
# cell further down.
none_df = pd.DataFrame([None], columns=['val_none'])
none_df

Unnamed: 0,val_none
0,


In [11]:
# One-row frame holding only np.nan; reused by the isna()/isnull() comparison
# cell further down.
nan_df = pd.DataFrame([np.nan], columns=['val_nan'])
nan_df

Unnamed: 0,val_nan
0,


In [12]:
# One-row frame holding only NumPy's NaT; reused by the isna()/isnull()
# comparison cell further down.
np_nat_df = pd.DataFrame([np.datetime64('NaT')], columns=['val_np_nat'])
np_nat_df

Unnamed: 0,val_np_nat
0,NaT


In [23]:
# One-row frame holding only pandas' NaT; reused by the isna()/isnull()
# comparison cell further down.
pd_nat_df = pd.DataFrame([pd.NaT], columns=['val_pd_nat'])
pd_nat_df

Unnamed: 0,val_pd_nat
0,NaT


**Takeaway:**

Okay, so pandas must be changing the sentinel values on the backend when more than one type is provided. There's probably documentation explaining the specific judgment call that executes the upcast. Is it an upcast?

In [25]:
# The four single-sentinel frames, in the order they are reported.
single_null_frames = [none_df, nan_df, np_nat_df, pd_nat_df]
last_idx = len(single_null_frames) - 1

# back to df.isna()
print('By using df.isna()')
print('====================')
print(f'Result is {type(none_df.isna())}\n')
for idx, frame in enumerate(single_null_frames):
    # The final frame gets an extra newline to separate the sections.
    print(frame.isna(), '\n\n' if idx == last_idx else '\n')

# back to df.isnull()
print('By using df.isnull()')
print('====================')
print(f'Result is {type(none_df.isnull())}\n')
for idx, frame in enumerate(single_null_frames):
    print(frame.isnull(), '\n\n' if idx == last_idx else '\n')

# df.col.isnull()
print('By using df.col.isnull()')
print('========================')
print(f'Result is {type(none_df.val_none.isnull())}\n')
for idx, frame in enumerate(single_null_frames):
    # Each frame has exactly one column; select it by name to get a Series.
    column_mask = frame[frame.columns[0]].isnull()
    if idx == last_idx:
        print(column_mask)
    else:
        print(column_mask, '\n')

By using df.isna()
Result is <class 'pandas.core.frame.DataFrame'>

   val_none
0      True 

   val_nan
0     True 

   val_np_nat
0        True 

   val_pd_nat
0        True 


By using df.isnull()
Result is <class 'pandas.core.frame.DataFrame'>

   val_none
0      True 

   val_nan
0     True 

   val_np_nat
0        True 

   val_pd_nat
0        True 


By using df.col.isnull()
Result is <class 'pandas.core.series.Series'>

0    True
Name: val_none, dtype: bool 

0    True
Name: val_nan, dtype: bool 

0    True
Name: val_np_nat, dtype: bool 

0    True
Name: val_pd_nat, dtype: bool


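Having both `df.isna()` and `df.isnull()` is a naming convention inherited from R, where `NA` and `NULL` are distinct concepts (described in [this blog post](https://www.r-bloggers.com/2010/04/r-na-vs-null/)). In pandas, `isnull()` is simply an alias of `isna()`, so the two give identical results, as the output above shows.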

In [14]:
# custom method exploiting built-in to_string()
def check_null(val):
    """Flag ``val`` as null when its string form reads like a null sentinel.

    The value is converted with ``str()``, lower-cased, and compared against
    ``'none'``, ``'nan'`` and ``'nat'``.  Only the text is inspected, so an
    ordinary string such as ``'None'`` or ``'NaN'`` is flagged as well.

    Args:
        val: Any object that ``str()`` can convert.

    Returns:
        int: ``1`` if the lower-cased text equals one of the three sentinel
        spellings, otherwise ``0``.
    """
    null_spellings = ('none', 'nan', 'nat')
    text = str(val).lower()
    return 1 if text in null_spellings else 0

def lazy_isnan(val):
    """Apply ``check_null`` to a scalar, or element-wise to a list.

    Args:
        val: A single value, or a list (including list subclasses) of values.

    Returns:
        For a list input, a new list with one ``check_null`` result per
        element (an empty list yields an empty list); otherwise the
        ``check_null`` result for ``val`` itself.
    """
    # isinstance() rather than ``type(val) == list`` so that list subclasses
    # are also checked element by element instead of being stringified whole.
    if isinstance(val, list):
        return [check_null(element) for element in val]
    return check_null(val)

print('\nBy using custom method')
print('======================')
# Report only the values that lazy_isnan() does not flag as null.
for val in test_cases:
    if lazy_isnan(val):
        continue
    print('val type:\t', type(val))
    print('val:\t\t', val)
    print()


By using custom method
val type:	 <class 'int'>
val:		 1

val type:	 <class 'int'>
val:		 2

val type:	 <class 'int'>
val:		 3

val type:	 <class 'int'>
val:		 4

val type:	 <class 'int'>
val:		 5



In [15]:
# Time pd.isnull() called once per element of `nulls` via a list comprehension.
%timeit [pd.isnull(val) for val in nulls]

1.06 µs ± 5.87 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [16]:
# Time a single pd.isnull() call that receives the whole `nulls` list.
%timeit pd.isnull(nulls)

10.5 µs ± 23 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [17]:
# Time lazy_isnan() called once per element of `nulls` via a list comprehension.
%timeit [lazy_isnan(val) for val in nulls]

2.3 µs ± 7.59 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [18]:
# Time a single lazy_isnan() call that receives the whole `nulls` list
# (lazy_isnan iterates over list inputs itself).
%timeit lazy_isnan(nulls)

2.05 µs ± 6.66 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [19]:
# Cool! Now try to break it.