# Handling Missing Data

In [1]:
import numpy as np 
import pandas as pd

## Missing Data in Pandas

### None as a Sentinel Value

In [6]:
# A Python None among the integers leaves NumPy no native numeric type to use,
# so the resulting array holds generic Python objects.
items_with_none = [1, 2, None, 4]
values1 = np.array(items_with_none)
values1

array([1, 2, None, 4], dtype=object)

In [5]:
# Time the same sum over one million elements stored as native ints and as
# generic Python objects to show the cost of the object dtype.
%timeit np.arange(1E6, dtype=int).sum()
%timeit np.arange(1E6, dtype=object).sum()

1.49 ms ± 12.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
45.2 ms ± 265 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# Shorten tracebacks so that only the final exception line is reported.
%xmode minimal
# Aggregating the object array fails: adding an int to None is undefined.
values1.sum()

Exception reporting mode: Minimal


TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

### NaN: Missing Numerical Data

In [9]:
# np.nan is itself a floating-point value, so the array keeps a native
# numeric dtype instead of falling back to Python objects.
items_with_nan = [1, 2, np.nan, 4]
values2 = np.array(items_with_nan)
values2

array([ 1.,  2., nan,  4.])

In [11]:
# Confirm the NaN-containing array uses a native floating-point dtype.
values2.dtype

dtype('float64')

In [13]:
# NaN is contagious: adding anything to it gives NaN.
1 + np.nan

nan

In [14]:
# Exponentiation with a NaN exponent also gives NaN.
2 ** np.nan

nan

In [15]:
# Even multiplying by zero does not clear a NaN.
0 * np.nan

nan

In [17]:
# Ordinary aggregations propagate the NaN into every result.
values2.sum(), values2.mean(), values2.max()

(nan, nan, nan)

In [19]:
# The nan-prefixed NumPy aggregations skip the missing value instead.
np.nansum(values2), np.nanmean(values2), np.nanmax(values2)

(7.0, 2.3333333333333335, 4.0)

### NaN and None in Pandas

In [20]:
# In a numeric Series both np.nan and None are stored as NaN, and the
# integers are upcast to float64 to make room for it.
pd.Series([1, np.nan, 3, None])

0    1.0
1    NaN
2    3.0
3    NaN
dtype: float64

In [21]:
# With string data the recorded result is an object Series in which np.nan
# and None are each kept as given rather than unified.
pd.Series(["Koushik", np.nan, "Thomas", None])

0    Koushik
1        NaN
2     Thomas
3       None
dtype: object

Pandas handling of NAs by type
| Typeclass | Conversion when storing NAs | NA sentinel value |
|-----------|-----------------------------|-------------------|
| floating  | No change                   | np.nan            |
| object    | No change                   | None or np.nan    |
| integer   | Cast to float64             | np.nan            |
| boolean   | Cast to Object              | None or np.nan.   |

## Pandas Nullable Dtypes

In [25]:
# The capitalized "Int32" selects the nullable integer dtype: np.nan, None
# and pd.NA are all shown as <NA> and the valid entries stay integers.
pd.Series([1, np.nan, 2, None, pd.NA], dtype="Int32")

0       1
1    <NA>
2       2
3    <NA>
4    <NA>
dtype: Int32

## Operating on Null Values

### Detecting Null Values

In [31]:
# Mixed-type sample holding all three missing-value markers
# (np.nan, None and pd.NA) alongside two real values.
mixed_entries = [1, np.nan, 'hello', None, pd.NA]
data = pd.Series(mixed_entries)

In [32]:
# Boolean mask that is True wherever the entry is missing (NaN, None or pd.NA).
data.isnull()

0    False
1     True
2    False
3     True
4     True
dtype: bool

In [33]:
# The inverse mask: True wherever a real value is present.
data.notna()

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [38]:
# Index with the not-null mask to keep only the entries that hold a value.
data[data.notnull()]

0        1
2    hello
dtype: object

### Dropping Null Values

In [39]:
# dropna() removes the missing entries directly and returns the remaining values.
data.dropna()

0        1
2    hello
dtype: object

In [40]:
# Small 3x3 frame with one gap in column 0 and one in column 1;
# column 2 is complete.
rows = [
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
]
df = pd.DataFrame(rows)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [41]:
# By default dropna() discards every row that contains at least one missing value.
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [42]:
# axis=1 works column-wise instead: any column containing a missing value is removed.
df.dropna(axis=1)

Unnamed: 0,2
0,2
1,5
2,6


In [43]:
# axis='columns' is the named spelling of axis=1 and gives the same result.
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [46]:
# Add a column labelled 3 that consists entirely of missing values.
# Note: this modifies df in place, so later cells see the extra column.
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [47]:
# how='all' removes only the columns in which every value is missing.
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [48]:
# how='any' removes every column that has at least one missing value.
df.dropna(axis='columns', how='any')

Unnamed: 0,2
0,2
1,5
2,6


In [54]:
# thresh=3 keeps only the rows that have at least three non-missing values.
df.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


### Filling Null Values

In [55]:
# Labelled nullable-integer series; np.nan and None are both stored as <NA>.
index_labels = list('abcde')
data = pd.Series(
    [1, np.nan, 2, None, 3],
    dtype='Int32',
    index=index_labels,
)
data

a       1
b    <NA>
c       2
d    <NA>
e       3
dtype: Int32

In [56]:
# Replace every missing entry with a single constant value.
data.fillna(0)

a    1
b    0
c    2
d    0
e    3
dtype: Int32

In [58]:
# Forward fill: each missing entry takes the last valid value before it.
# Series.ffill() is used because fillna(method='ffill') is deprecated
# since pandas 2.1 and emits a FutureWarning.
data.ffill()

a    1
b    1
c    2
d    2
e    3
dtype: Int32

In [60]:
# Backward fill: each missing entry takes the next valid value after it.
# Series.bfill() is used because fillna(method='bfill') is deprecated
# since pandas 2.1 and emits a FutureWarning.
data.bfill()

a    1
b    2
c    2
d    3
e    3
dtype: Int32

In [61]:
# Display the current frame, including the all-missing column 3, before filling it.
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [62]:
# Fill each column's gaps with that column's mean. Column 3 has no valid
# values, so its mean is NaN and the column stays empty.
df.fillna(df.mean())

Unnamed: 0,0,1,2,3
0,1.0,3.5,2,
1,2.0,3.0,5,
2,1.5,4.0,6,


In [63]:
# Forward fill down each column (the default axis): a gap takes the value
# from the row above, and leading gaps stay missing.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated
# since pandas 2.1 and emits a FutureWarning.
df.ffill()

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,


In [65]:
# Forward fill along each row: a gap takes the value from the column to its
# left, so a gap in the first column has nothing to copy and stays missing.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated
# since pandas 2.1 and emits a FutureWarning.
df.ffill(axis='columns')

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [66]:
# axis=1 is the numeric spelling of axis='columns': forward fill along each row.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated
# since pandas 2.1 and emits a FutureWarning.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0
