# <b>Chapter 16: Handling Missing Data</b>

## <b>Missing Data in Pandas</b>

### <b>None as a Sentinel Value</b>

In [2]:
import numpy as np
import pandas as pd
# A None entry makes NumPy fall back to an array of generic Python objects.
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [3]:
# Because Python does not support arithmetic operations with None, aggregations like sum or min will generally raise an error:
# vals1.sum() #TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [4]:
%timeit np.arange(1E6, dtype=int).sum()

2.32 ms ± 176 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [5]:
%timeit np.arange(1E6, dtype=object).sum()

88.1 ms ± 9.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### <b>NaN: Missing Numerical Data</b>

In [6]:
# NaN ("Not a Number") is the other missing-data representation. Unlike None, it is a
# special floating-point value recognized by all systems that use the standard IEEE
# floating-point representation, so the array keeps a native float dtype.
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

dtype('float64')

In [7]:
print(vals2)

[ 1. nan  3.  4.]


In [8]:
vals2.sum()

nan

In [9]:
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

In [10]:
u = np.array(['a', 'b', 'c'])
u.dtype
# Output: dtype('<U1') -- each item is a little-endian Unicode string holding at most 1 character.

dtype('<U1')

In [25]:
arr = np.array([1, 2, None])
arr
# Output: array([1, 2, None], dtype=object)

array([1, 2, None], dtype=object)

In [26]:
arr = np.array([1, 2, np.nan])
# arr.dtype #dtype('float64')
arr
# Output: array([ 1.,  2., nan])


array([ 1.,  2., nan])

In [27]:
arr2 = np.array([1.0, 2.0, np.nan])
arr2.dtype
# Output: dtype('float64')

dtype('float64')

In [12]:
type(u)

numpy.ndarray

In [28]:
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

In [33]:
type(np.nan)

float

In [39]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [36]:
# Start from an integer-typed Series and print it before it is modified.
x= pd.Series(range(2), dtype=int)
print(x)
# Assign None in place: the recorded output shows the Series upcast to float64,
# with NaN stored in the slot that received None.
x[0]=None
x

0    0
1    1
dtype: int32


0    NaN
1    1.0
dtype: float64

## <b>Pandas Nullable Dtypes</b>

In [51]:
np.array([1, np.nan, 2, None])

array([1, nan, 2, None], dtype=object)

In [47]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [45]:
pd.Series([1, np.nan, 2, None, pd.NA])

0       1
1     NaN
2       2
3    None
4    <NA>
dtype: object

In [43]:
pd.Series([1, np.nan, 2, None], dtype='Int32')

0       1
1    <NA>
2       2
3    <NA>
dtype: Int32

In [42]:
pd.Series([1, np.nan, 2, None, pd.NA], dtype='Int32')

0       1
1    <NA>
2       2
3    <NA>
4    <NA>
dtype: Int32

## <b>Operating on Null Values</b>

In [52]:
# Quick reference for the pandas null-handling methods used in the rest of this section.
# NOTE(review): this is a bare string literal, so the cell simply echoes its repr;
# a markdown cell would render the list more readably.
"""
isnull()
Generate a Boolean mask indicating missing values
notnull()
Opposite of isnull()
dropna()
Return a filtered version of the data
fillna()
Return a copy of the data with missing values filled or imputed
"""

'\nisnull()\nGenerate a Boolean mask indicating missing values\nnotnull()\nOpposite of isnull()\ndropna()\nReturn a filtered version of the data\nfillna()\nReturn a copy of the data with missing values filled or imputed\n'

### <b>Detecting Null Values</b>

In [69]:
# Mixed-type Series holding both missing markers (NaN and None); pandas stores it
# with dtype object:
#   0        1
#   1      NaN
#   2    hello
#   3     None
data = pd.Series([1, np.nan, 'hello', None])

# Boolean mask that is True wherever a value is missing.
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [60]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [61]:
data[data.notnull()]

0        1
2    hello
dtype: object

### <b>Dropping Null Values</b>

In [62]:
data.dropna()

0        1
2    hello
dtype: object

In [87]:
# 3x3 frame whose missing entries use two different markers: pd.NA and np.nan.
df = pd.DataFrame(
    data=[
        [1, pd.NA, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ],
    columns=list('abc'),
)
df

Unnamed: 0,a,b,c
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [94]:
j = df.iloc[0,1]
type(j)

pandas._libs.missing.NAType

In [95]:
# Same layout using np.nan only and default integer column labels;
# the cell displays the dtype pandas inferred for each column.
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df.dtypes

0    float64
1    float64
2      int64
dtype: object

In [98]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [96]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [97]:
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [101]:
# Add a column labelled 3 in which every value is NaN.
# NOTE(review): this mutates df in place, so every later cell that uses df sees the extra column.
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [108]:
df.isnull()

Unnamed: 0,0,1,2,3
0,False,True,False,True
1,False,False,False,True
2,True,False,False,True


In [102]:
df[2]

0    2
1    5
2    6
Name: 2, dtype: int64

In [104]:
df.iloc[:, 2]

0    2
1    5
2    6
Name: 2, dtype: int64

In [105]:
# Drop only those columns in which every value is missing.
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [106]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [107]:
df.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


### <b>Filling Null Values</b>

In [116]:
# Nullable-integer Series: with dtype 'Int32', both np.nan and None are stored as <NA>.
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=list('abcde'),
    dtype='Int32',
)
data

a       1
b    <NA>
c       2
d    <NA>
e       3
dtype: Int32

In [114]:
# data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'), dtype='Int32')
# data


# a       1
# b    <NA>
# c       2
# d    <NA>
# e       3
# dtype: Int32

a       1
b    <NA>
c       2
d    <NA>
e       3
dtype: Int32

In [113]:
# data = pd.Series([1, pd.NA, 2, pd.NA, 3], index=list('abcde'), dtype='Int32')
# data


# a       1
# b    <NA>
# c       2
# d    <NA>
# e       3
# dtype: Int32

a       1
b    <NA>
c       2
d    <NA>
e       3
dtype: Int32

In [117]:
data

a       1
b    <NA>
c       2
d    <NA>
e       3
dtype: Int32

In [118]:
data.fillna(0)

a    1
b    0
c    2
d    0
e    3
dtype: Int32

In [120]:
# Forward fill: each missing entry takes the last valid value before it.
# Series.ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
data.ffill()

  data.fillna(method='ffill') # forward fill


a    1
b    1
c    2
d    2
e    3
dtype: Int32

In [121]:
# Backward fill: each missing entry takes the next valid value after it.
# Series.bfill() replaces fillna(method='bfill'), whose `method` keyword is deprecated.
data.bfill()

  data.fillna(method='bfill')  # backword filling


a    1
b    2
c    2
d    3
e    3
dtype: Int32

In [122]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [123]:
# Forward-fill along each row (axis=1): a gap takes the value from the column to its left,
# so a gap in the first column has nothing to copy and stays missing.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
df.ffill(axis=1)

  df.fillna(method='ffill', axis=1)


Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [124]:
# Forward-fill down each column (axis=0): a gap takes the value from the row above,
# so a gap in the first row (or a column with no valid values) stays missing.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is deprecated.
df.ffill(axis=0)

  df.fillna(method='ffill', axis=0)


Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,
