In [1]:
# 02-Missing-Data-01

In [3]:
# The way in which pandas handles missing values is constrained by its reliance on 
# the NumPy package, which does not have a built-in notion of NA values for 
# non-floating-point datatypes.

# Pandas chose to use sentinels for missing data, and further chose to use two already
# existing Python null values: the special floating-point NaN value, and the Python None object.


In [5]:
# Pythonic missing data

# The first sentinel value used by Pandas is None, a Python singleton object that is often used for missing data in Python code.
# Because it is a Python object, None cannot be used in any arbitrary NumPy/Pandas array,
# but only in arrays with data type 'object' (i.e. arrays of Python objects).

In [7]:
x = [1,2,None,4,5]
x

[1, 2, None, 4, 5]

In [8]:
x[0]

1

In [9]:
x[2]

In [10]:
x = ['Apple',None,'Mango','None']
x

['Apple', None, 'Mango', 'None']

In [11]:
x[1]

In [12]:
x[3]

'None'

In [13]:
#-----------------------
import numpy as np
import pandas as pd
# A list containing None yields an array whose dtype is `object`
# (see the printed dtype below).
x1 = np.array([1, 2, None, 4])
print(x1, x1.dtype, sep="\n")

[1 2 None 4]
object


In [15]:
# Missing numerical data
# The other missing data representation, NaN (acronym for Not a Number), is different;
# it is a special floating-point value recognized by all systems that use 
# the standard IEEE floating-point representation.

In [16]:
# With np.nan as the missing marker the array gets a floating-point dtype
# (see the printed dtype below).
x2 = np.array([1, 2, np.nan, 4])
print(x2, x2.dtype, sep="\n")

[ 1.  2. nan  4.]
float64


In [17]:
# Notice that NumPy chose a native floating-point type for this array:
# this means that unlike the object array from before,
# this array supports fast operations pushed into compiled code.
# You should be aware that NaN is a bit like a data virus: it infects any other object it touches.

# Regardless of the operation, the result of arithmetic with NaN will be another NaN:


In [18]:
print(1+np.nan)

nan


In [19]:
print(0*np.nan)

nan


In [20]:
print(x2.sum())

nan


In [21]:
print(x2.min())

nan


In [22]:
print(x2.max())

nan


In [23]:
# Numpy does provide some special aggregations that will ignore these missing values:

In [24]:
print(np.nansum(x2))

7.0


In [25]:
print(np.nanmin(x2))

1.0


In [26]:
print(np.nanmax(x2))

4.0


In [None]:
# Note : NaN is specifically a floating point value, there is no equivalent 
# NaN value for integers, strings, or other types.

In [27]:
# NaN and None in Pandas
# NaN and None both have their place, and Pandas is built to handle the two of
# them nearly interchangeably, converting between them where appropriate:

In [28]:
# Both np.nan and None appear as NaN in the resulting float64 Series.
x3 = pd.Series(data=[1, np.nan, 2, None])
print(x3)

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64


In [29]:
# For types that don't have an available sentinel value, 
# Pandas automatically type-casts when NA values are present.

# For example, if we set a value in an integer array to np.nan, 
# it will automatically be upcast to a floating-point type to accommodate the NA:


In [30]:
# Two-element integer Series (values 0 and 1) used for the upcasting demo.
x4 = pd.Series(data=range(2), dtype=int)
x4

0    0
1    1
dtype: int32

In [31]:
# Assign None into the integer Series: the displayed result shows the Series
# as float64, with the None stored as NaN.
# NOTE(review): newer pandas releases may emit a warning for this implicit
# upcast on assignment — confirm against the installed version.
x4[0] = None
x4

0    NaN
1    1.0
dtype: float64

In [32]:
# Notice that in addition to casting the integer array to floating point, 
# Pandas automatically converts the None to a NaN value.

In [33]:
# -----------------------------------------
#  Operating on Null values
#------------------------------------------
# As we have seen, Pandas treats None and NaN as essentially interchangeable
# for indicating missing or null values. 
# To facilitate this convention, there are several useful methods for detecting,
# removing, and replacing null values in Pandas data structures.

# They are :
# isnull() : Generate a boolean mask indicating missing values
# notnull(): opposite of isnull()
# dropna(): Return a filtered version of the data
# fillna(): Return a copy of the data with missing values filled or imputed

In [34]:
# Detecting null values
# Pandas data structures have two useful methods for detecting null data: isnull() and notnull().
# Either one will return a Boolean mask over the data.
# For example :

# Object Series holding an int, a NaN, a string and a None.
x = pd.Series(data=[1, np.nan, "hello", None])
print(x.isnull())  # True where the entry is NaN or None

0    False
1     True
2    False
3     True
dtype: bool


In [35]:
print(x[x.notnull()]) # displays non null values

0        1
2    hello
dtype: object


In [36]:
# Dropping null values
# In addition to the masking used before, there are the convenience methods
# dropna() (which removes NA values) and
# fillna() (which fills in NA values).

In [37]:
# For a Series, the result is straightforward:
# Same mixed-type Series as before; dropna() keeps only the non-null entries.
x = pd.Series(data=[1, np.nan, "hello", None])
print(x.dropna())

0        1
2    hello
dtype: object


In [40]:
# For a DataFrame, there are more options.
# Consider the following DataFrame:

# 3x3 frame with one NaN in column 0 and one NaN in column 1.
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [41]:
# We cannot drop single values from a DataFrame;
# We can only drop full rows or full columns.
# Depending on the application, you might want one or the other,
# so dropna() gives a number of options for a DataFrame.

In [42]:
# By default, dropna() will drop all rows in which any null value is present
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [43]:
# Alternatively, we can drop NA values along a different axis;
# axis=1 (or axis='columns') drops all columns containing a null value:

print(df.dropna(axis='columns'))

   2
0  2
1  5
2  6


In [45]:
# But this drops some good data as well; 
# you might rather be interested in dropping rows or columns with all NA values,
# or a majority of NA values.
# This can be specified through the how or thresh parameters, 
# which allow fine control of the number of nulls to allow through.
# The default is how='any', such that any row or column (depending on the axis keyword)
# containing a null value will be dropped.
# we can also specify how='all', which will only drop rows/columns that are all null values:

df[3] = np.nan
print(df)

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN


In [46]:
print(df.dropna(axis='columns',how='all'))

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


In [47]:
print(df.dropna(axis='columns', how='any'))

   2
0  2
1  5
2  6


In [50]:
# For finer-grained control, the thresh parameter lets you specify a minimum number of 
# non-null values for the row/column to be kept.
# axis='index' is the documented spelling for dropping rows.

print(df.dropna(axis='index', thresh=3))

# Here the first and last rows have been dropped,
# because they contain only two non-null values.

     0    1  2   3
1  2.0  3.0  5 NaN


In [51]:
# Keep rows with at least two non-null values; axis='index' is the
# documented spelling for dropping rows.
print(df.dropna(axis='index', thresh=2))

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN


In [52]:
print(df.dropna(axis='columns', thresh=2))

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


In [53]:
# Filling null values
# Sometimes rather than dropping NA values, you'd rather replace them with a valid value.
# This Value might be a single number like zero, 
# or it might be some sort of imputation or interpolation from the good values.
# You could do this in place using the isnull() method as a mask, 
# but because it is such a common operation Pandas provides the fillna() method, 
# which returns a copy of the array with the null values replaced.

# Letter-indexed Series with missing entries at 'b' (NaN) and 'd' (None).
data = pd.Series(data=[1, np.nan, 2, None, 3], index=list('abcde'))
print(data)

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64


In [60]:
# We can fill NA entries with a single value, such as zero:
print(data.fillna(0))

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64


In [62]:
# We can specify a forward-fill to propagate the previous value forward.
# Series.ffill() is used because fillna(method='ffill') is deprecated in
# recent pandas releases; the result is the same.
print(data.ffill())

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64


In [63]:
# Or we can specify a back-fill to propagate the next values backward.
# Series.bfill() is used because fillna(method='bfill') is deprecated in
# recent pandas releases; the result is the same.
print(data.bfill())

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64


In [64]:
# For DataFrames, the options are similar, but we can also specify an axis 
# along which the fills take place:

# Rebuild the 3x3 example frame, then append a column (label 3) that is all NaN.
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df[3] = np.nan
print(df)

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN


In [67]:
# Forward-fill along each row (axis=1).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
print(df.ffill(axis=1))

# Notice that if a previous value is not available during a forward fill,
# the NA value remains the same.

     0    1    2    3
0  1.0  1.0  2.0  2.0
1  2.0  3.0  5.0  5.0
2  NaN  4.0  6.0  6.0


In [68]:
# Back-fill along each row (axis=1).
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
print(df.bfill(axis=1))

     0    1    2   3
0  1.0  2.0  2.0 NaN
1  2.0  3.0  5.0 NaN
2  4.0  4.0  6.0 NaN


In [69]:
# Forward-fill down each column (axis=0).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
print(df.ffill(axis=0))

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  2.0  4.0  6 NaN


In [70]:
# Back-fill up each column (axis=0).
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
print(df.bfill(axis=0))

     0    1  2   3
0  1.0  3.0  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN
