In [1]:
# Handling missing data, one of the main concerns in any data analysis workflow
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

In [2]:
# A float Series whose entry at position 2 is missing (NaN).
series_obj = Series(data=np.array([4, 5.3, np.nan, -3.21]))
series_obj

0    4.00
1    5.30
2     NaN
3   -3.21
dtype: float64

In [8]:
# A small frame with NaN scattered over all three columns.
frame_obj = DataFrame(
    data={
        'data1': [3, 4.1, np.nan, -3],
        'data2': [np.nan, 5.4, np.nan, -34.53],
        'data3': [np.nan, np.nan, 14.3, 0],
    }
)
frame_obj

Unnamed: 0,data1,data2,data3
0,3.0,,
1,4.1,5.4,
2,,,14.3
3,-3.0,-34.53,0.0


In [9]:
# NaN is the marker pandas uses for missing data in Series and DataFrame objects.
# Python's None is also treated as missing: the value assigned here is displayed as NaN below.
frame_obj.loc[2,'data3'] = None
frame_obj

Unnamed: 0,data1,data2,data3
0,3.0,,
1,4.1,5.4,
2,,,
3,-3.0,-34.53,0.0


In [31]:
# The main operations for filtering and filling missing data are shown below.

# Filtering missing data: dropna
series_obj.dropna() # Returns the series without its NaN entries (series_obj itself is not modified)

0    4.00
1    5.30
3   -3.21
dtype: float64

In [32]:
# The same result via boolean indexing: keep only the entries that are not missing.
series_obj.loc[series_obj.notna()]

0    4.00
1    5.30
3   -3.21
dtype: float64

In [33]:
# On a DataFrame, dropna() with no arguments drops every row that contains at least one NaN.
# NOTE(review): the saved output is empty, which implies row 3 already held a NaN when this
# cell ran; in a top-to-bottom run row 3 is still complete at this point and would be kept.
frame_obj.dropna()

Unnamed: 0,data1,data2,data3


In [34]:
# With how='all', only the rows in which every value is NaN are removed.
# This avoids throwing away partially filled rows, which plain dropna() would discard.
frame_obj.dropna(how='all')

Unnamed: 0,data1,data2,data3
0,3.0,,
1,4.1,5.4,
3,-3.0,-34.53,


In [35]:
# dropna returned new frames above; frame_obj itself still has all four rows.
frame_obj

Unnamed: 0,data1,data2,data3
0,3.0,,
1,4.1,5.4,
2,,,
3,-3.0,-34.53,


In [36]:
# Passing axis=1 to dropna applies the operation to columns instead of rows.
# To prepare for that, make column data3 entirely NaN by blanking its row-3 value.
frame_obj.loc[3,'data3'] = np.nan
frame_obj

Unnamed: 0,data1,data2,data3
0,3.0,,
1,4.1,5.4,
2,,,
3,-3.0,-34.53,


In [37]:
# axis='columns' (the same as axis=1) makes dropna work on columns;
# combined with how='all', only columns that are entirely NaN are removed.
frame_obj.dropna(how='all', axis='columns')

Unnamed: 0,data1,data2
0,3.0,
1,4.1,5.4
2,,
3,-3.0,-34.53


In [38]:
# Build a 7x3 frame of standard-normal noise to practise on (integer-indexed,
# standing in for a series of observations).
# The generator is seeded so the values are identical after every Restart & Run All.
time_df = DataFrame(np.random.default_rng(seed=38).standard_normal((7, 3)))
time_df

Unnamed: 0,0,1,2
0,0.12869,-0.445732,0.834487
1,1.559486,-0.131278,-1.099088
2,0.257964,1.000837,0.772863
3,-1.526271,-0.798682,-1.468003
4,0.697468,0.175039,-0.51978
5,-0.767581,-1.765746,0.731645
6,0.329126,-0.994667,-1.071769


In [39]:
# Now let's assume that some of the values in the dataframe are missing.
# .loc slices by label and includes the end label, so rows 0-4 of column 1
# and rows 0-2 of column 2 are set to NaN.
time_df.loc[:4,1] = np.nan
time_df.loc[:2,2] = np.nan
time_df

Unnamed: 0,0,1,2
0,0.12869,,
1,1.559486,,
2,0.257964,,
3,-1.526271,,-1.468003
4,0.697468,,-0.51978
5,-0.767581,-1.765746,0.731645
6,0.329126,-0.994667,-1.071769


In [40]:
# thresh keeps only the rows that have at least that many non-NaN values.
# Here we keep the rows that contain at least 2 observations.
time_df.dropna(thresh=2)

Unnamed: 0,0,1,2
3,-1.526271,,-1.468003
4,0.697468,,-0.51978
5,-0.767581,-1.765746,0.731645
6,0.329126,-0.994667,-1.071769


In [42]:
# Filling missing data: fillna
# A scalar replaces every NaN in the frame; here each one becomes 0.
time_df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.12869,0.0,0.0
1,1.559486,0.0,0.0
2,0.257964,0.0,0.0
3,-1.526271,0.0,-1.468003
4,0.697468,0.0,-0.51978
5,-0.767581,-1.765746,0.731645
6,0.329126,-0.994667,-1.071769


In [43]:
# A dict passed to fillna maps column label -> fill value,
# so each column can receive its own replacement.
time_df.fillna(value={1: 0.5, 2: -1})

Unnamed: 0,0,1,2
0,0.12869,0.5,-1.0
1,1.559486,0.5,-1.0
2,0.257964,0.5,-1.0
3,-1.526271,0.5,-1.468003
4,0.697468,0.5,-0.51978
5,-0.767581,-1.765746,0.731645
6,0.329126,-0.994667,-1.071769


In [44]:
# By default fillna returns a new object and leaves the original untouched.
# Passing inplace=True modifies the existing object instead, so nothing is displayed here.
time_df.fillna({1:0}, inplace=True)

In [45]:
# time_df itself now has column 1 filled; column 2 was not in the dict and keeps its NaNs.
time_df

Unnamed: 0,0,1,2
0,0.12869,0.0,
1,1.559486,0.0,
2,0.257964,0.0,
3,-1.526271,0.0,-1.468003
4,0.697468,0.0,-0.51978
5,-0.767581,-1.765746,0.731645
6,0.329126,-0.994667,-1.071769


In [51]:
# Missing values can also be filled by propagating neighbouring values
# (forward fill / backward fill). Build a fresh 6x3 frame to demonstrate.
# The generator is seeded so the values are identical after every Restart & Run All.
frame_obj2 = DataFrame(np.random.default_rng(seed=51).standard_normal((6, 3)))
frame_obj2

Unnamed: 0,0,1,2
0,-2.434933,1.86709,0.03603
1,-1.23634,0.129464,0.341439
2,-0.90784,0.211344,1.116818
3,1.280054,0.494658,0.730609
4,-0.198626,0.819632,1.37628
5,0.190149,-0.931594,-0.422764


In [52]:
# Blank out rows 3 onwards in column 1 and rows 0-4 in column 2
# (.loc label slices include both end points).
frame_obj2.loc[3:,1] = np.nan
frame_obj2.loc[:4,2] = np.nan
frame_obj2

Unnamed: 0,0,1,2
0,-2.434933,1.86709,
1,-1.23634,0.129464,
2,-0.90784,0.211344,
3,1.280054,,
4,-0.198626,,
5,0.190149,,-0.422764


In [53]:
# Forward fill: carry the last valid value in each column down into the NaNs below it.
# Leading NaNs (nothing above them) stay NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill'), whose `method` keyword is deprecated.
frame_obj2.ffill()

Unnamed: 0,0,1,2
0,-2.434933,1.86709,
1,-1.23634,0.129464,
2,-0.90784,0.211344,
3,1.280054,0.211344,
4,-0.198626,0.211344,
5,0.190149,0.211344,-0.422764


In [55]:
# Backward fill: pull the next valid value in each column up into the NaNs above it.
# Trailing NaNs (nothing below them) stay NaN.
# DataFrame.bfill() is used instead of fillna(method='bfill'), whose `method` keyword is deprecated.
frame_obj2.bfill()

Unnamed: 0,0,1,2
0,-2.434933,1.86709,-0.422764
1,-1.23634,0.129464,-0.422764
2,-0.90784,0.211344,-0.422764
3,1.280054,,-0.422764
4,-0.198626,,-0.422764
5,0.190149,,-0.422764


In [63]:
# limit caps how many consecutive NaNs are filled in each gap:
# with limit=3, only the three NaNs closest to the valid value are replaced.
# DataFrame.bfill() is used instead of fillna(method='bfill'), whose `method` keyword is deprecated.
frame_obj2.bfill(limit=3)

Unnamed: 0,0,1,2
0,-2.434933,1.86709,
1,-1.23634,0.129464,
2,-0.90784,0.211344,-0.422764
3,1.280054,,-0.422764
4,-0.198626,,-0.422764
5,0.190149,,-0.422764


In [64]:
# The fill value passed to fillna can also be computed from the data itself.
# Suppose we want to fill the missing entries with the mean value of the series.

series_obj

0    4.00
1    5.30
2     NaN
3   -3.21
dtype: float64

In [65]:
# mean() ignores the NaN entry, so the gap is filled with the average of the observed values.
series_obj.fillna(value=series_obj.mean())

0    4.00
1    5.30
2    2.03
3   -3.21
dtype: float64