## Working with Missing Data in Pandas

In [1]:
import numpy as np
import pandas as pd

from pandas import DataFrame

### Filling missing values using fillna(), replace() and interpolate()

In [2]:
# Raw records for the demo frame: four people with an age, gender and rank.
data = {'names': ['Steve', 'mary', 'joseph', 'Gabriel'],
        'age': [10, 20, 30, 40],
        'gender': ['male', 'female', 'male', 'male'],
        'rank': [2, 5, 1, 3]
    }

# Cast the integer columns to float up front: NaN is a float, so assigning it
# into an int64 column would otherwise depend on implicit upcasting.
ranking_df = DataFrame(data).astype({'age': 'float64', 'rank': 'float64'})

# Blank out 'age' (column position 1) from row 2 to the end of the frame.
ranking_df.iloc[2:, 1] = np.nan
# Set every column of the last row to NaN.
ranking_df.iloc[3, :] = np.nan

ranking_df


Unnamed: 0,names,age,gender,rank
0,Steve,10.0,male,2.0
1,mary,20.0,female,5.0
2,joseph,,male,1.0
3,,,,


`isnull()` returns True for each missing value, and `notnull()` returns True for each value that is not missing.

In [3]:
# Element-wise mask of the frame: True where a cell is missing, False otherwise.
ranking_df.isna()

Unnamed: 0,names,age,gender,rank
0,False,False,False,False
1,False,False,False,False
2,False,True,False,False
3,True,True,True,True


In [4]:
# Element-wise mask of the frame: True where a cell holds a value, False where it is missing.
ranking_df.notna()

Unnamed: 0,names,age,gender,rank
0,True,True,True,True
1,True,True,True,True
2,True,False,True,True
3,False,False,False,False


Use a boolean mask to show only the rows where the `age` column is missing.

In [5]:
# Boolean Series aligned with the frame's index: True where 'age' is missing.
bool_series = ranking_df['age'].isna()
# Keep only the rows flagged by the mask.
ranking_df.loc[bool_series]

Unnamed: 0,names,age,gender,rank
2,joseph,,male,1.0
3,,,,


Fill every missing value with a constant (here `0`).

In [6]:
# Display a copy with every missing cell replaced by 0; ranking_df itself is not modified.
ranking_df.fillna(value=0)

Unnamed: 0,names,age,gender,rank
0,Steve,10.0,male,2.0
1,mary,20.0,female,5.0
2,joseph,0.0,male,1.0
3,0,0.0,0,0.0


Replace each missing value with the value from the previous row (forward fill).

In [12]:
# Forward fill: each missing cell takes the last valid value above it in the
# same column.
# ffill() without inplace returns a new frame, which is what gets bound to the
# name. Do not combine the assignment with inplace=True: an in-place call can
# return None, which would leave ranking_df bound to None.
ranking_df = ranking_df.ffill()

ranking_df

Unnamed: 0,names,age,gender,rank
0,Steve,10.0,male,2.0
1,mary,20.0,female,5.0
2,joseph,20.0,male,1.0
3,joseph,20.0,male,1.0


Fill each missing value with the value from the next row (backward fill).

In [13]:
# Backward fill: each missing cell takes the next valid value below it in the
# same column.
# bfill() without inplace returns a new frame, which is what gets bound to the
# name. Do not combine the assignment with inplace=True: an in-place call can
# return None, which would leave ranking_df bound to None.
ranking_df = ranking_df.bfill()

ranking_df

Unnamed: 0,names,age,gender,rank
0,Steve,10.0,male,2.0
1,mary,20.0,female,5.0
2,joseph,20.0,male,1.0
3,joseph,20.0,male,1.0


Interpolate missing values in the numeric columns only.

In [16]:
# Labels of the columns whose dtype is numeric.
numeric_cols = ranking_df.select_dtypes(include='number').columns

# Linearly interpolate just those columns and write the result back in place
# of the originals; non-numeric columns are left untouched.
ranking_df[numeric_cols] = ranking_df.loc[:, numeric_cols].interpolate(method='linear')


Drop the rows that contain at least one missing value.

In [17]:
# Display a copy without the rows that have at least one missing value
# (axis=0 and how='any' are the defaults, spelled out here for clarity).
ranking_df.dropna(axis=0, how='any')

Unnamed: 0,names,age,gender,rank
0,Steve,10.0,male,2.0
1,mary,20.0,female,5.0
2,joseph,20.0,male,1.0
3,joseph,20.0,male,1.0


Drop only the rows in which every value is missing, using `how='all'`.

In [18]:
# Display a copy without the rows whose values are all missing; rows with at
# least one valid value are kept (axis=0 is the default, spelled out here).
ranking_df.dropna(axis=0, how='all')

Unnamed: 0,names,age,gender,rank
0,Steve,10.0,male,2.0
1,mary,20.0,female,5.0
2,joseph,20.0,male,1.0
3,joseph,20.0,male,1.0
