# Missing Data

Let's show a few convenient methods to deal with Missing Data in pandas:

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small demo frame with deliberate gaps: column A is missing one value,
# column B is missing two, and column C is complete.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan, 1, 1, 1],
        'B': [5, np.nan, np.nan, 2, 2, 2],
        'C': [1, 2, 3, 1, 2, 3],
    }
)

In [3]:
# Display the frame; the NaN entries in columns A and B are the missing values.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3
3,1.0,2.0,1
4,1.0,2.0,2
5,1.0,2.0,3


In [4]:
# Element-wise missing-value mask: True wherever df holds NaN, False elsewhere.
df.isna()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False
3,False,False,False
4,False,False,False
5,False,False,False


In [5]:
# Summing the boolean mask per column counts the NaNs in each column
# (each True counts as 1).
df.isna().sum()

A    1
B    2
C    0
dtype: int64

You can also use `notna()`, which is simply the opposite of `isna()`.

- `df.isna().any()` returns one boolean per column: `True` if that column contains at least one missing value.
- `df.isna().sum()` returns the number of missing values in each column.

In [6]:
# One boolean per column: True if that column contains at least one NaN.
df.isna().any()

A     True
B     True
C    False
dtype: bool

In [7]:
# NOTE: count() counts the non-null entries of the any() result, i.e. one per
# column (3 here), whether the entry is True or False. To count the columns
# that actually contain missing values, use df.isna().any().sum() instead
# (2 for this frame).
df.isna().any().count()

3

In [8]:
# Number of missing values in each column.
df.isna().sum()

A    1
B    2
C    0
dtype: int64

In [10]:
# Re-display the frame: isna()/any()/sum() only inspect df, they do not modify it.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3
3,1.0,2.0,1
4,1.0,2.0,2
5,1.0,2.0,3


In [15]:
# Boolean mask for a single column: True at the rows where A is NaN.
# (isnull() is an alias of isna().)
df['A'].isnull()

0    False
1    False
2     True
3    False
4    False
5    False
Name: A, dtype: bool

In [14]:
# Number of missing values in column A.
df['A'].isnull().sum()

1

In [16]:
# Boolean mask for column B: True at the rows where B is NaN.
df['B'].isnull()

0    False
1     True
2     True
3    False
4    False
5    False
Name: B, dtype: bool

In [17]:
# Number of missing values in column B.
df['B'].isnull().sum()

2

In [18]:
# Frame-wide NaN mask via isna(); isnull() is an alias and yields the same mask.
df.isna()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False
3,False,False,False
4,False,False,False
5,False,False,False


In [19]:
# isnull() is an alias of isna(): it returns the same True/False mask of NaNs.
df.isnull()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False
3,False,False,False
4,False,False,False
5,False,False,False


In [9]:
# Index labels of the rows where A is missing. With the default RangeIndex
# used here, these labels coincide with the row positions.
df.index[df['A'].isnull()].tolist()

[2]

In [11]:
# Select the complete rows whose A value is missing.
df.loc[df['A'].isnull()]

Unnamed: 0,A,B,C
2,,,3


In [12]:
# Index labels of the rows where B is missing.
df.index[df['B'].isnull()].tolist()

[1, 2]

In [None]:
# Keep only the fully populated rows (drop any row containing a NaN) in a new
# frame; df itself is left untouched.
df5 = df.dropna(axis=0, how='any')
df5

In [20]:
# dropna() returns a new frame, so df itself still contains its NaN entries.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3
3,1.0,2.0,1
4,1.0,2.0,2
5,1.0,2.0,3


In [27]:
# Removing missing data with DataFrame.dropna:
#   axis=0 (the default) drops every row that contains a NaN
#   axis=1 drops every column that contains a NaN
# The result is a new frame; df is not modified.
df.dropna(axis=0)

Unnamed: 0,A,B,C
0,1.0,5.0,1
3,1.0,2.0,1
4,1.0,2.0,2
5,1.0,2.0,3


In [23]:
# Still unchanged: the dropna() result was not assigned back to df, so the NaNs remain.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3
3,1.0,2.0,1
4,1.0,2.0,2
5,1.0,2.0,3


In [32]:
# Mean of A; NaN is skipped by default, so this is (1 + 2 + 1 + 1 + 1) / 5 = 1.2.
df['A'].mean()

1.2

In [38]:
# df is still the original frame, with NaNs in rows 1 and 2.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3
3,1.0,2.0,1
4,1.0,2.0,2
5,1.0,2.0,3


In [39]:
# Mean of the non-missing values of A (NaN is ignored by default).
df['A'].mean()

1.2

In [None]:
# Display the current state of df for reference.
df

In [43]:
# Replace every NaN in the frame with the mean of column A (1.2), rebinding df
# to the filled copy instead of mutating the frame with inplace=True.
# NOTE: a scalar fill value applies to all columns, so the gaps in B also
# receive A's mean. To fill each column with its own mean, use
# df.fillna(df.mean()) instead.
df = df.fillna(value=df['A'].mean())

In [44]:
# After the fill, every NaN in A and B has been replaced by 1.2 (the mean of A).
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,1.2,2
2,1.2,1.2,3
3,1.0,2.0,1
4,1.0,2.0,2
5,1.0,2.0,3


## Filling NA values with the mode

In [47]:
# Recreate the demo frame so columns A and B contain their NaN gaps again.
df = pd.DataFrame(dict(
    A=[1, 2, np.nan, 1, 1, 1],
    B=[5, np.nan, np.nan, 2, 2, 2],
    C=[1, 2, 3, 1, 2, 3],
))

In [48]:
# Confirm the rebuilt frame has NaNs in columns A and B.
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3
3,1.0,2.0,1
4,1.0,2.0,2
5,1.0,2.0,3


In [51]:
# mode() returns a Series holding every most-frequent value. In C the values
# 1, 2 and 3 each occur twice, so all three are returned.
df['C'].mode()

0    1
1    2
2    3
dtype: int64

In [54]:
# Fill every NaN with the first mode of column C. C is multi-modal (1, 2 and 3
# each occur twice), so mode()[0] is just the smallest of them: 1.
# NOTE(review): C has no missing values; the gaps being filled are in A and B.
# To fill each column with its own mode, use df.fillna(df.mode().iloc[0]).
# The result is a new frame; df is not modified.
df.fillna(df['C'].mode()[0])

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,1.0,2
2,1.0,1.0,3
3,1.0,2.0,1
4,1.0,2.0,2
5,1.0,2.0,3


# Great Job!