In [1]:
import numpy as np
import pandas as pd

**Finding Missing Data**

In [2]:
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [1, 2, 3, 4, 5],
    'C': [1, 2, 3, np.nan, np.nan],
    'D': [1, np.nan, np.nan, np.nan, 5]
}
df = pd.DataFrame(data)

In [3]:
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [4]:
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,True
2,True,False,False,True
3,False,False,True,True
4,False,False,True,False


In [5]:
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,True
2,True,False,False,True
3,False,False,True,True
4,False,False,True,False


In [6]:
df.notnull()

Unnamed: 0,A,B,C,D
0,True,True,True,True
1,True,True,True,False
2,False,True,True,False
3,True,True,False,False
4,True,True,False,True


In [7]:
df.isna().sum()

A    1
B    0
C    2
D    3
dtype: int64

In [8]:
df.isnull().sum().sum()

np.int64(6)

In [9]:
df.isna().any()

A     True
B    False
C     True
D     True
dtype: bool

In [10]:
df.isna().any(axis=1)

0    False
1     True
2     True
3     True
4     True
dtype: bool

In [11]:
df[df.isna().any(axis=1)]

Unnamed: 0,A,B,C,D
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [12]:
df.notna().all()

A    False
B     True
C    False
D    False
dtype: bool

In [13]:
df.notna().all(axis=0)

A    False
B     True
C    False
D    False
dtype: bool

In [14]:
df[df.notna().any(axis=1)]

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [15]:
df.notna().all(axis=1)

0     True
1    False
2    False
3    False
4    False
dtype: bool

In [16]:
df[df.notna().all(axis=1)]

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0


**Removing Missing Data**

In [17]:
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [18]:
df2=df
df2

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [19]:
df2.dropna()

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0


In [20]:
df2

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [21]:
df2.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [22]:
df2.dropna(axis=1)

Unnamed: 0,B
0,1
1,2
2,3
3,4
4,5


In [23]:
#df2.dropna(inplace=True)

In [24]:
df2.dropna(thresh=1)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [25]:
df2.dropna(thresh=4,axis=1)

Unnamed: 0,A,B
0,1.0,1
1,2.0,2
2,,3
3,4.0,4
4,5.0,5


In [26]:
df2.dropna(thresh=3,axis=0)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
4,5.0,5,,5.0


**Filling the missing Data**

In [27]:
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [28]:
df.fillna(67)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,67.0
2,67.0,3,3.0,67.0
3,4.0,4,67.0,67.0
4,5.0,5,67.0,5.0


In [29]:
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [30]:
df.fillna(method='ffill')

  df.fillna(method='ffill')


Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,1.0
2,2.0,3,3.0,1.0
3,4.0,4,3.0,1.0
4,5.0,5,3.0,5.0


In [31]:
df.fillna(method='ffill',axis=0)

  df.fillna(method='ffill',axis=0)


Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,1.0
2,2.0,3,3.0,1.0
3,4.0,4,3.0,1.0
4,5.0,5,3.0,5.0


In [32]:
df.fillna(method='ffill',axis=1)

  df.fillna(method='ffill',axis=1)


Unnamed: 0,A,B,C,D
0,1.0,1.0,1.0,1.0
1,2.0,2.0,2.0,2.0
2,,3.0,3.0,3.0
3,4.0,4.0,4.0,4.0
4,5.0,5.0,5.0,5.0


In [33]:
df.fillna(method='bfill',axis=0)

  df.fillna(method='bfill',axis=0)


Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,5.0
2,4.0,3,3.0,5.0
3,4.0,4,,5.0
4,5.0,5,,5.0


In [34]:
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [35]:
df.fillna(method='bfill',axis=1)

  df.fillna(method='bfill',axis=1)


Unnamed: 0,A,B,C,D
0,1.0,1.0,1.0,1.0
1,2.0,2.0,2.0,
2,3.0,3.0,3.0,
3,4.0,4.0,,
4,5.0,5.0,5.0,5.0


In [36]:
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [37]:
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,3.0
2,3.0,3,3.0,3.0
3,4.0,4,2.0,3.0
4,5.0,5,2.0,5.0


In [38]:
df["B"].fillna(df["B"].median())

0    1
1    2
2    3
3    4
4    5
Name: B, dtype: int64

In [39]:
df["A"].fillna(df["C"].mean())

0    1.0
1    2.0
2    2.0
3    4.0
4    5.0
Name: A, dtype: float64

In [40]:
df.fillna(df.mode())

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,5.0
2,4.0,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [43]:
datafr = {
    'A': [1, 2, np.nan, 4, 6],
    'B': [1, 2, 4, 4, 5],
    'C': [1, 9, 3, np.nan, np.nan],
    'D': [1, np.nan, np.nan, np.nan, 5]
}
df3 = pd.DataFrame(datafr)
df3

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,9.0,
2,,4,3.0,
3,4.0,4,,
4,6.0,5,,5.0


In [44]:
df3.fillna(df3.mean())

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,9.0,3.0
2,3.25,4,3.0,3.0
3,4.0,4,4.333333,3.0
4,6.0,5,4.333333,5.0


In [46]:
df3['C'].fillna(df3["B"].median())

0    1.0
1    9.0
2    3.0
3    4.0
4    4.0
Name: C, dtype: float64

In [47]:
df3.fillna(df3.mean().iloc[0])

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,9.0,3.25
2,3.25,4,3.0,3.25
3,4.0,4,3.25,3.25
4,6.0,5,3.25,5.0


In [48]:
value={"A":890,"B":678,"C":34,"D":675}
df2.fillna(value=value)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,675.0
2,890.0,3,3.0,675.0
3,4.0,4,34.0,675.0
4,5.0,5,34.0,5.0
