In [1]:
import numpy as np
import pandas as pd
# np.nan is treated as a missing value, so this evaluates to True.
pd.isnull(np.nan)

True

In [2]:
# None is reported as missing as well.
pd.isnull(None)

True

In [3]:
# pd.isna gives the same answer as pd.isnull for np.nan (True).
pd.isna(np.nan)

True

In [4]:
# pd.isna also reports None as missing.
pd.isna(None)

True

In [5]:
# pd.notnull is the opposite check: None is missing, so the result is False.
pd.notnull(None)

False

In [6]:
# np.nan is missing too, so pd.notnull returns False.
pd.notnull(np.nan)

False

In [7]:
# An ordinary number is not missing, so pd.notnull returns True.
pd.notnull(3)

True

In [8]:
# On a Series the check is applied element by element and yields a boolean Series.
pd.isnull(pd.Series([1,np.nan,7]))

0    False
1     True
2    False
dtype: bool

In [9]:
# Element-wise notnull on the same values: True where a value is present.
pd.notnull(pd.Series([1,np.nan,7]))

0     True
1    False
2     True
dtype: bool

In [10]:
# Element-wise missing-value mask for a whole DataFrame:
# the result has the same shape, with True wherever a cell is NaN.
pd.isnull(
    pd.DataFrame(
        {
            "Column A": [1, np.nan, 7],
            "Column B": [np.nan, 2, 3],
            "Column C": [np.nan, 2, np.nan],
        }
    )
)

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


## Filtering Missing Data Values

In [11]:
# Sample series with two missing entries (positions 3 and 4), reused by the following cells.
s = pd.Series([1, 2, 3, np.nan, np.nan, 7])

In [12]:
# Boolean mask: True where s holds a value, False where it is NaN.
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [13]:
# Summing the boolean mask counts its True entries, i.e. the non-null values (4 here).
pd.notnull(s).sum()

4

In [14]:
# Boolean indexing with the mask keeps only the non-null entries.
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    7.0
dtype: float64

In [15]:
# Number of missing values: the sum of the isnull mask (2 here).
pd.isnull(s).sum()

2

In [16]:
# Same filter, written with the Series method instead of the top-level function.
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    7.0
dtype: float64

## Dropping null values

In [17]:
# Show the series; it still contains the two NaN entries.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    7.0
dtype: float64

In [18]:
# dropna() returns the series without its NaN entries; the remaining index labels are kept (0, 1, 2, 5).
s.dropna()

0    1.0
1    2.0
2    3.0
5    7.0
dtype: float64

## Dropping null values on DataFrames

In [19]:
# Demo frame with a different number of missing values per column
# (A: two, B: one, C: one, D: none).
df = pd.DataFrame(
    {
        "Column A": [1, np.nan, 30, np.nan],
        "Column B": [2, 8, 3, np.nan],
        "Column C": [np.nan, 9, 32, 100],
        "Column D": [5, 8, 34, 110],
    }
)

In [20]:
# Display the frame built in the previous cell.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,3.0,32.0,34
3,,,100.0,110


In [21]:
# Element-wise mask of the frame: True marks a missing cell.
df.isnull()

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [22]:
# Summing the mask gives the number of missing values per column.
df.isnull().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [23]:
# info() lists the non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
Column A    2 non-null float64
Column B    3 non-null float64
Column C    3 non-null float64
Column D    4 non-null int64
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [24]:
# With no arguments, every row containing at least one missing value is dropped; only row 2 survives.
df.dropna()

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,3.0,32.0,34


In [25]:
# dropna() returned a new frame: df itself is unchanged.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,3.0,32.0,34
3,,,100.0,110


In [26]:
# axis=1 drops columns instead of rows: every column containing a NaN is removed, leaving only "Column D".
df.dropna(axis=1)

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [27]:
# axis=0 works on rows; the result matches the plain df.dropna() call.
df.dropna(axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,3.0,32.0,34


In [28]:
# The string "columns" selects the same axis as axis=1 (same result as df.dropna(axis=1)).
df.dropna(axis="columns")

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [29]:
# how="all" drops a row only when every value in it is missing; no row qualifies here, so all rows are kept.
df.dropna(how="all")

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,3.0,32.0,34
3,,,100.0,110


In [30]:
# how="any" drops a row as soon as one value is missing (same result as the default call).
df.dropna(how="any")

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,3.0,32.0,34


In [31]:
# thresh=3 keeps rows with at least three non-null values; row 3 has only two and is dropped.
df.dropna(thresh=3)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,3.0,32.0,34


In [32]:
# thresh=2: every row has at least two non-null values, so nothing is dropped.
df.dropna(thresh=2)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,3.0,32.0,34
3,,,100.0,110


In [33]:
# thresh applied to columns: keep columns with at least three non-null values ("Column A" has only two).
df.dropna(thresh=3,axis="columns")

Unnamed: 0,Column B,Column C,Column D
0,2.0,,5
1,8.0,9.0,8
2,3.0,32.0,34
3,,100.0,110


## Filling Null Values
- Sometimes, instead of dropping the null values, we might need to replace them with some other value. The right choice depends heavily on the context and the dataset. We have multiple options for filling null values:
    - Replace with zero (0)
    - Replace with the mean of the sample
    - Replace with the closest value
    - Replace with the most frequent value
        (which one to use depends on the content and the user's needs)

In [34]:
# The series with missing values, used for the fill examples that follow.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    7.0
dtype: float64

In [35]:
# Replace every NaN with the constant 0.
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    7.0
dtype: float64

In [36]:
# Replace every NaN with the mean of the non-missing values (3.25 here).
s.fillna(s.mean())

0    1.00
1    2.00
2    3.00
3    3.25
4    3.25
5    7.00
dtype: float64

In [37]:
# fillna() returned a new series each time: s still holds its NaN entries.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    7.0
dtype: float64

- Filling nulls with contiguous (closest) values

In [38]:
# Forward fill: each NaN takes the last valid value that precedes it.
# Series.ffill() is used because the `method` keyword of fillna() is deprecated in recent pandas.
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    7.0
dtype: float64

In [39]:
# Backward fill: each NaN takes the next valid value that follows it.
# Series.bfill() is used because the `method` keyword of fillna() is deprecated in recent pandas.
s.bfill()

0    1.0
1    2.0
2    3.0
3    7.0
4    7.0
5    7.0
dtype: float64

In [40]:
# Forward fill cannot fill a leading NaN: there is no earlier value to copy, so index 0 stays NaN.
# Series.ffill() replaces the deprecated fillna(method="ffill") spelling.
pd.Series([np.nan, 3, np.nan, 9]).ffill()

0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [41]:
# Backward fill cannot fill trailing NaNs: there is no later value to copy, so the last two stay NaN.
# Series.bfill() replaces the deprecated fillna(method="bfill") spelling.
pd.Series([1, np.nan, 3, np.nan, np.nan]).bfill()

0    1.0
1    3.0
2    3.0
3    NaN
4    NaN
dtype: float64

In [42]:
# Back to the DataFrame with missing values.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,3.0,32.0,34
3,,,100.0,110


In [43]:
# Per-column fill values: a constant for A and B, the column's own mean for C.
# "Column D" is not listed and is returned unchanged.
df.fillna(
    {
        "Column A": 0,
        "Column B": 99,
        "Column C": df["Column C"].mean(),
    }
)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,3.0,32.0,34
3,0.0,99.0,100.0,110


In [44]:
# Forward fill down each column (axis=0): a NaN takes the value from the row above.
# A NaN in the first row has nothing above it and stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
df.ffill(axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,3.0,32.0,34
3,30.0,3.0,100.0,110


In [45]:
# Forward fill along each row (axis=1): a NaN takes the value from the column to its left.
# A NaN in the first column has nothing to its left and stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
df.ffill(axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,3.0,32.0,34.0
3,,,100.0,110.0


In [46]:
# Backward fill up each column (axis=0): a NaN takes the value from the row below.
# NaNs in the last row have nothing below them and stay NaN.
# DataFrame.bfill() replaces the deprecated fillna(method="bfill") spelling.
df.bfill(axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,9.0,5
1,30.0,8.0,9.0,8
2,30.0,3.0,32.0,34
3,,,100.0,110


In [47]:
# Backward fill along each row (axis=1): a NaN takes the value from the column to its right.
# DataFrame.bfill() replaces the deprecated fillna(method="bfill") spelling.
df.bfill(axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,5.0,5.0
1,8.0,8.0,9.0,8.0
2,30.0,3.0,32.0,34.0
3,100.0,100.0,100.0,110.0


### Finding unique Values

In [48]:
# Demo frame for the unique-value and replace examples.
# NOTE: this rebinds `df`, replacing the frame used in the earlier sections.
df = pd.DataFrame(
    {
        "Sex": ["M", "F", "F", "D", "?"],
        "Age": [29, 30, 24, 290, 25],
    }
)

In [49]:
# Display the Sex/Age frame built in the previous cell.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [50]:
# unique() returns the distinct values of the column as an array; each value appears once.
df["Sex"].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [51]:
# value_counts() tallies how often each distinct value occurs ('F' twice, every other value once).
df["Sex"].value_counts()

F    2
?    1
M    1
D    1
Name: Sex, dtype: int64

In [52]:
# Replace values column by column using a nested {column: {old: new}} mapping.
# NOTE(review): 'N' does not occur in the "Sex" column defined above, so the 'N' -> 'M'
# entry has no effect, and the '?' entry is left unchanged in the result.
# Confirm which value was meant to be mapped.
df.replace({
    "Sex":{'D':'F',
          'N':'M'},
    "Age":{290:29}
})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


### Duplicates

In [53]:
# Series mapping ambassador name (index) to country (value).
# Some countries appear more than once, which the duplicate examples rely on.
ambassadors = pd.Series(
    data=[
        "France",
        "United Kingdom",
        "United Kingdom",
        "Italy",
        "Germany",
        "Germany",
        "Germany",
    ],
    index=[
        "Gerord  Araud",
        "Kim Dorroch",
        "Peter Westmacott",
        "ArmandoVarrichio",
        "Peter Witiing",
        "Peter Ammon",
        "Klous Schorioth",
    ],
)

In [54]:
# Display the series: ambassador names as the index, countries as the values.
ambassadors

Gerord  Araud               France
Kim Dorroch         United Kingdom
Peter Westmacott    United Kingdom
ArmandoVarrichio             Italy
Peter Witiing              Germany
Peter Ammon                Germany
Klous Schorioth            Germany
dtype: object

In [55]:
# duplicated() marks a value True when it has already appeared earlier; the first occurrence stays False.
ambassadors.duplicated()

Gerord  Araud       False
Kim Dorroch         False
Peter Westmacott     True
ArmandoVarrichio    False
Peter Witiing       False
Peter Ammon          True
Klous Schorioth      True
dtype: bool

In [56]:
# keep="last" treats the last occurrence as the original (False) and marks the earlier ones True.
ambassadors.duplicated(keep="last")

Gerord  Araud       False
Kim Dorroch          True
Peter Westmacott    False
ArmandoVarrichio    False
Peter Witiing        True
Peter Ammon          True
Klous Schorioth     False
dtype: bool

In [57]:
# keep=False marks every occurrence of a repeated value as True.
ambassadors.duplicated(keep=False)

Gerord  Araud       False
Kim Dorroch          True
Peter Westmacott     True
ArmandoVarrichio    False
Peter Witiing        True
Peter Ammon          True
Klous Schorioth      True
dtype: bool

In [58]:
# drop_duplicates() keeps the first entry for each country and removes the later repeats.
ambassadors.drop_duplicates()

Gerord  Araud               France
Kim Dorroch         United Kingdom
ArmandoVarrichio             Italy
Peter Witiing              Germany
dtype: object

In [59]:
# keep="last" keeps the last entry for each country instead of the first.
ambassadors.drop_duplicates(keep="last")

Gerord  Araud               France
Peter Westmacott    United Kingdom
ArmandoVarrichio             Italy
Klous Schorioth            Germany
dtype: object

In [60]:
# keep=False removes every entry whose country is repeated; only countries that appear once remain.
ambassadors.drop_duplicates(keep=False)

Gerord  Araud       France
ArmandoVarrichio     Italy
dtype: object