In [3]:
import numpy as np
import pandas as pd

In [4]:
# Missing entries are written as np.nan: a bare NaN is not a defined Python name,
# and the quoted "NaN" would be stored as an ordinary string.
data_dic = dict(
    A=[1, 2, np.nan, 4, np.nan],
    B=[np.nan] * 5,  # a column made up entirely of missing values
    C=list(range(11, 16)),  # 11..15, no missing values
    D=[16, np.nan, 18, 19, 20],
)

# Build the demo frame used by every cell below, then display it.
df = pd.DataFrame(data=data_dic)
df

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [5]:
df.info()  # prints the index range, per-column non-null counts, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      float64
 1   B       0 non-null      float64
 2   C       5 non-null      int64  
 3   D       4 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 292.0 bytes


In [6]:
df.isnull()  # element-wise mask: True where the value is missing (NaN)

Unnamed: 0,A,B,C,D
0,False,True,False,False
1,False,True,False,True
2,True,True,False,False
3,False,True,False,False
4,True,True,False,False


In [7]:
df.isnull().sum()  # number of NaN values in each column

A    2
B    5
C    0
D    1
dtype: int64

In [8]:
df.isnull().sum().sum()  # total number of NaN values in the whole table

np.int64(8)

In [10]:
df['A'].isnull()  # NaN mask for column A only

0    False
1    False
2     True
3    False
4     True
Name: A, dtype: bool

In [11]:
df['A'].isnull().sum()  # how many NaN values there are in column A

np.int64(2)

In [12]:
df.isna()  # isna() and isnull() are aliases: this gives the same mask as df.isnull()

Unnamed: 0,A,B,C,D
0,False,True,False,False
1,False,True,False,True
2,True,True,False,False
3,False,True,False,False
4,True,True,False,False


In [14]:
df.loc[1].isnull()  # NaN mask for the row labelled 1 (one entry per column)

A    False
B     True
C    False
D     True
Name: 1, dtype: bool

In [15]:
df.loc[1].isnull().sum()  # number of NaN values in the row labelled 1

np.int64(2)

In [16]:
df.notnull()  # True where a value is present (the inverse of the isnull() mask)

Unnamed: 0,A,B,C,D
0,True,False,True,True
1,True,False,True,False
2,False,False,True,True
3,True,False,True,True
4,False,False,True,True


In [18]:
df.shape  # careful: shape is an attribute, not a method, so no parentheses

(5, 4)

In [20]:
df.notna().sum()  # per-column count of non-null values (a Series with one entry per column)
# NOTE(review): the saved output below is a single scalar, which this expression cannot
# produce -- presumably left over from an earlier `.sum().sum()` version; re-run the cell.

np.int64(12)

In [21]:
df.notnull().sum().sum()  # total number of non-null values in the whole table

np.int64(12)

In [22]:
df  # redisplay the frame for reference

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [23]:
df['A'].sum()  # NaN entries are skipped: 1 + 2 + 4 = 7.0

np.float64(7.0)

In [24]:
df['A'].mean()  # mean over the 3 non-null values only (7 / 3); NaN entries are skipped

np.float64(2.3333333333333335)

In [25]:
df.loc[3].sum()  # row-wise sum for the row labelled 3; NaN is skipped (4 + 14 + 19 = 37.0)

np.float64(37.0)

In [27]:
df  # redisplay the frame for reference

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [28]:
# Drop every row that holds at least one null and return a new frame.
# Column B is entirely NaN, so every row qualifies and the result is empty.
df.dropna(axis="index")

Unnamed: 0,A,B,C,D


In [29]:
# Drop every column that holds at least one null; only the fully populated column C survives.
df.dropna(axis="columns")

Unnamed: 0,C
0,11
1,12
2,13
3,14
4,15


In [30]:
df

# dropna() also accepts "thresh = n" to set the minimum number of non-null values required
# "thresh = 3" keeps only the rows/columns (depending on axis) that have >= 3 non-null values

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [31]:
# Keep only the columns that have at least 3 non-null values (B, with none, is dropped).
df.dropna(axis="columns", thresh=3)

Unnamed: 0,A,C,D
0,1.0,11,16.0
1,2.0,12,
2,,13,18.0
3,4.0,14,19.0
4,,15,20.0


In [32]:
# every null value in the frame is replaced by the constant 2 (a new frame is returned)
df.fillna(value = 2)

Unnamed: 0,A,B,C,D
0,1.0,2.0,11,16.0
1,2.0,2.0,12,2.0
2,2.0,2.0,13,18.0
3,4.0,2.0,14,19.0
4,2.0,2.0,15,20.0


In [33]:
# Fill the gaps in column A with that column's mean (computed over its non-null values).
df["A"].fillna(df["A"].mean())

0    1.000000
1    2.000000
2    2.333333
3    4.000000
4    2.333333
Name: A, dtype: float64

In [34]:
df  # the original frame is unchanged: the fillna() calls above returned new objects

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [37]:
# Forward fill: each null takes the last non-null value above it in the same column.
# Nulls with no earlier value (e.g. all of column B) stay null.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill()

  df.fillna(method = 'ffill')  # forward fill (value will be copied forward to fill null)


Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,16.0
2,2.0,,13,18.0
3,4.0,,14,19.0
4,4.0,,15,20.0


In [39]:
# Backward fill: each null takes the next non-null value below it in the same column.
# Nulls with no later value (e.g. the last row of A, all of column B) stay null.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill()

  df.fillna(method = 'bfill')  # backward fill (value will be copied backward to fill null)


Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,18.0
2,4.0,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [40]:
df.fillna(0)  # positional form of fillna(value = 0): every null becomes 0

Unnamed: 0,A,B,C,D
0,1.0,0.0,11,16.0
1,2.0,0.0,12,0.0
2,0.0,0.0,13,18.0
3,4.0,0.0,14,19.0
4,0.0,0.0,15,20.0


In [41]:
df  # still contains its NaNs: none of the fill calls above modified it in place

Unnamed: 0,A,B,C,D
0,1.0,,11,16.0
1,2.0,,12,
2,,,13,18.0
3,4.0,,14,19.0
4,,,15,20.0


In [None]:
# -------- remaining -------