In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
dict1={'Names':['Ramesh','Suresh',np.nan,'Mahesh'],
'Age':[31,32,33,np.nan],
'City':[np.nan,'Hyd','Mumbai','Chennai']}

In [3]:
data1 = pd.DataFrame(dict1)
data1

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,
1,Suresh,32.0,Hyd
2,,33.0,Mumbai
3,Mahesh,,Chennai


In [4]:
data1.isnull()

Unnamed: 0,Names,Age,City
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [5]:
data1.isnull().sum()

Names    1
Age      1
City     1
dtype: int64

In [6]:
data1.isnull().sum() / len(data1)

Names    0.25
Age      0.25
City     0.25
dtype: float64

In [7]:
data1.isnull().sum() * 100 / len(data1)

Names    25.0
Age      25.0
City     25.0
dtype: float64

In [8]:
dict2 = {'Names':['Ramesh','Suresh',None,'Mahesh'],
'Age':[31,32,33,None],
'City':[None,'Hyd','Mumbai','Chennai']}

In [9]:
data2 = pd.DataFrame(dict2)
data2

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,
1,Suresh,32.0,Hyd
2,,33.0,Mumbai
3,Mahesh,,Chennai


In [10]:
data2.isnull()

Unnamed: 0,Names,Age,City
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [11]:
data2.isnull().sum()

Names    1
Age      1
City     1
dtype: int64

In [12]:
data2.isnull().sum()*100/len(data2)

Names    25.0
Age      25.0
City     25.0
dtype: float64

- If None is given in the data it will automatically take NaN for categorical columns

In [13]:
dict3 = {'Names':['Ramesh','Suresh','Null','Mahesh'],
'Age':[31,32,33,'Null'],
'City':['Null','Hyd','Mumbai','Chennai']}

In [14]:
data3 = pd.DataFrame(dict3)
data3

Unnamed: 0,Names,Age,City
0,Ramesh,31,Null
1,Suresh,32,Hyd
2,Null,33,Mumbai
3,Mahesh,Null,Chennai


- 'Null' is taken as string its not considered as null value

In [15]:
data3.isnull()

Unnamed: 0,Names,Age,City
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


$**METHOD - 1**$

> **Filling the missing values with Random numbers**

- dataframe name = data1

- method name = fillna

In [16]:
data1.fillna(40)

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,40
1,Suresh,32.0,Hyd
2,40,33.0,Mumbai
3,Mahesh,40.0,Chennai


- **Problem is** even the categorical data is fill with 40

$**METHOD-2**$

> Fill the missing values with random number on specific column

- dataframe name= data1
- method name:fillna

In [21]:
data1['Names'].fillna('Satish')

0    Ramesh
1    Suresh
2    Satish
3    Mahesh
Name: Names, dtype: object

In [20]:
data1['Names'].fillna('Satish',inplace=True)
data1

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,
1,Suresh,32.0,Hyd
2,Satish,33.0,Mumbai
3,Mahesh,,Chennai


In [37]:
#CREATE THE DICTIONARY/DATA AGAIN 
dict1={'Names':['Ramesh','Suresh',np.nan,'Mahesh'],
'Age':[31,32,33,np.nan],
'City':[np.nan,'Hyd','Mumbai','Chennai']}
data1=pd.DataFrame(dict1)

$**Method-3**$

- bfill
- ffill
- pad
- backfill

In [25]:
data1

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,
1,Suresh,32.0,Hyd
2,,33.0,Mumbai
3,Mahesh,,Chennai


In [24]:
data1.fillna(method='backfill')
# Names index 2 has missed value
# it will replace by index 3 value {value of next index}
#Age index 3 has missed value
# we dont have index 4, so the value is NaN
# City index 0 has missed value
# it replace with index 1 value

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,Hyd
1,Suresh,32.0,Hyd
2,Mahesh,33.0,Mumbai
3,Mahesh,,Chennai


In [26]:
data1.fillna(method='bfill')

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,Hyd
1,Suresh,32.0,Hyd
2,Mahesh,33.0,Mumbai
3,Mahesh,,Chennai


- bfill and backfill both are same
- pad and ffill both are same

In [27]:
data1.fillna(method='pad')

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,
1,Suresh,32.0,Hyd
2,Suresh,33.0,Mumbai
3,Mahesh,33.0,Chennai


In [28]:
data1.fillna(method='ffill')

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,
1,Suresh,32.0,Hyd
2,Suresh,33.0,Mumbai
3,Mahesh,33.0,Chennai


$**Method-4**$

- Mean
- Median
- Mode

In [29]:
data1

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,
1,Suresh,32.0,Hyd
2,,33.0,Mumbai
3,Mahesh,,Chennai


In [30]:
age_mean = data1['Age'].mean()
age_mean

32.0

In [36]:
data1['Age'].fillna(age_mean)

0    31.0
1    32.0
2    33.0
3    32.0
Name: Age, dtype: float64

In [35]:
data1

Unnamed: 0,Names,Age,City
0,Ramesh,31.0,
1,Suresh,32.0,Hyd
2,,33.0,Mumbai
3,Mahesh,32.0,Chennai


In [39]:
age_median = data1['Age'].median()
data1['Age'].fillna(age_median)

0    31.0
1    32.0
2    33.0
3    32.0
Name: Age, dtype: float64

In [40]:
age_median

32.0

> BY USING MEDIAN MISSING VALUES ARE FILLED AND OUTLIERS CAN BE REMOVED

In [41]:
age_mode = data1['Age'].mode()
data1['Age'].fillna(age_mode)

0    31.0
1    32.0
2    33.0
3     NaN
Name: Age, dtype: float64

$**Method-5**$

**K nearest neighbours**

- In the KNN imputer instead of taking mean of all the values
- Will choose neigbours data
- Will take those mean only

**KNN Imputer**

<img src="https://hlab.stanford.edu/brian/making7.gif" jsaction="VQAsE" class="sFlh5c pT0Scc iPVvYb" style="max-width: 668px; height: 393px; margin: 0px; width: 466px;" alt="Measuring Dis/Similarities" jsname="kn3ccd" aria-hidden="false">

- n_neighbors is parameter can choose by user
- if we dont choose by default it will takes as =5

In [47]:
from sklearn.impute import KNNImputer
knn = KNNImputer(n_neighbors=2)
knn.fit_transform(data1[['Age']])

array([[31.],
       [32.],
       [33.],
       [32.]])

In [48]:
# 2 neighbors means : 33 and 32
# mean of 33+32/2 = 32

$**Method-6**$

- Based on other columns, Some times all above methods will not provide good justification.
- At that time we need to check other columns dependancy also.
- Most of the time will pick a column which have highest correlation.