In [1]:
import numpy as np
import pandas as pd

# Handling Missing Values
Missing values might be the most undesired values in data science.We definitely do not want to have them. 
1. Let’s start by creating a sample data frame and adding some missing values to it.

In [2]:
df=pd.DataFrame({
    "Date":pd.date_range(start="2021-10-01",periods=10,freq="D"),
    "Item":1014,
    "Measure_1":np.random.randint(1,10,size=10),
    "Measure_2":np.random.random(10).round(2),
    "Measure_3":np.random.random(10).round(2),
    "Measure_4":np.random.randn(10)
})

In [3]:
df

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014,4,0.91,0.29,0.172335
1,2021-10-02,1014,4,0.58,0.53,-0.311553
2,2021-10-03,1014,4,0.1,0.04,0.366114
3,2021-10-04,1014,3,0.07,0.6,-0.618547
4,2021-10-05,1014,9,0.81,0.42,-0.13408
5,2021-10-06,1014,8,0.15,0.42,-0.896923
6,2021-10-07,1014,5,0.03,0.06,-0.980379
7,2021-10-08,1014,5,0.0,0.96,-0.890098
8,2021-10-09,1014,5,0.34,0.05,-0.428644
9,2021-10-10,1014,1,0.62,0.25,-0.890692


In [4]:
#Adding few missing values
df.loc[[2,9],"Item"]=np.nan
df.loc[[2,7,9],"Measure_1"]=np.nan
df.loc[[2,3],"Measure_2"]=np.nan
df.loc[[2],"Measure_3"]=np.nan
df.loc[:6,"Measure_4"]=np.nan

In [5]:
df

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014.0,4.0,0.91,0.29,
1,2021-10-02,1014.0,4.0,0.58,0.53,
2,2021-10-03,,,,,
3,2021-10-04,1014.0,3.0,,0.6,
4,2021-10-05,1014.0,9.0,0.81,0.42,
5,2021-10-06,1014.0,8.0,0.15,0.42,
6,2021-10-07,1014.0,5.0,0.03,0.06,
7,2021-10-08,1014.0,,0.0,0.96,-0.890098
8,2021-10-09,1014.0,5.0,0.34,0.05,-0.428644
9,2021-10-10,,,0.62,0.25,-0.890692


With Pandas, an integer type missing value representation (<NA>) was introduced so we can have missing values in integer columns as well. However, we need to explicitly declare the data type.

In [6]:
df=df.astype({
    "Item":pd.Int64Dtype(),
    "Measure_1":pd.Int64Dtype()
})

In [7]:
df

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014.0,4.0,0.91,0.29,
1,2021-10-02,1014.0,4.0,0.58,0.53,
2,2021-10-03,,,,,
3,2021-10-04,1014.0,3.0,,0.6,
4,2021-10-05,1014.0,9.0,0.81,0.42,
5,2021-10-06,1014.0,8.0,0.15,0.42,
6,2021-10-07,1014.0,5.0,0.03,0.06,
7,2021-10-08,1014.0,,0.0,0.96,-0.890098
8,2021-10-09,1014.0,5.0,0.34,0.05,-0.428644
9,2021-10-10,,,0.62,0.25,-0.890692


# Method 1: Drop rows or columns that have a missing value

In [8]:
#One option is to drop the rows or columns that contain a missing value.
df.dropna()
#With the default parameter values, the dropna function drops the rows that contain any 
#missing value.

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
8,2021-10-09,1014,5,0.34,0.05,-0.428644


In [9]:
#We can also choose to drop columns that have at least one missing value by using the 
#axis parameter.
df.dropna(axis=1)

Unnamed: 0,Date
0,2021-10-01
1,2021-10-02
2,2021-10-03
3,2021-10-04
4,2021-10-05
5,2021-10-06
6,2021-10-07
7,2021-10-08
8,2021-10-09
9,2021-10-10


In [10]:
#Another situation is to have a column or row that is full of missing values. 
#The dropna function can be used for this as well. We just need to change the value of 
#how parameter.
df.dropna(how="all")

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014.0,4.0,0.91,0.29,
1,2021-10-02,1014.0,4.0,0.58,0.53,
2,2021-10-03,,,,,
3,2021-10-04,1014.0,3.0,,0.6,
4,2021-10-05,1014.0,9.0,0.81,0.42,
5,2021-10-06,1014.0,8.0,0.15,0.42,
6,2021-10-07,1014.0,5.0,0.03,0.06,
7,2021-10-08,1014.0,,0.0,0.96,-0.890098
8,2021-10-09,1014.0,5.0,0.34,0.05,-0.428644
9,2021-10-10,,,0.62,0.25,-0.890692


In [11]:
#Drop rows or columns based on a threshold value
#For instance, “thresh=4” means that the rows that have at least 4 non-missing values will be kept. The other ones will be dropped.
df.dropna(thresh=4)

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014.0,4.0,0.91,0.29,
1,2021-10-02,1014.0,4.0,0.58,0.53,
3,2021-10-04,1014.0,3.0,,0.6,
4,2021-10-05,1014.0,9.0,0.81,0.42,
5,2021-10-06,1014.0,8.0,0.15,0.42,
6,2021-10-07,1014.0,5.0,0.03,0.06,
7,2021-10-08,1014.0,,0.0,0.96,-0.890098
8,2021-10-09,1014.0,5.0,0.34,0.05,-0.428644
9,2021-10-10,,,0.62,0.25,-0.890692


In [12]:
#Drop based on a particular subset of columns
#The subset parameter of the dropna function is used for this task. For instance, 
#we can drop the rows that have a missing value in measure 1 or measure 2 columns as follows:
df.dropna(subset=["Measure_2","Measure_3"])

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014.0,4.0,0.91,0.29,
1,2021-10-02,1014.0,4.0,0.58,0.53,
4,2021-10-05,1014.0,9.0,0.81,0.42,
5,2021-10-06,1014.0,8.0,0.15,0.42,
6,2021-10-07,1014.0,5.0,0.03,0.06,
7,2021-10-08,1014.0,,0.0,0.96,-0.890098
8,2021-10-09,1014.0,5.0,0.34,0.05,-0.428644
9,2021-10-10,,,0.62,0.25,-0.890692


# Method 2: Fill with a constant value

In [13]:
#We can choose a constant value to be used as a replacement for the missing values.
values={"Item":1014,"Measure_1":0}
df.fillna(value=values)

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014,4,0.91,0.29,
1,2021-10-02,1014,4,0.58,0.53,
2,2021-10-03,1014,0,,,
3,2021-10-04,1014,3,,0.6,
4,2021-10-05,1014,9,0.81,0.42,
5,2021-10-06,1014,8,0.15,0.42,
6,2021-10-07,1014,5,0.03,0.06,
7,2021-10-08,1014,0,0.0,0.96,-0.890098
8,2021-10-09,1014,5,0.34,0.05,-0.428644
9,2021-10-10,1014,0,0.62,0.25,-0.890692


# Method 3: Fill with an aggregated value

In [14]:
#Another option is to use an aggregated value such as mean, median, or mode.
df["Measure_2"].fillna(df["Measure_2"].mean())

0    0.91
1    0.58
2    0.43
3    0.43
4    0.81
5    0.15
6    0.03
7    0.00
8    0.34
9    0.62
Name: Measure_2, dtype: float64

In [15]:
#df["Measure_1"].fillna(df["Measure_1"].median())
from sklearn.impute import SimpleImputer
my_imputer=SimpleImputer(strategy='median')
my_imputer.fit_transform(np.array(df['Measure_3']).reshape(-1,1))

array([[0.29],
       [0.53],
       [0.42],
       [0.6 ],
       [0.42],
       [0.42],
       [0.06],
       [0.96],
       [0.05],
       [0.25]])

# Method 4: Replace with the previous or next value

In [16]:
#It is possible to replace the missing values in a column with the previous or next value 
# in that column.
#This method might come in handy when working with time-series data. 
#Consider you have a data frame that contains the daily temperature measurement
# and the temperate in one day is missing. 
#The optimal solution would be to use the temperature in the next or previous day.
df.fillna(method="bfill")

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014.0,4.0,0.91,0.29,-0.890098
1,2021-10-02,1014.0,4.0,0.58,0.53,-0.890098
2,2021-10-03,1014.0,3.0,0.81,0.6,-0.890098
3,2021-10-04,1014.0,3.0,0.81,0.6,-0.890098
4,2021-10-05,1014.0,9.0,0.81,0.42,-0.890098
5,2021-10-06,1014.0,8.0,0.15,0.42,-0.890098
6,2021-10-07,1014.0,5.0,0.03,0.06,-0.890098
7,2021-10-08,1014.0,5.0,0.0,0.96,-0.890098
8,2021-10-09,1014.0,5.0,0.34,0.05,-0.428644
9,2021-10-10,,,0.62,0.25,-0.890692


In [17]:
#we can limit the number of missing values replaced with this method. 
#If we set the limit parameter as 1, then a missing value can only be replaced with its next value. 
#The second or third following value will not be used for replacement.
df.fillna(method="bfill",limit=1)

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014.0,4.0,0.91,0.29,
1,2021-10-02,1014.0,4.0,0.58,0.53,
2,2021-10-03,1014.0,3.0,,0.6,
3,2021-10-04,1014.0,3.0,0.81,0.6,
4,2021-10-05,1014.0,9.0,0.81,0.42,
5,2021-10-06,1014.0,8.0,0.15,0.42,
6,2021-10-07,1014.0,5.0,0.03,0.06,-0.890098
7,2021-10-08,1014.0,5.0,0.0,0.96,-0.890098
8,2021-10-09,1014.0,5.0,0.34,0.05,-0.428644
9,2021-10-10,,,0.62,0.25,-0.890692


# Method 5: Fill by using another dataframe

We can also pass another data frame to the fillna function. The values in the new data frame will be used to replace the missing values in the current data frame.

The values will be selected according to the row indices and column names. For instance, if there is a missing value in the second row in the item column, the value in the same location in the new data frame will be used.

In [18]:
#Defining a new dataframe
df1=pd.DataFrame({
    "Date":pd.date_range(start="2021-10-01",periods=10,freq="D"),
    "Item":1014,
    "Measure_1":np.random.randint(1,10,size=10),
    "Measure_2":np.random.random(10).round(2),
    "Measure_3":np.random.random(10).round(2),
    "Measure_4":np.random.randn(10)
})
df1

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014,2,0.03,0.31,-1.357099
1,2021-10-02,1014,4,0.16,0.35,-1.211626
2,2021-10-03,1014,6,0.47,0.95,-0.466752
3,2021-10-04,1014,3,0.78,0.68,-0.570513
4,2021-10-05,1014,1,0.22,0.37,-0.056274
5,2021-10-06,1014,8,0.81,0.62,-0.087516
6,2021-10-07,1014,6,0.66,0.6,-0.090294
7,2021-10-08,1014,9,0.18,0.64,-1.712957
8,2021-10-09,1014,5,0.82,0.51,-0.000231
9,2021-10-10,1014,7,0.05,0.52,1.760629


In [19]:
#Using df1 to fill missing values in df
df.fillna(df1)

Unnamed: 0,Date,Item,Measure_1,Measure_2,Measure_3,Measure_4
0,2021-10-01,1014,4,0.91,0.29,-1.357099
1,2021-10-02,1014,4,0.58,0.53,-1.211626
2,2021-10-03,1014,6,0.47,0.95,-0.466752
3,2021-10-04,1014,3,0.78,0.6,-0.570513
4,2021-10-05,1014,9,0.81,0.42,-0.056274
5,2021-10-06,1014,8,0.15,0.42,-0.087516
6,2021-10-07,1014,5,0.03,0.06,-0.090294
7,2021-10-08,1014,9,0.0,0.96,-0.890098
8,2021-10-09,1014,5,0.34,0.05,-0.428644
9,2021-10-10,1014,7,0.62,0.25,-0.890692


# Method 6: Using Random Forest classifier to impute missing values

In [20]:
!pip install missingpy

Collecting missingpy
  Downloading missingpy-0.2.0-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: missingpy
Successfully installed missingpy-0.2.0
[0m

In [21]:
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest

In [22]:
df2=pd.read_csv('/kaggle/input/preprocessing/train.csv')

In [23]:
df2.drop("Name",axis=1,inplace=True)
df2.drop("Ticket",axis=1,inplace=True)
df2.drop("PassengerId",axis=1,inplace=True)
df2.drop("Cabin",axis=1,inplace=True)
df2.drop("Embarked",axis=1,inplace=True)

In [24]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2['Sex'] = le.fit_transform(df2['Sex'])

In [25]:
df2

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,1,22.0,1,0,7.2500,0
1,1,0,38.0,1,0,71.2833,1
2,3,0,26.0,0,0,7.9250,1
3,1,0,35.0,1,0,53.1000,1
4,3,1,35.0,0,0,8.0500,0
...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,13.0000,0
887,1,0,19.0,0,0,30.0000,1
888,3,0,,1,2,23.4500,0
889,1,1,26.0,0,0,30.0000,1


In [26]:
imputer1=MissForest()
imputer1.fit_transform(df2)



Iteration: 0




Iteration: 1




Iteration: 2


array([[ 3.    ,  1.    , 22.    , ...,  0.    ,  7.25  ,  0.    ],
       [ 1.    ,  0.    , 38.    , ...,  0.    , 71.2833,  1.    ],
       [ 3.    ,  0.    , 26.    , ...,  0.    ,  7.925 ,  1.    ],
       ...,
       [ 3.    ,  0.    , 21.93  , ...,  2.    , 23.45  ,  0.    ],
       [ 1.    ,  1.    , 26.    , ...,  0.    , 30.    ,  1.    ],
       [ 3.    ,  1.    , 32.    , ...,  0.    ,  7.75  ,  0.    ]])