# Using machine learning model to handle missing values in a dataset

### To predict a categorical variable, a classification machine learning algorithm is required such as Logistic Regression, SVM, Naive Bayes, etc.

### To predict a continuous variable, a regression machine learning algorithm is required such as Linear Regression, SVR, etc.

In [1]:
#importing required libraries
import pandas as pd
#!pip3 install sklearn
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [2]:
#reading input data file
input_dataset = pd.read_csv("C:\\Users\\pc\\Documents\\edl\\data cleaning\\Kaggle_train.csv")

In [3]:
#checking sample records
input_dataset


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# set seed for reproducibility
np.random.seed(0) 

In [5]:
#checking number of missing values in dataset 
input_dataset.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [22]:
#imputing the null values in column age
before_imp_mean=input_dataset['Age'].mean()
before_imp_mean

0.36792055349407926

In [7]:
#plotting correlation matrix
input_dataset.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [8]:
#creating dummy column for categorical text columns Sex and Embark
cat_variables = input_dataset[['Sex', 'Embarked']]
cat_dummies = pd.get_dummies(cat_variables, drop_first=True)
cat_dummies.head()

Unnamed: 0,Sex_male,Embarked_Q,Embarked_S
0,1,0,1
1,0,0,0
2,0,0,1
3,0,0,1
4,1,0,1


In [9]:
#removing the text column and adding the encoded column to the original dataset 
input_dataset = input_dataset.drop(['Sex', 'Embarked'], axis=1)
input_dataset = pd.concat([input_dataset, cat_dummies], axis=1)
input_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,1


In [10]:
'''KNN Imptuer is a distance-based imputation method and it requires normalized data. Otherwise, the different 
scales of our data will lead the KNN Imputer to generate biased replacements for the missing values. For simplicity,
we will use Scikit-Learn’s MinMaxScaler which will scale our variables to have values between 0 and 1.'''


'KNN Imptuer is a distance-based imputation method and it requires normalized data. Otherwise, the different \nscales of our data will lead the KNN Imputer to generate biased replacements for the missing values. For simplicity,\nwe will use Scikit-Learn’s MinMaxScaler which will scale our variables to have values between 0 and 1.'

In [11]:
input_dataset=input_dataset.drop(['Name','Ticket','Cabin'],axis=1)
scaler = MinMaxScaler()
input_dataset = pd.DataFrame(scaler.fit_transform(input_dataset), columns = input_dataset.columns)
input_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0.0,0.0,1.0,0.271174,0.125,0.0,0.014151,1.0,0.0,1.0
1,0.001124,1.0,0.0,0.472229,0.125,0.0,0.139136,0.0,0.0,0.0
2,0.002247,1.0,1.0,0.321438,0.0,0.0,0.015469,0.0,0.0,1.0
3,0.003371,1.0,0.0,0.434531,0.125,0.0,0.103644,0.0,0.0,1.0
4,0.004494,0.0,1.0,0.434531,0.0,0.0,0.015713,1.0,0.0,1.0


In [16]:
imputer5=KNNImputer(n_neighbors=5)
df=pd.DataFrame(imputer.fit_transform(input_dataset),columns = input_dataset.columns)


In [23]:
imputer5=KNNImputer(n_neighbors=1)
df1=pd.DataFrame(imputer.fit_transform(input_dataset),columns = input_dataset.columns)


In [20]:
df['Age'].mean()

0.36580868301568664

In [24]:
df1['Age'].mean()

0.36580868301568664