# Fill Missing Values Using Random Forest

In [1]:
## Import Libraries
import pandas as pd
import numpy as np

In [2]:
# Import Dataset
Data = pd.read_csv('C:/Users/medipalle.tendulkar/Desktop/Kaggle DataSets/Data.csv',index_col='PassengerId')
Data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Data Preprocessing

In [3]:
# Missing Values

Data.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

#### It looks like there are missing values in “Age”, “Embarked”, and “Cabin” columns.You must know that there is so many missing values in the “Cabin” column that I probably cannot do anything about it. I have to drop that column it in the next step.

In [4]:
Data = Data.drop(columns = ['Cabin'],axis=1)

In [5]:
Data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


#### I can attempt to predict the “Age” and “Embarked” columns. I think it is more interesting to predict the age, so let’s do that ;). Now it is time to split the Titanic dataset into two. The dataset which has non-empty age values will be used as the training data for the model.

In [6]:
# Lets check Embarked Feature
Data.Embarked.value_counts()


S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
# As 'S' is the Most frequent category iam going to replace the null values with Most frequent Value i.e, Median

Data['Embarked'].fillna(Data['Embarked'].value_counts().index[0], inplace = True)

In [8]:
DataWithAge = Data[pd.isnull(Data['Age']) == False]
DataWithoutAge = Data[pd.isnull(Data['Age'])]

In [9]:
DataWithAge.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [10]:
DataWithoutAge.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Q
18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,S
20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,C
27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,C
29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,Q


In [11]:
print(DataWithAge.shape,DataWithoutAge.shape)

(714, 10) (177, 10)


In [12]:
# As we Focused on Filling Missing values iam selecting only features that are important.
Features = ['Survived','Pclass','Age','SibSp','Parch','Fare']

In [13]:
# Additionally, categorical variables must be encoded as numeric values. This task can be done using one-hot encoding

one_hot_embarked = pd.get_dummies(DataWithAge['Embarked'],drop_first=True)
one_hot_sex = pd.get_dummies(DataWithAge['Sex'],drop_first=True)
DataWithAge = DataWithAge[Features]
TrainSet = pd.concat([DataWithAge, one_hot_sex, one_hot_embarked], axis = 1)

one_hot_embarked = pd.get_dummies(DataWithoutAge['Embarked'],drop_first=True)
one_hot_sex = pd.get_dummies(DataWithoutAge['Sex'],drop_first=True)
DataWithoutAge = DataWithoutAge[Features]
TestSet = pd.concat([DataWithoutAge, one_hot_sex, one_hot_embarked], axis = 1)

In [14]:
TrainSet.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,22.0,1,0,7.25,1,0,1
2,1,1,38.0,1,0,71.2833,0,0,0
3,1,3,26.0,0,0,7.925,0,0,1
4,1,1,35.0,1,0,53.1,0,0,1
5,0,3,35.0,0,0,8.05,1,0,1


In [15]:
TestSet.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6,0,3,,0,0,8.4583,1,1,0
18,1,2,,0,0,13.0,1,0,1
20,1,3,,0,0,7.225,0,0,0
27,0,3,,0,0,7.225,1,0,0
29,1,3,,0,0,7.8792,0,1,0


In [16]:
TrainSet.columns

Index(['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'male', 'Q',
       'S'],
      dtype='object')

In [17]:
Independent_Features = ['Survived', 'Pclass', 'SibSp', 'Parch', 'Fare', 'male', 'Q','S']

# Model Buliding

In [18]:
# Now the crucial part. It is the time to train the Random Forest regressor and predict the values of the “Age” column
from sklearn.ensemble import RandomForestRegressor
rf_age = RandomForestRegressor()
# Training
rf_age.fit(TrainSet[Independent_Features], TrainSet['Age'])



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [19]:
# Prediction

Predicted_Ages = rf_age.predict(TestSet[Independent_Features])

In [20]:
Predicted_Ages

array([42.85055556, 35.97916667, 14.9       , 33.98904762, 18.7       ,
       27.4787528 , 36.16666667, 19.15      , 22.46666667, 33.444     ,
       31.494228  , 41.00333333, 19.15      , 24.48333333, 33.6       ,
       41.1       , 11.009     , 27.4787528 , 31.494228  , 19.15      ,
       31.494228  , 31.494228  , 27.4787528 , 26.44335664, 18.9       ,
       31.494228  , 50.64722222, 16.56666667, 29.35      , 29.97451441,
       25.18416667, 10.69333333, 35.        , 58.9       ,  4.23      ,
       13.25      , 28.2       , 42.9       , 19.7       , 50.64722222,
       19.15      , 10.69333333, 46.40055556, 27.4787528 ,  5.9       ,
       31.8       , 16.95      , 19.7       , 29.97451441, 39.1       ,
       50.64722222, 28.28333333, 49.75      , 19.15      , 33.92907563,
       58.9       , 41.1       , 41.05      , 19.15      , 28.1       ,
       27.        , 31.494228  , 26.63333333, 10.69333333, 21.5       ,
       33.56666667, 27.4787528 , 26.4       , 59.        , 33.98

In [21]:
# In the original dataset, the “Age” column contains only integers, so I am going to cast the generated values to “int” and replace the missing age values with data predicted by the model.
TestSet['Age'] = Predicted_Ages.astype(int)
Titanic_set = TrainSet.append(TestSet)

In [22]:
# Final Dataset with No Null Values in Age.
Titanic_set.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,male,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,22.0,1,0,7.25,1,0,1
2,1,1,38.0,1,0,71.2833,0,0,0
3,1,3,26.0,0,0,7.925,0,0,1
4,1,1,35.0,1,0,53.1,0,0,1
5,0,3,35.0,0,0,8.05,1,0,1


In [23]:
Titanic_set.shape

(891, 9)

In [24]:
Titanic_set.isnull().sum()

Survived    0
Pclass      0
Age         0
SibSp       0
Parch       0
Fare        0
male        0
Q           0
S           0
dtype: int64