In [23]:
""" This notebook is created to create a model that predicts which passengers survived the Titanic shipwreck."""
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# Any results you write to the current directory are saved as output.

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


In [4]:
# Load the training set (the one that includes the Survived column) and preview
# its first five rows.
train_csv_path = "/kaggle/input/titanic/train.csv"
train_data = pd.read_csv(filepath_or_buffer=train_csv_path)
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Load the test set and preview its first five rows.
test_csv_path = "/kaggle/input/titanic/test.csv"
test_data = pd.read_csv(filepath_or_buffer=test_csv_path)
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
"""Different Data Wrangling steps taken to clean the training data i.e. train_data"""
# Checking for missing values (NaN) in train_data.
# info() prints, for every column, its non-null count and dtype; a column whose
# non-null count is lower than the total number of entries contains missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


From the DataFrame info, we can see that there are **891** records in total, but the **Age**, **Cabin** and **Embarked** columns have fewer than 891 non-null entries, i.e. they contain missing values.

In [44]:
# Count the missing (NaN) entries in each column of the training frame.
train_data.isna().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

**Age** has **177** missing values,
**Cabin** has **687**, and
**Embarked** has **2**.
First, we will handle the missing values in the **Age** column.

In [56]:
# Report the oldest, youngest and mean passenger age to judge how wide the Age
# range is before choosing an imputation strategy.
age = train_data['Age']
print('Oldest Passenger', age.max(), 'years',
      'Youngest Passenger', age.min(), 'years',
      'Mean Age', age.mean(), 'years')

Oldest Passenger 80.0 years Youngest Passenger 0.42 years Mean Age 29.69911764705882 years


We could fill every missing value with the overall mean age. However, the Age column ranges from 0.42 to 80 years, so the overall mean (around 30 years) would be a poor guess for, say, a one-year-old baby, and we cannot assign arbitrary ages to the other passengers either, because age is a crucial feature for the predictions. Instead, I use the **Name** column, which contains a salutation (a title such as Mr. or Mrs.). Based on that salutation we can group the passengers and fill each missing age with the mean age of the passenger's salutation group.

In [67]:
# Work on a copy so the raw train_data frame stays untouched and this cell can be
# re-run safely. (mod_train_data was not defined anywhere earlier in the notebook,
# so without this line the cell fails with NameError on a fresh kernel.)
mod_train_data = train_data.copy()

# Pull the salutation out of Name: the first run of letters that is immediately
# followed by a period (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# A raw string avoids the invalid "\." escape warning, and expand=False makes
# extract return a Series rather than a one-column DataFrame.
mod_train_data['Title'] = mod_train_data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
mod_train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [72]:
# Tally how many passengers carry each extracted title, to see which title groups
# exist before the missing ages are filled group by group.
mod_train_data['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Col           2
Major         2
Sir           1
Lady          1
Capt          1
Don           1
Ms            1
Jonkheer      1
Countess      1
Mme           1
Name: Title, dtype: int64

The output above contains many titles. Some of them are rare, misspelled, or foreign-language equivalents of the common titles (e.g. Mlle, Mme). We need to replace those titles with the relevant common ones.

In [75]:
# Fold the rare titles into the common ones ('Dr' is mapped to 'Unknown').
# A dict keeps each old -> new pair side by side instead of relying on the
# positions of two parallel lists.
# NOTE(review): 'Jonkheer' is a male honorific, yet it is mapped to 'Mrs' here;
# the mapping is kept as-is to preserve the existing results - confirm intent.
title_aliases = {
    'Dr': 'Unknown',
    'Rev': 'Mr',
    'Mlle': 'Miss',
    'Col': 'Mr',
    'Major': 'Mr',
    'Sir': 'Mr',
    'Lady': 'Mrs',
    'Capt': 'Mr',
    'Don': 'Mr',
    'Ms': 'Miss',
    'Jonkheer': 'Mrs',
    'Countess': 'Mrs',
    'Mme': 'Mrs',
}

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form acts on an intermediate object and does not
# update the parent frame when pandas Copy-on-Write is active.
mod_train_data['Title'] = mod_train_data['Title'].replace(title_aliases)
mod_train_data.Title.value_counts()

Mr         530
Miss       185
Mrs        129
Master      40
Unknown      7
Name: Title, dtype: int64

In [83]:
# Mean Age of each title group (missing ages are skipped by the mean).
mod_train_data.groupby('Title')['Age'].agg('mean')

Title
Master      4.574167
Miss       21.845638
Mr         32.879562
Mrs        35.892857
Unknown    42.000000
Name: Age, dtype: float64

In [90]:
# Fill every missing Age with the mean Age of the passengers sharing the same Title.
# The per-title means are computed from the data (rounded to 2 decimals) instead of
# being typed in by hand from a previous cell's output, so they stay correct if the
# input data or the title grouping changes, and one statement covers every title.
title_mean_age = mod_train_data.groupby('Title')['Age'].transform('mean').round(2)
mod_train_data['Age'] = mod_train_data['Age'].fillna(title_mean_age)

In [92]:
# Sanity check: True if any Age value is still missing after imputation.
mod_train_data['Age'].isna().any()

False

With this, we have completed the handling of missing values in the **Age** column. Now we handle the missing values in the **Embarked** column.

In [97]:
# List the distinct values present in Embarked (NaN included).
mod_train_data['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

As per the description in the Data section, the **Embarked** column contains only 3 values: S, C and Q. We can fill the 2 missing values with the **majority Embarked value**.

In [98]:
# Count passengers per embarkation port to find the most frequent one.
mod_train_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

So, as per the above output, we will fill the missing values of the Embarked column with 'S'.

In [101]:
# Fill the missing embarkation ports with 'S', then re-count to confirm the change.
mod_train_data['Embarked'] = mod_train_data['Embarked'].fillna('S')
mod_train_data['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [102]:
# Sanity check: True if any Embarked value is still missing.
mod_train_data['Embarked'].isna().any()

False

With this, we have filled the missing values in the **Embarked** column as well.