In [1]:
import numpy as np
import pandas as pd

# Load the passenger data and drop the 'Name' and 'Ticket' columns, which
# this notebook does not use, in a single chained expression.
titanic = (
    pd.read_csv('../data/titanic.csv')
    .drop(columns=['Name', 'Ticket'])
)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,7.25,,S
1,2,1,1,female,38.0,1,0,71.2833,C85,C
2,3,1,3,female,26.0,0,0,7.925,,S
3,4,1,1,female,35.0,1,0,53.1,C123,S
4,5,0,3,male,35.0,0,0,8.05,,S


### Create indicator for Cabin

In [2]:
# My exploratory analysis showed that the missing values for Cabin are not
# missing at random: passengers without a cabin were much less likely to
# survive. Keep that signal as a binary flag:
#   1 -> a Cabin value is present, 0 -> the Cabin value is missing.
titanic['Cabin_ind'] = titanic['Cabin'].notnull().astype(int)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Cabin_ind
0,1,0,3,male,22.0,1,0,7.25,,S,0
1,2,1,1,female,38.0,1,0,71.2833,C85,C,1
2,3,1,3,female,26.0,0,0,7.925,,S,0
3,4,1,1,female,35.0,1,0,53.1,C123,S,1
4,5,0,3,male,35.0,0,0,8.05,,S,0


### Convert Sex to numeric

In [3]:
# Convert the 'Sex' feature from 'male' / 'female' to a number.
#
# For context: the ML model does not know what male vs. female really means.
# All it knows is that there are two values for sex, and numeric values are
# easier for most models to handle.
#
# The dictionary below defines the label -> number mapping; it is applied to
# the 'Sex' column with the .map method.
gender_num = {'male': 0, 'female': 1}

# .map returns NaN for every value that is not a key of the dictionary. That
# includes unexpected labels, and it also includes a second run of this cell
# (the column then already holds 0/1). Build the encoded column first and
# validate it, so the original column is never silently overwritten with NaN.
sex_encoded = titanic['Sex'].map(gender_num)
unmapped = titanic['Sex'].notna() & sex_encoded.isna()
if unmapped.any():
    raise ValueError(
        "Cannot encode 'Sex': no mapping for value(s) {} "
        "(has this cell already been run?)".format(
            titanic.loc[unmapped, 'Sex'].unique().tolist()
        )
    )

titanic['Sex'] = sex_encoded
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Cabin_ind
0,1,0,3,0,22.0,1,0,7.25,,S,0
1,2,1,1,1,38.0,1,0,71.2833,C85,C,1
2,3,1,3,1,26.0,0,0,7.925,,S,0
3,4,1,1,1,35.0,1,0,53.1,C123,S,1
4,5,0,3,0,35.0,0,0,8.05,,S,0


### Drop Cabin and Embarked

In [4]:
# Remove the raw 'Cabin' column (its signal is kept in 'Cabin_ind') together
# with 'Embarked'. After this step i have a clean numeric dataset that will
# be easier for a model to fit.
titanic = titanic.drop(columns=['Cabin', 'Embarked'])
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin_ind
0,1,0,3,0,22.0,1,0,7.25,0
1,2,1,1,1,38.0,1,0,71.2833,1
2,3,1,3,1,26.0,0,0,7.925,0
3,4,1,1,1,35.0,1,0,53.1,1
4,5,0,3,0,35.0,0,0,8.05,0
