In [1]:
import sys
assert sys.version_info >= (3, 5)

In [2]:
import numpy as np
import os

In [3]:
HOUSING_PATH = os.path.join("datasets")

In [4]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "titanic.csv")
    return pd.read_csv(csv_path)

In [5]:
titanic = load_housing_data()
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
titanic["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [7]:
passengerCount = titanic["PassengerId"].count()

In [8]:
survivedValues = titanic["Survived"].value_counts()
survived = survivedValues[1];
died = survivedValues[0];
100 * survived / passengerCount

38.38383838383838

In [9]:
classValues = titanic["Pclass"].value_counts()
classValues

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [10]:
100 * classValues[1] / passengerCount

24.242424242424242

In [11]:
meanAge = titanic["Age"].mean()
medianAge = titanic["Age"].median()
print(meanAge, medianAge)

29.69911764705882 28.0


In [12]:
titanic.corr(method='pearson')

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [13]:
female_df = titanic[titanic['Sex'] == 'female']
female_names_df = female_df['Name']
def selectFirstName(x):
    bracketIdx = x.find('(')
    # has brackets take first name from there
    if (bracketIdx > 0):
        return x[bracketIdx + 1:x.find(' ', bracketIdx)]

    dotIdx = x.find('. ')
    endingWordSpaceIdx = x.find(' ', dotIdx + 2)
    firstNameEndIdx = len(x)
    
    if (endingWordSpaceIdx > 0):
        firstNameEndIdx = endingWordSpaceIdx
    
    return x[dotIdx + 2:firstNameEndIdx]

female_first_names_df = female_names_df.apply(selectFirstName)
female_first_names_df.value_counts()

Anna         15
Mary         14
Elizabeth    11
Margaret     10
Alice         6
             ..
Jeannie       1
Gretchen      1
Hileni        1
Simonne       1
Amelie        1
Name: Name, Length: 178, dtype: int64

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [31]:
tree_fd = titanic[['Pclass', 'Fare', 'Age', 'Sex', 'Survived']]
tree_without_nan_df = tree_fd.dropna()
y = tree_without_nan_df['Survived'];


In [67]:
sex_list = tree_without_nan_df['Sex'].map({'male': 1, 'female': 0 })
X = tree_without_nan_df[['Pclass', 'Fare', 'Age']]
X = X.assign(Sex=sex_list)
X

Unnamed: 0,Pclass,Fare,Age,Sex
0,3,7.2500,22.0,1
1,1,71.2833,38.0,0
2,3,7.9250,26.0,0
3,1,53.1000,35.0,0
4,3,8.0500,35.0,1
...,...,...,...,...
885,3,29.1250,39.0,0
886,2,13.0000,27.0,1
887,1,30.0000,19.0,0
889,1,30.0000,26.0,1


In [61]:
y

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    0
887    1
889    1
890    0
Name: Survived, Length: 714, dtype: int64

In [68]:
clf = DecisionTreeClassifier(random_state=241)
clf.fit(X, y)

DecisionTreeClassifier(random_state=241)

In [69]:
importances = clf.feature_importances_
importances

array([0.14000522, 0.30343647, 0.2560461 , 0.30051221])