In [41]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [42]:
data = pd.read_csv("../datasets/titanic/train.csv")

In [43]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [44]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [45]:
mod = data.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked'], axis=1)

In [46]:
mod.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
dtype: object

In [47]:
mod["Sex"] = LabelEncoder().fit_transform(mod["Sex"])

In [48]:
mod.dtypes

Survived      int64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
dtype: object

In [49]:
mod.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,1,22.0,1,0
1,1,1,0,38.0,1,0
2,1,3,0,26.0,0,0
3,1,1,0,35.0,1,0
4,0,3,1,35.0,0,0


In [50]:
mod.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
count,891.0,891.0,891.0,714.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.699118,0.523008,0.381594
std,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057
min,0.0,1.0,0.0,0.42,0.0,0.0
25%,0.0,2.0,0.0,20.125,0.0,0.0
50%,0.0,3.0,1.0,28.0,0.0,0.0
75%,1.0,3.0,1.0,38.0,1.0,0.0
max,1.0,3.0,1.0,80.0,8.0,6.0


In [51]:
mod_full = mod.dropna()

mod_blank = mod.loc[mod["Age"].isna()]

table = mod_full.groupby(["Sex", "Pclass"]).mean()[["Age"]].reset_index()

In [52]:
row = mod_blank.iloc[0]

table.loc[(table["Sex"] == row["Sex"]) &(table["Pclass"] == row["Pclass"])].iloc[0]["Age"]

26.507588932806325

In [53]:
def fillAge(row):
    row["Age"] = table.loc[(table["Sex"] == row["Sex"]) &(table["Pclass"] == row["Pclass"])].iloc[0]["Age"]
    return row

mod_filled = mod_blank.apply(fillAge, axis=1)

In [54]:
total = pd.concat([mod_full, mod_filled])

In [55]:
X = total.drop(["Survived"], axis=1)
y = total["Survived"]

In [56]:
X.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch'], dtype='object')

In [57]:
from sklearn.tree import DecisionTreeClassifier

In [58]:
model = DecisionTreeClassifier()

In [59]:
model.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [60]:
test_data = pd.read_csv("../datasets/titanic/test.csv")

In [61]:
test_mod = test_data.drop(['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin', 'Embarked'], axis=1)

In [62]:
test_mod["Sex"] = LabelEncoder().fit_transform(test_mod["Sex"])

In [63]:
test_mod.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch
count,418.0,418.0,332.0,418.0,418.0
mean,2.26555,0.636364,30.27259,0.447368,0.392344
std,0.841838,0.481622,14.181209,0.89676,0.981429
min,1.0,0.0,0.17,0.0,0.0
25%,1.0,0.0,21.0,0.0,0.0
50%,3.0,1.0,27.0,0.0,0.0
75%,3.0,1.0,39.0,1.0,0.0
max,3.0,1.0,76.0,8.0,9.0


In [64]:
test_mod_full = test_mod.dropna()

test_mod_blank = test_mod.loc[mod["Age"].isna()]

test_table = test_mod_full.groupby(["Sex", "Pclass"]).mean()[["Age"]].reset_index()

In [65]:
def fillAge(row):
    row["Age"] = test_table.loc[(test_table["Sex"] == row["Sex"]) &(test_table["Pclass"] == row["Pclass"])].iloc[0]["Age"]
    return row

test_mod_filled = test_mod_blank.apply(fillAge, axis=1)

In [66]:
test_total = pd.concat([test_mod_full, test_mod_filled])

In [67]:
survival = model.predict(test_total)

In [68]:
result = pd.DataFrame(survival)

In [69]:
result.shape

(415, 1)