In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

import os
# Show every file available under the Kaggle input directory, one path per
# line, in the order os.walk visits them.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

print("Setup complete")

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv
Setup complete


In [2]:
# Load the Titanic training and test sets.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Use the average age where age is unknown. The mean is computed once, from
# the training set, and applied to both frames so that train and test are
# preprocessed with the same value and nothing is estimated from test rows.
train_age_mean = train_data["Age"].mean()
train_data["Age"] = train_data["Age"].fillna(train_age_mean)
test_data["Age"] = test_data["Age"].fillna(train_age_mean)

# Rows with an unknown embarkation port are kept rather than dropped; the
# missing value becomes its own 'Unknown' category in the fill below.

# Fill the remaining gaps in non-numeric columns with 'Unknown'.
# Numeric columns are deliberately skipped: writing a string into one of them
# would silently turn the whole column into object dtype.
train_data = train_data.fillna(
    dict.fromkeys(train_data.select_dtypes(exclude="number").columns, "Unknown")
)
test_data = test_data.fillna(
    dict.fromkeys(test_data.select_dtypes(exclude="number").columns, "Unknown")
)

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S


In [3]:
train_data.dtypes

# Columns with an object dtype must be converted to a numerical
# representation before they can be used as model features.

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [4]:
# Set variables
y = train_data.Survived   # variable to predict
titanic_features = ['Pclass','Sex','Age','SibSp','Parch','Embarked']

# Convert features with categorical data to numerical (one-hot encoding).
X = pd.get_dummies(train_data[titanic_features])
test_X = pd.get_dummies(test_data[titanic_features])

# Align the test columns with the training columns. Reindexing on X.columns
# adds any dummy column that only occurs in the training data (filled with 0),
# drops any column the model was never trained on, and guarantees the same
# column order, instead of hard-coding a single missing column by name.
test_X = test_X.reindex(columns=X.columns, fill_value=0)

# model validation split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

test_X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_Unknown
0,3,34.5,0,0,False,True,False,True,False,0
1,3,47.0,1,0,True,False,False,False,True,0
2,2,62.0,0,0,False,True,False,True,False,0
3,3,27.0,0,0,False,True,False,False,True,0
4,3,22.0,1,1,True,False,False,False,True,0


In [5]:
# Baseline: fit an unconstrained decision tree on the training split.
# random_state is fixed, as for every other model in this notebook, so that
# re-running the cell reproduces the same tree and the same score.
titanic_model = DecisionTreeRegressor(random_state=1)
titanic_model.fit(train_X, train_y)

# Predict on the held-out validation split.
pred_val = titanic_model.predict(val_X)

# Mean absolute error between the validation labels and the predictions.
mae = mean_absolute_error(val_y, pred_val)
print(mae)

0.26211607834462325


In [6]:
# Deciding leaf nodes (over vs underfitting exercise)

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a tree limited to ``max_leaf_nodes`` leaves.

    A DecisionTreeRegressor (random_state=1) is fitted on ``train_X`` /
    ``train_y``; its predictions for ``val_X`` are compared with ``val_y``
    using mean absolute error.
    """
    tree = DecisionTreeRegressor(random_state=1, max_leaf_nodes=max_leaf_nodes)
    val_predictions = tree.fit(train_X, train_y).predict(val_X)
    return mean_absolute_error(val_y, val_predictions)

In [7]:
# MAE values for various leaf nodes

leaf_nodes = [5, 10, 20, 30, 40, 50]

# Validation MAE of each candidate tree size, keyed by max_leaf_nodes.
mae_by_leaf_nodes = {}

for max_leaf_nodes in leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    mae_by_leaf_nodes[max_leaf_nodes] = my_mae
    print(f"Leaf nodes: {max_leaf_nodes}   \t MAE: {my_mae:.4f}")

# Select the size with the lowest validation MAE from the measurements above
# rather than copying it by hand from the printed table, so the choice stays
# correct if the data or the candidate list changes. On a tie the earliest
# (smallest) candidate wins.
best_nodes = min(mae_by_leaf_nodes, key=mae_by_leaf_nodes.get)

Leaf nodes: 5   	 MAE: 0.2920
Leaf nodes: 10   	 MAE: 0.2679
Leaf nodes: 20   	 MAE: 0.2586
Leaf nodes: 30   	 MAE: 0.2551
Leaf nodes: 40   	 MAE: 0.2421
Leaf nodes: 50   	 MAE: 0.2528


In [8]:
# Final model: same tree size as selected above, refitted on the complete
# training set (training + validation rows).
final_model = DecisionTreeRegressor(random_state=1, max_leaf_nodes=best_nodes).fit(X, y)

# The regressor outputs a value per passenger; rounding it gives the 0/1 label
# stored in the new 'Survived' column of the test frame.
test_scores = final_model.predict(test_X)
test_data['Survived'] = test_scores.round()

test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Unknown,Q,0.0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,Unknown,S,0.0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Unknown,Q,0.0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,Unknown,S,0.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,Unknown,S,1.0


In [9]:
# Write the submission file: the rounded predictions are stored as floats, so
# cast them to integers first, then export only the id and the prediction.
test_data['Survived'] = test_data['Survived'].astype('int64')

submission_columns = ['PassengerId', 'Survived']
final_prediction = test_data[submission_columns]
final_prediction.to_csv('submission.csv', index=False)