# Scikit-learn Decision Tree Model (Titanic Passenger Dataset)

This uses a variation of the [Titanic dataset](course_datasets.md#titanic).

This example is loosely based on a YouTube video by Mosh [here](https://www.youtube.com/watch?v=7eh4d6sabA0).

In [4]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Location of the Titanic passenger CSV used throughout this notebook
titanic_url = 'https://raw.githubusercontent.com/MarkWilcock/CourseDatasets/main/Misc%20Datasets/Titanic%20Passenger.csv'

# Load the CSV into a DataFrame, then preview its first five rows
df = pd.read_csv(filepath_or_buffer=titanic_url)
df.head(n=5)

Unnamed: 0,Passenger Id,Survival,Surname,Other Names,Title,Passenger Class,Gender,Embarked,FareBand,FamilySize,Age (bins),Age,Adult Or Child,Is Age Missing
0,1,Died,Brewe,Arthur Jackson,Dr,1st,male,Cherbourg,30 - above,1,,,Not Known,Missing
1,1,Survived,Fleming,Margaret,Miss,1st,female,Cherbourg,30 - above,1,,,Not Known,Missing
2,1,Died,Hoyt,William Fisher,Mr,1st,male,Cherbourg,30 - above,1,,,Not Known,Missing
3,1,Died,Lewy,Ervin G,Mr,1st,male,Cherbourg,20 - 30,1,,,Not Known,Missing
4,1,Survived,Marechal,Pierre,Mr,1st,male,Cherbourg,20 - 30,1,,,Not Known,Missing


In [6]:
# Source column -> short snake_case name, for the columns the model will use
column_names = {
    'Survival': 'survival',
    'Title': 'title',
    'Passenger Class': 'class',
    'Gender': 'gender',
    'Embarked': 'embarked',
    'Adult Or Child': 'adult_or_child',
}

# Keep only those columns (in this order) and apply the new names
df_slim = df[list(column_names)].rename(columns=column_names)
df_slim.head()

Unnamed: 0,survival,title,class,gender,embarked,adult_or_child
0,Died,Dr,1st,male,Cherbourg,Not Known
1,Survived,Miss,1st,female,Cherbourg,Not Known
2,Died,Mr,1st,male,Cherbourg,Not Known
3,Died,Mr,1st,male,Cherbourg,Not Known
4,Survived,Mr,1st,male,Cherbourg,Not Known


In [7]:
# Feature columns only: everything in df_slim apart from the target column
X_show = df_slim.drop(columns=['survival'])
X_show.head()

Unnamed: 0,title,class,gender,embarked,adult_or_child
0,Dr,1st,male,Cherbourg,Not Known
1,Miss,1st,female,Cherbourg,Not Known
2,Mr,1st,male,Cherbourg,Not Known
3,Mr,1st,male,Cherbourg,Not Known
4,Mr,1st,male,Cherbourg,Not Known


In [8]:
# Binary target: 1 where survival is 'Survived', 0 for any other value
y = df_slim['survival'].eq('Survived').astype('int64')
y.head()

0    0
1    1
2    0
3    0
4    1
Name: survival, dtype: int64

In [9]:
# One-hot encode the categorical feature columns.
# By convention the feature matrix is named X and the labels y.
X = pd.get_dummies(data=X_show)
X

Unnamed: 0,title_Capt,title_Col,title_Don,title_Dr,title_Jonkheer,title_Lady,title_Major,title_Master,title_Miss,title_Mlle,...,class_2nd,class_3rd,gender_female,gender_male,embarked_Cherbourg,embarked_Queenstown,embarked_Southampton,adult_or_child_Adult,adult_or_child_Child,adult_or_child_Not Known
0,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,True
1,False,False,False,False,False,False,False,False,True,False,...,False,False,True,False,True,False,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,...,False,True,False,True,False,True,False,True,False,False
887,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,True,False,False
888,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,True,False,False
889,False,False,False,False,False,False,False,False,False,False,...,False,True,False,True,False,False,True,True,False,False


In [10]:
# Hold back 20% of the rows for testing.
# random_state pins the shuffle, so the same rows land in train/test on every
# run and the accuracy reported further down is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show the shapes of the four pieces as a quick sanity check
(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

(((712, 28), (712,)), ((179, 28), (179,)))

In [12]:
# Create the output folder first: writing the model file fails if it is missing.
Path('outputs').mkdir(parents=True, exist_ok=True)

# A shallow tree (at most 4 levels). random_state makes the fitted tree
# repeatable from one run to the next.
model = DecisionTreeClassifier(max_depth=4, random_state=42)
model.fit(X_train, y_train)

# Persist the fitted model; the cell output is the list of files written.
joblib.dump(model, 'outputs/titanic_model.pkl')

['outputs/titanic_model.pkl']

In [13]:
# Predict the label for each row of the held-out test set
predictions = model.predict(X=X_test)

In [14]:
# Fraction of test rows whose predicted label matches the true label
accuracy_score(y_true=y_test, y_pred=predictions)

0.7932960893854749

Install an extension such as Graphviz Interactive Preview to view the decision tree model.

In [16]:
# Write the fitted tree to a Graphviz .dot file.
# Create the output folder first so the export does not fail if it is missing.
Path('outputs').mkdir(parents=True, exist_ok=True)

tree.export_graphviz(
    model,
    out_file='outputs/titanic_tree.dot',
    # Pass a plain list: a pandas Index is not accepted by every scikit-learn release.
    feature_names=list(X.columns),
    # Listed in ascending order of the encoded target (0 = Died, 1 = Survived).
    class_names=['Died', 'Survived'],
    label='all',
    rounded=True,
    filled=True,
)
