# Using Decision Trees on a dataset of Titanic passengers

In [1]:
# import necessary libs 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the train and test CSV files into DataFrames (paths are relative to the notebook)
TRAIN_CSV = r"datasets/titanic/train-data.csv"
TEST_CSV = r"datasets/titanic/test-data.csv"
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [3]:
# Quick overview of the data: dimensions of both sets, column labels, first rows
for split_name, frame in (('training', train_data), ('testing', test_data)):
    print(f'Shape of {split_name} data : {frame.shape}')
print(train_data.columns)
train_data.head()

Shape of training data : (712, 25)
Shape of testing data : (179, 25)
Index(['Survived', 'Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3',
       'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2',
       'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')


Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,28.5,7.2292,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,1,0,0
1,1,27.0,10.5,0,1,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
2,1,29.699118,16.1,0,0,1,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
3,0,29.699118,0.0,1,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
4,0,17.0,8.6625,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


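# We want better insight into who -Survived-
## To simplify computation we could keep only the Age, Fare and Sex columns; by default the cell below keeps all feature columns (the reduced selections are left as commented-out alternatives)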

In [4]:
# Target variable is the "Survived" column >> y
# Separate the target from the other columns for both sets
train_y = train_data['Survived']
test_y = test_data['Survived']

# FOR X SETS
# Keep all feature columns (careful with computation time).
# `columns=` already selects the column axis, so no extra `axis=1` is passed.
train_x = train_data.drop(columns=['Survived'])
test_x = test_data.drop(columns=['Survived'])

# Alternative: keep only a few columns. Uncomment ONE matching train/test pair
# below (and comment out the two `drop` lines above) to use a reduced feature set.
#train_x = train_data[['Age','Fare','Sex_female','Sex_male']]
#test_x = test_data[['Age','Fare','Sex_female','Sex_male']]

#train_x = train_data[['Age','Fare']]
#test_x = test_data[['Age','Fare']]

## base model, no parameters modified

In [5]:
# Base decision tree with default hyperparameters.
# random_state is pinned because scikit-learn randomly permutes the features at
# each split; without a seed the fitted tree (and therefore its depth and the
# accuracy scores below) can differ from one run to the next.
model = DecisionTreeClassifier(random_state=42)

In [6]:
# Train the tree on the training features and their labels
model.fit(X=train_x, y=train_y)

DecisionTreeClassifier()

In [7]:
# Report how deep the fitted tree grew
tree_depth = model.get_depth()
print(f'Depth of the Decision Tree : {tree_depth}')

Depth of the Decision Tree : 19


In [8]:
# Predict on the same data the model was trained on
predict_train = model.predict(train_x)
# Uncomment to inspect the raw predictions:
#print('Target on train data',predict_train)

# Fraction of training rows whose predicted label matches the true label
accuracy_train = accuracy_score(y_true=train_y, y_pred=predict_train)
print(f'accuracy_score on train dataset :  {accuracy_train}')

accuracy_score on train dataset :  0.9859550561797753


In [9]:
# Predict on the held-out test data
predict_test = model.predict(test_x)
# Uncomment to inspect the raw predictions:
#print('Target on test data',predict_test)

# Accuracy score on the test data
accuracy_test = accuracy_score(y_true=test_y, y_pred=predict_test)
print(f'accuracy_score on test dataset :  {accuracy_test}')

accuracy_score on test dataset :  0.7932960893854749


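# Model is overfitting: training accuracy (~0.99) is far higher than test accuracy (~0.79), and the unconstrained tree grew to depth 19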