# DECISION TREE IMPLEMENTATION ON TITANIC DATASET

### Exploring the Titanic passenger dataset and, based on a decision-tree model, predicting each passenger's chance of survival

#### Importing libraries and loading dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [39]:
# Read the passenger records from the CSV file located next to this notebook.
titanic = pd.read_csv(filepath_or_buffer="titanic.csv")

In [40]:
# Preview the loaded frame; pandas abbreviates long frames to their first and last rows.
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Data cleaning
- removing useless features
- labeling/one hot encoding
- handling missing values

In [41]:
# Remove the columns that are not used as model features.
# The name is re-bound instead of dropping in place, and errors="ignore" is
# passed, so the cell can be re-run safely: the original in-place drop raised
# KeyError on a second execution because the columns were already gone.
titanic = titanic.drop(
    columns=['PassengerId', 'SibSp', 'Parch', 'Name', 'Ticket', 'Cabin', 'Embarked'],
    errors="ignore",
)

In [42]:
# Confirm which columns remain after the drop.
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.2500
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.9250
3,1,1,female,35.0,53.1000
4,0,3,male,35.0,8.0500
...,...,...,...,...,...
886,0,2,male,27.0,13.0000
887,1,1,female,19.0,30.0000
888,0,3,female,,23.4500
889,1,1,male,26.0,30.0000


In [43]:
# Inspect dtypes and non-null counts to find columns with missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   Fare      891 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


In [44]:
# Frequency of each distinct Pclass value.
titanic["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [45]:
# Frequency of each distinct Sex value.
titanic["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

Handling missing data in the Age column of the dataset (missing ages are filled with the column mean)

In [47]:
# Replace missing ages with the mean of the observed ages.
# NOTE(review): the mean is computed over all rows, before the train/test
# split further down, so held-out rows influence the imputed value.
titanic["Age"] = titanic["Age"].fillna(value=titanic["Age"].mean())

In [48]:
# Re-inspect the frame after filling the missing ages.
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.000000,7.2500
1,1,1,female,38.000000,71.2833
2,1,3,female,26.000000,7.9250
3,1,1,female,35.000000,53.1000
4,0,3,male,35.000000,8.0500
...,...,...,...,...,...
886,0,2,male,27.000000,13.0000
887,1,1,female,19.000000,30.0000
888,0,3,female,29.699118,23.4500
889,1,1,male,26.000000,30.0000


In [49]:
# Verify that Age no longer contains nulls.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   Fare      891 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


Label-encoding the Sex column (male → 1, female → 2)

In [50]:
# Label-encode Sex as integers (male -> 1, female -> 2).
# Guarded so that re-running the cell is a no-op: calling .map() a second time
# on the already-encoded column would find no matching keys and turn every
# value into NaN.
if not pd.api.types.is_numeric_dtype(titanic["Sex"]):
    titanic["Sex"] = titanic["Sex"].map({'male': 1, 'female': 2})

In [51]:
# Spot-check 10 random rows; the seed makes the preview identical on every run.
titanic.sample(10, random_state=42)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
91,0,3,1,20.0,7.8542
614,0,3,1,35.0,8.05
35,0,1,1,42.0,52.0
159,0,3,1,29.699118,69.55
255,1,3,2,29.0,15.2458
575,0,3,1,19.0,14.5
809,1,1,2,33.0,53.1
149,0,2,1,42.0,13.0
61,1,1,2,38.0,80.0
306,1,1,2,29.699118,110.8833


### Dividing the columns into inputs and target

In [66]:
# Separate the label from the features. The target is selected by name rather
# than by position so the split does not silently break if the column order
# of the frame ever changes.
targets = titanic["Survived"]
inputs = titanic.drop(columns=["Survived"])

In [67]:
# Spot-check 15 random feature rows; seeded so the preview is reproducible.
inputs.sample(15, random_state=42)

Unnamed: 0,Pclass,Sex,Age,Fare
863,3,2,29.699118,69.55
743,3,1,24.0,16.1
120,2,1,21.0,73.5
227,3,1,20.5,7.25
62,1,1,45.0,83.475
732,2,1,29.699118,0.0
110,1,1,47.0,52.0
855,3,2,18.0,9.35
542,3,2,11.0,31.275
660,1,1,50.0,133.65


In [68]:
# Spot-check 15 random labels; seeded so the preview is reproducible.
targets.sample(15, random_state=42)

459    0
118    0
419    0
106    1
400    1
697    1
821    1
745    0
219    0
372    0
242    0
831    1
82     1
348    1
95     0
Name: Survived, dtype: int64

## Train test split

In [69]:
from sklearn.model_selection import train_test_split

In [70]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    inputs,
    targets,
    test_size=0.2,
    random_state=42,
)

In [71]:
# Print the shape of each split, one per line, in the order:
# train features, train labels, test features, test labels.
print(*(part.shape for part in (train_x, train_y, test_x, test_y)), sep="\n")

(712, 4)
(712,)
(179, 4)
(179,)


## Decision tree classification using gini index

In [72]:
from sklearn.tree import DecisionTreeClassifier

In [73]:
# Fix random_state so the fitted tree (and every score derived from it) is
# reproducible; scikit-learn permutes the features at each split, so ties
# between equally good splits are otherwise broken differently from run to run.
dt_gini = DecisionTreeClassifier(criterion='gini', random_state=42)

In [74]:
# Train the Gini tree on the training split.
dt_gini.fit(X=train_x, y=train_y)

DecisionTreeClassifier()

In [75]:
# Predicted labels for the held-out rows, from the Gini tree.
pred_y = dt_gini.predict(X=test_x)

In [76]:
# Show the predicted labels for the test split.
pred_y

array([0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1], dtype=int64)

In [77]:
# Show the true labels of the test split for comparison with the predictions.
test_y

709    1
439    0
840    0
720    1
39     1
      ..
433    0
773    0
25     1
84     1
10     1
Name: Survived, Length: 179, dtype: int64

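### Calculation of mean squared error and R² score

For 0/1 class labels the mean squared error equals the misclassification rate (1 − accuracy), and R² is a regression metric that is hard to interpret for a classifier; the accuracy returned by `score` below is the more meaningful figure.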

In [78]:
from sklearn.metrics import mean_squared_error, r2_score

In [79]:
# Mean squared error between true and predicted labels (Gini tree).
# With 0/1 labels this is the fraction of misclassified test rows.
mean_squared_error(y_true=test_y, y_pred=pred_y)

0.24022346368715083

In [80]:
# R² of the Gini tree's predictions against the true labels.
r2_score(y_true=test_y, y_pred=pred_y)

0.009395109395109347

In [83]:
# Mean accuracy of the Gini tree on the test split.
dt_gini.score(X=test_x, y=test_y)

0.7597765363128491

## Decision tree classification using entropy

In [84]:
# Fix random_state so the fitted tree (and every score derived from it) is
# reproducible; scikit-learn permutes the features at each split, so ties
# between equally good splits are otherwise broken differently from run to run.
dt_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)

In [85]:
# Train the entropy tree on the training split.
dt_entropy.fit(X=train_x, y=train_y)

DecisionTreeClassifier(criterion='entropy')

In [87]:
# Predicted labels for the held-out rows, from the entropy tree.
y_pred = dt_entropy.predict(X=test_x)

In [88]:
# Show the entropy tree's predicted labels for the test split.
y_pred

array([0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1], dtype=int64)

In [89]:
# Show the true labels of the test split again for comparison with the predictions.
test_y

709    1
439    0
840    0
720    1
39     1
      ..
433    0
773    0
25     1
84     1
10     1
Name: Survived, Length: 179, dtype: int64

In [90]:
# Mean squared error between true and predicted labels (entropy tree).
# With 0/1 labels this is the fraction of misclassified test rows.
mean_squared_error(y_true=test_y, y_pred=y_pred)

0.24581005586592178

In [91]:
# R² of the entropy tree's predictions against the true labels.
r2_score(y_true=test_y, y_pred=y_pred)

-0.01364221364221363

In [92]:
# Mean accuracy of the entropy tree on the test split.
dt_entropy.score(X=test_x, y=test_y)

0.7541899441340782