In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from IPython.display import display

%matplotlib inline

import random 
random.seed(42)

# Load the Dataset
in_file = 'titanic_data.csv'
full_data = pd.read_csv(f"Datasets/{in_file}")

display(full_data.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
y = full_data['Survived']
X_raw = full_data.drop('Survived', axis=1)

display(X_raw.head())

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Preprocessing the Data

In [4]:
X_raw.select_dtypes(include='object').columns

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

In [5]:
X_raw.drop(columns=['Name', 'Fare', 'Cabin', 'Ticket'], inplace=True)

In [6]:
X = pd.get_dummies(X_raw)

In [7]:
X = X.fillna(0.0)
display(X.head())

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,0,1,0,0,1
1,2,1,38.0,1,0,1,0,1,0,0
2,3,3,26.0,0,0,1,0,0,0,1
3,4,1,35.0,1,0,1,0,0,0,1
4,5,3,35.0,0,0,0,1,0,0,1


### Training the Model

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

DecisionTreeClassifier()

### Testing the Model

In [11]:
# Making Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate the Accuracy
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'The Training Accuracy is: {train_accuracy:.2f}')
print(f'The Testing Accuracy is: {test_accuracy:.2f}')

The Training Accuracy is: 1.00
The Testing Accuracy is: 0.74


### Improving the Model

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Train the Model
param_grid = {'max_depth':np.arange(3,10), 'min_samples_leaf': np.arange(3,10), 'min_samples_split':np.arange(3, 10)}

# Define the Classifier, and Fit it to the Data
model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=3)
model.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': array([3, 4, 5, 6, 7, 8, 9]),
                         'min_samples_leaf': array([3, 4, 5, 6, 7, 8, 9]),
                         'min_samples_split': array([3, 4, 5, 6, 7, 8, 9])})

In [14]:
model.best_params_

{'max_depth': 6, 'min_samples_leaf': 6, 'min_samples_split': 4}

In [15]:
model.best_score_

0.8034015293881266

In [16]:
# Making Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [20]:
# Calculate the Accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"The Training Accuracy is : {train_accuracy*100:0.2f} %")
print(f"The Testing Accuracy is : {test_accuracy*100:0.2f} %")

The Training Accuracy is : 85.67 %
The Testing Accuracy is : 83.80 %
