Decision Tree on the Titanic Dataset

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

  import pandas.util.testing as tm


In [None]:
# Load the Titanic training set straight from GitHub into a DataFrame.
# This cell needs network access to run.
url= "https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"
titanic = pd.read_csv(url)

In [None]:
# Assign column names by position. This relies on the CSV having exactly these
# 12 columns in this order; pandas raises ValueError on a length mismatch.
# NOTE(review): if the file header already carries these names this cell is
# redundant — confirm against the CSV.
titanic.columns = ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']

In [None]:
# Discard the identifier / free-text columns that are not used as model inputs.
# Rebinding the result (instead of inplace=True) is the form pandas recommends:
# it has no performance cost and keeps the operation chainable.
titanic = titanic.drop(columns=['PassengerId','Name','Ticket','Cabin','Embarked'])

In [None]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


**Data PreProcessing**

In [None]:
titanic.shape

(891, 7)

In [None]:
titanic.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
dtype: object

In [None]:
titanic.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
titanic.describe(include=['O'])

Unnamed: 0,Sex
count,891
unique,2
top,male
freq,577


**EDA**

In [None]:
#Checking for missing values
titanic.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64

In [None]:
# Age has 177 missing values: impute them with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the titanic['Age'] selection: under pandas
# Copy-on-Write that selection is a temporary object, so an in-place fill
# would not reach `titanic`.
# median() skips NaN by default; the previous skipna='True' passed a string
# where a bool is expected, which newer pandas versions reject.
age_median = titanic['Age'].median()
titanic['Age'] = titanic['Age'].fillna(age_median)

In [None]:
titanic.isna().sum()
#Now we can see that the missing Age values have been replaced with the median

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [None]:
# One-hot encode Sex. drop_first=True drops the first category, leaving a
# single 'male' indicator column.
# dtype=int keeps the indicator as 0/1 on every pandas version; since
# pandas 2.0 the default dtype of get_dummies is bool.
sex_encode = pd.get_dummies(titanic['Sex'], drop_first=True, dtype=int)

In [None]:
# Append the encoded indicator column(s) from sex_encode to the frame, side by side.
# NOTE: this cell is not idempotent — `titanic` is rebound to the widened
# frame, so running it a second time appends a duplicate indicator column.
titanic = pd.concat((titanic,sex_encode),axis=1)

In [None]:
# Target vector: the Survived flag.
y = titanic['Survived']
# Feature matrix: everything except the target and the raw 'Sex' text column
# (its encoded counterpart stays in the frame).
X = titanic.drop(columns=['Survived', 'Sex'])

In [None]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,male
0,3,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,0
2,3,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,0
4,3,35.0,0,0,8.05,1


__Since the features to model on were already provided, I did not perform any feature selection; and because the model is a decision tree, which is insensitive to feature scale, I did not apply any data transformation or scaling.__

### Hyperparameter tuning

In [None]:
# Base estimator handed to the hyper-parameter search.
# random_state is pinned so repeated runs build the same tree: scikit-learn
# permutes the candidate features at every split, so an unseeded tree can
# differ between runs even on identical data.
dt = DecisionTreeClassifier(random_state=42)

In [None]:
# Search space for the randomised hyper-parameter search.
params = dict(
    criterion=["gini", "entropy"],        # impurity measure
    splitter=["best", "random"],          # split selection strategy
    max_depth=range(1, 20, 2),            # odd depths 1..19
    min_samples_split=range(2, 10),       # 2..9 samples required to split a node
)

In [None]:
# Randomised search over `params` with 5-fold cross-validation on all cores
# (n_jobs=-1). n_iter is left at its default, so only a sample of the grid
# is evaluated.
# random_state pins which candidate combinations are drawn, so the search
# evaluates the same candidates on every run.
rs = RandomizedSearchCV(
    estimator=dt,
    param_distributions=params,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Run the search. fit() returns the fitted search object itself, so
# `random_results` refers to the same object as `rs`.
# NOTE(review): the search is fitted on the full X, y, while the train/test
# split only happens in a later cell. The rows that end up in the test set
# have therefore already influenced the chosen hyper-parameters, so the test
# metrics reported further down are optimistic. Consider splitting first and
# searching on the training portion only.
random_results = rs.fit(X,y)

In [None]:
random_results.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=9, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

### Using parameter from randomised search for model training

In [None]:
# Hold out 20% of the rows for testing.
# random_state makes the split repeatable across runs; stratify=y keeps the
# survived / not-survived ratio the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
X_train.shape

(712, 6)

In [None]:
X_test.shape

(179, 6)

In [None]:
# Refit the tuned tree on the training split only.
# fit() returns the estimator itself, so `dtc` is the same object as
# random_results.best_estimator_ — that attribute now holds the model
# trained on X_train, not the one refitted by the search.
dtc = random_results.best_estimator_.fit(X_train,y_train)

### Training Accuracy

In [None]:
y_train_pred = dtc.predict(X_train)

In [None]:
# Confusion matrix on the training split: rows are the true classes,
# columns are the predicted classes (scikit-learn convention).
print(confusion_matrix(y_train,y_train_pred))

[[426  15]
 [ 61 210]]


In [None]:
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92       441
           1       0.93      0.77      0.85       271

    accuracy                           0.89       712
   macro avg       0.90      0.87      0.88       712
weighted avg       0.90      0.89      0.89       712



### Testing Accuracy

In [None]:
y_test_pred = dtc.predict(X_test)

In [None]:
# Confusion matrix on the held-out test split: rows are the true classes,
# columns are the predicted classes (scikit-learn convention).
print(confusion_matrix(y_test,y_test_pred))

[[95 13]
 [26 45]]


In [None]:
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83       108
           1       0.78      0.63      0.70        71

    accuracy                           0.78       179
   macro avg       0.78      0.76      0.76       179
weighted avg       0.78      0.78      0.78       179

