# Predicting Survival in the Titanic Data Set
 using a decision tree to make predictions about the Titanic data
set from Kaggle. This data set provides information on the Titanic
passengers and can be used to predict whether a passenger survived or
not.



## 1) Loading data and modules

In [1]:
#  Import sme required libraries
import numpy as np      # Linear algebra
import pandas as pd     # data Processing

#  Libraries for visualization
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline

# import for mapping the categorical variable
from sklearn.preprocessing import LabelEncoder

# Import scikit_learn module to split the dataset into train.test sub-datasets
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import r2_score

from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_curve, auc

In [2]:
titanic = pd.read_csv("https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv") #read the data

In [3]:
titanic.head() # print  first  5 record 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic.columns #print the columns name from the dataset

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# 2) Process the data

In [5]:
titanic.isnull().sum() # to find the nan values

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
titanic.Age = titanic.Age.fillna(titanic.Age.mean())     # fill the missing values in Age column with fillna ()

In [7]:
print(titanic.Age.isna().sum()) #check wheather values is missing or not

0


In [8]:
titanic = titanic.drop([ 'PassengerId','Name','Ticket', 'Cabin', 'Embarked'], axis = 1)

In [9]:
titanic.info()    # to check object datatype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [10]:

lb = LabelEncoder()

In [11]:
titanic['Sex'] = lb.fit_transform (titanic['Sex'])

In [12]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int32
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


## Split the data¶

In [13]:
X = titanic.drop(['Survived'],axis = 1)
y = titanic['Survived']

In [14]:
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size =0.2)

In [15]:
# Take a look at the shape
X.shape, y.shape

((891, 6), (891,))

# 3)Train the model

#### Decision Tree Classification

In [16]:
model = DecisionTreeClassifier()

In [32]:
model.fit(Xtrain,ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

## 4) Testing the model 

In [33]:
model.score(Xtest,ytest)*100
y_train_pred = model.predict(Xtrain)
y_test_pred = model.predict(Xtest)

#Calculate the accuracy 

train_accuracy = accuracy_score(ytrain, y_train_pred)
test_accuracy = accuracy_score(ytest, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)

The training accuracy is 0.9845505617977528
The test accuracy is 0.7653631284916201


## Improving the model

Ok, high training accuracy and a lower testing accuracy. We may be overfitting a bit.

So now it's your turn to shine! Train a new model, and try to specify some parameters in order to improve the testing accuracy, such as:

max_depth
min_samples_leaf
min_samples_split
You can use your intuition, trial and error, or even better, feel free to use Grid Search!

Challenge: Try to get higher accuracy on the testing set. 

In [41]:
# Use grid search to improve this model
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# 1.Create the parameters list
parameters = {'max_depth':[2,4,6,8,10],'min_samples_leaf':[2,4,6,8,10], 'min_samples_split':[2,4,6,8,10]}

# 2.Make an f1_score scoring object.
scorer = make_scorer(f1_score)

# 3.Perform grid search on the classifier using 'scorer' as the scoring method.
grid_obj = GridSearchCV(model, parameters, scoring = scorer)

# 4.Fit the grid search object to the training data and find the optimal parameters.
grid_fit = grid_obj.fit(Xtrain, ytrain)

# 5.Get the estimator.
best_clf = grid_fit.best_estimator_



In [42]:
best_clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [43]:
# TODO: Train the model
best_clf.fit(Xtrain, ytrain)

# TODO: Make predictions
y_train_pred = best_clf.predict(Xtrain)
y_test_pred = best_clf.predict(Xtest)

# TODO: Calculate the accuracy
train_accuracy = accuracy_score(ytrain, y_train_pred)
test_accuracy = accuracy_score(ytest, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)

The training accuracy is 0.8890449438202247
The test accuracy is 0.8100558659217877


Accuracy is improve after the hyper tuning parameter