In [1]:
"""An Example of a DNNClassifier for the Iris dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

# Titanic Training and Test data set extraction

In [2]:
titanic_train_data =  pd.read_csv('titanic_train.csv')
titanic_train_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
print(titanic_train_data['Name'].size)
print(len(titanic_train_data['Name'].unique()))

print(titanic_train_data['Sex'].size)
print(len(titanic_train_data['Sex'].unique()))

print(titanic_train_data['Ticket'].size)
print(len(titanic_train_data['Ticket'].unique()))

print(titanic_train_data['Cabin'].size)
print(len(titanic_train_data['Cabin'].unique()))

print(titanic_train_data['Embarked'].size)
print(len(titanic_train_data['Embarked'].unique()))

891
891
891
2
891
681
891
148
891
4


In [4]:
#Remove non-important details from the dataset
titanic_train_data = titanic_train_data.drop(['PassengerId','Name','Ticket'], axis=1)
#titanic_train_data = titanic_train_data.drop(['PassengerId','Name','Sex','Ticket','Cabin','Embarked'], axis=1)
titanic_train_data.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [5]:
print('Size before filliing null values', titanic_train_data.shape)
titanic_train_data = titanic_train_data.fillna(titanic_train_data.mode().iloc[0])
print('Size after filling null values', titanic_train_data.shape)

#Get the features and Label for Train data
all_features =  titanic_train_data.drop(['Survived'], axis=1)
all_labels = titanic_train_data[['Survived']]

train_features, validation_features, train_labels, validation_labels = train_test_split(all_features, all_labels, test_size=0.33 )

print(train_features.head(5))
print(train_labels.head(5))

print(validation_features.head(5))
print(validation_labels.head(5))

print(train_features.shape, train_labels.shape, validation_features.shape, validation_labels.shape)


Size before filliing null values (891, 9)
Size after filling null values (891, 9)
     Pclass     Sex   Age  SibSp  Parch      Fare    Cabin Embarked
556       1  female  48.0      1      0   39.6000      A16        C
694       1    male  60.0      0      0   26.5500  B96 B98        S
458       2  female  50.0      0      0   10.5000  B96 B98        S
730       1  female  29.0      0      0  211.3375       B5        S
504       1  female  16.0      0      0   86.5000      B79        S
     Survived
556         1
694         0
458         1
730         1
504         1
     Pclass     Sex   Age  SibSp  Parch      Fare        Cabin Embarked
498       1  female  25.0      1      2  151.5500      C22 C26        S
285       3    male  33.0      0      0    8.6625      B96 B98        C
438       1    male  64.0      1      4  263.0000  C23 C25 C27        S
436       3  female  21.0      2      2   34.3750      B96 B98        S
492       1    male  55.0      0      0   30.5000          C30    

In [6]:
titanic_test_data =  pd.read_csv('titanic_test.csv')
titanic_test_data.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
print('Size before filling null values', titanic_test_data.shape)
titanic_test_data = titanic_test_data.fillna(titanic_test_data.mode().iloc[0])
print('Size after filling null values', titanic_test_data.shape)

test_features = titanic_test_data.drop(['PassengerId','Name','Ticket'], axis=1)
#test_features = titanic_test_data.drop(['PassengerId','Name','Sex','Ticket','Cabin','Embarked'], axis=1)

test_features.head(5)

Size before filling null values (418, 11)
Size after filling null values (418, 11)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,34.5,0,0,7.8292,B57 B59 B63 B66,Q
1,3,female,47.0,1,0,7.0,B57 B59 B63 B66,S
2,2,male,62.0,0,0,9.6875,B57 B59 B63 B66,Q
3,3,male,27.0,0,0,8.6625,B57 B59 B63 B66,S
4,3,female,22.0,1,1,12.2875,B57 B59 B63 B66,S


# Create the Neural Networks model and Train

## Added Indicator colums to include categories which are not numerical
#### This improved the results

In [9]:
def input_fn_train(features, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((dict(features),labels))
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
    #dataset.batch(batch_size)
    return dataset

feature_set = []
for feature in train_features.keys():
    if feature in ['Sex','Cabin' ,'Embarked']:
        categoryColumn =  tf.feature_column.categorical_column_with_vocabulary_list(key=feature, vocabulary_list= titanic_train_data[feature].unique())
        indicatorColumn = tf.feature_column.indicator_column(categoryColumn)
        feature_set.append(indicatorColumn)
    else:
        feature_set.append(tf.feature_column.numeric_column(key=feature))
    
#print(feature_set)

classifier = tf.estimator.DNNClassifier(hidden_units=[10, 10], feature_columns=feature_set, n_classes=2)
classifier.train(input_fn= lambda:input_fn_train(train_features, train_labels, 10), steps = 1000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_service': None, '_task_type': 'worker', '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_num_worker_replicas': 1, '_is_chief': True, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_save_summary_steps': 100, '_task_id': 0, '_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fb28b6339e8>, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmpaawc2gza', '_session_config': None, '_tf_random_seed': None, '_num_ps_replicas': 0}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpaawc2gza/model.ckpt.
INFO:tensorflow:loss = 25.86972, step = 1
INFO:tensorflow:global_step/sec: 180.632
INFO:tensorflow:loss = 6.169763, step = 101 (0.558 sec)
INFO:tensorflow:global_step/sec: 242.328
INFO:tensorflow:loss = 7.4599714, step = 201 (0.413 sec)
INFO:tensorflow:global_step/sec: 230.572
INFO:tensorflow:loss = 5

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7fb28b7940b8>

# Evaluation and Prediction

In [10]:
def evaluate_fn_train(features, labels, batch_size):
    if labels is None:
        temp = (dict(features))
    else:
        temp = (dict(features), labels)
    #print(temp)
    dataset = tf.data.Dataset.from_tensor_slices(temp)
    dataset = dataset.batch(batch_size)
    return dataset

evaluation_result = classifier.evaluate(input_fn= lambda:evaluate_fn_train(features=validation_features, labels=validation_labels, batch_size=10))

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**evaluation_result))

INFO:tensorflow:Starting evaluation at 2018-03-06-14:11:34
INFO:tensorflow:Restoring parameters from /tmp/tmpaawc2gza/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2018-03-06-14:11:36
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.8169491, accuracy_baseline = 0.61694914, auc = 0.86190796, auc_precision_recall = 0.8193956, average_loss = 0.43662214, global_step = 1000, label/mean = 0.38305086, loss = 4.293451, prediction/mean = 0.32680994

Test set accuracy: 0.817



In [12]:
predictions = classifier.predict(input_fn= lambda:evaluate_fn_train(features=test_features, labels=None, batch_size=10))

for predict in predictions:
    #predict['class_ids'][0]
    print(predict['class_ids'][0],predict['probabilities'][predict['class_ids'][0]])

INFO:tensorflow:Restoring parameters from /tmp/tmpaawc2gza/model.ckpt-1000
0 0.8886154
1 0.5517419
0 0.87909746
0 0.9006703
1 0.52259415
0 0.8382539
1 0.72432184
0 0.87574214
1 0.6344757
0 0.92474127
0 0.8928198
0 0.75110996
1 0.96240765
0 0.89025044
1 0.87946206
1 0.76899487
0 0.8167035
0 0.9051723
1 0.5696266
1 0.60035074
0 0.73354566
0 0.630848
1 0.919497
0 0.68898904
1 0.99038625
0 0.9347001
1 0.9404273
0 0.9092298
0 0.7078461
0 0.9349901
0 0.87775886
0 0.87665945
0 0.5262738
1 0.5047946
0 0.76304257
0 0.8901041
1 0.66997236
1 0.6727608
0 0.8979363
0 0.80981123
0 0.93171835
0 0.68980575
0 0.9170473
1 0.8045397
1 0.9535297
0 0.89995
0 0.7463152
0 0.8731879
1 0.9605033
1 0.5435662
0 0.6403101
0 0.84920067
1 0.64180255
1 0.97377366
0 0.8409676
0 0.8979747
0 0.9097667
0 0.89617324
0 0.9100276
1 0.99325
0 0.8649624
0 0.84226626
0 0.8726326
1 0.7302977
1 0.75460386
1 0.8360017
1 0.73516685
0 0.6838579
0 0.7279043
1 0.99122393
1 0.728259
0 0.8928198
1 0.66140264
0 0.72561836
1 0.9922666
1