## Building Predictive Models

In [1]:
import pandas as pd
import os
import numpy as np

### Import Data

In [2]:
# Load the train and test tables, using the PassengerId column as the row index
train_df, test_df = (
    pd.read_csv(csv_name, index_col='PassengerId')
    for csv_name in ('train_model.csv', 'test_model.csv')
)

In [3]:
# Summarise the training frame: column names, non-null counts and dtypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null float64
Deck_B                891 non-null float64
Deck_C                891 non-null float64
Deck_D                891 non-null float64
Deck_E                891 non-null float64
Deck_F                891 non-null float64
Deck_G                891 non-null float64
Deck_Z                891 non-null float64
Pclass_1              891 non-null float64
Pclass_2              891 non-null float64
Pclass_3              891 non-null float64
Title_Lady            891 non-null float64
Title_Master          891 non-null float64
Title_Miss            891 non-null float64
Title_Mr              891 non-null float64


In [4]:
# Preview the first rows of the test features
test_df.head()

Unnamed: 0_level_0,Age,Fare,FamilySize,IsMother,IsMale,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,...,Title_Sir,Fare_Bin_very_low,Fare_Bin_low,Fare_Bin_high,Fare_Bin_very_high,Embarked_C,Embarked_Q,Embarked_S,AgeState_Adult,AgeState_Child
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,34.5,7.8292,1,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
893,47.0,7.0,2,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
894,62.0,9.6875,1,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
895,27.0,8.6625,1,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
896,22.0,12.2875,3,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [5]:
# Summarise the test frame: column names, non-null counts and dtypes
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null float64
Deck_B                418 non-null float64
Deck_C                418 non-null float64
Deck_D                418 non-null float64
Deck_E                418 non-null float64
Deck_F                418 non-null float64
Deck_G                418 non-null float64
Deck_Z                418 non-null float64
Pclass_1              418 non-null float64
Pclass_2              418 non-null float64
Pclass_3              418 non-null float64
Title_Lady            418 non-null float64
Title_Master          418 non-null float64
Title_Miss            418 non-null float64
Title_Mr              418 non-null float64
Title_Mrs             418 non-null flo

### Data Preparation

In [14]:
# Feature matrix: every column from 'Age' onwards (i.e. everything except the
# 'Survived' label), converted to a float NumPy array.
X = train_df.loc[:, 'Age':].values.astype('float')

# Target vector: the 'Survived' column as a one-dimensional NumPy array.
# `.values` is used instead of `Series.ravel()`, which is deprecated in recent
# pandas releases; both yield the same 1-D array here.
y = train_df['Survived'].values

In [15]:
# Sanity check: X and y must have the same number of rows
print (X.shape, y.shape)

(891, 32) (891,)


In [26]:
# Train/test split: hold out 20% of the labelled rows (chosen at random) for
# evaluating the model; the remaining 80% are used for training.

from sklearn.model_selection import train_test_split

# Split features and labels in ONE call so both are shuffled with the same
# permutation and every row stays paired with its label by construction,
# rather than relying on two separate calls happening to share a random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


#### Check Scikit-Learn Version

In [27]:
import sklearn

In [28]:
# Show the installed scikit-learn version
sklearn.__version__

'0.19.2'

Make sure you have scikit-learn v0.19 installed; if not, update it and restart the kernel.

In [41]:
#!conda update -y scikit-learn

### Baseline Model

In [29]:
# import function
from sklearn.dummy import DummyClassifier

In [30]:
# Baseline model: ignores the inputs and always outputs the majority class
# seen in the training labels.
model_dummy = DummyClassifier(random_state=0, strategy='most_frequent')

In [32]:
# Train the baseline model.
# fit() takes the training inputs (X_train) and the matching labels (y_train).
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [40]:
# Score the baseline on the held-out split by passing the test inputs together
# with their true labels, then report the result rounded to three decimals.
baseline_score = model_dummy.score(X_test, y_test)
print('score for baseline model :', round(baseline_score, 3))

score for baseline model : 0.615


In [41]:
# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [42]:
# Accuracy of the baseline on the held-out split
baseline_pred = model_dummy.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_pred)
print ('accuracy for baseline model : {0:.2f}'.format(baseline_accuracy))

accuracy for baseline model : 0.61


In [43]:
# Confusion matrix of true labels vs. baseline predictions on the held-out split
baseline_cm = confusion_matrix(y_test, model_dummy.predict(X_test))
print ('confusion matrix for baseline model: \n {0}'.format(baseline_cm))

confusion matrix for baseline model: 
 [[110   0]
 [ 69   0]]


In [46]:
# Precision and recall of the baseline on the held-out split.
# Predict once and reuse the result for both metrics instead of calling
# predict() separately for each one.
baseline_pred = model_dummy.predict(X_test)

# The baseline never predicts the positive class (see the confusion matrix),
# so precision has no predicted positives to work with: scikit-learn reports
# 0.0 and emits a warning about it. Recall is 0.0 because no actual positive
# is ever found.
print ('precision for baseline model : {0:.2f}'.format(precision_score(y_test, baseline_pred)))
print ('recall for baseline model : {0:.2f}'.format(recall_score(y_test, baseline_pred)))

precision for baseline model : 0.00
recall for baseline model : 0.00


  'precision', 'predicted', average, warn_for)


### Test the model with actual test data

In [47]:
# Convert the test frame to a float NumPy array, matching how the training
# inputs were prepared.
test_X = test_df.values.astype('float')

In [49]:
# Get the baseline model's predictions for the actual test data
predictions = model_dummy.predict(test_X)

In [50]:
# Inspect the raw predictions; the majority-class baseline outputs 0 for every row
print(predictions)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]


In [51]:
# Build the submission table: one row per test passenger, pairing the
# PassengerId (the index of test_df) with its predicted Survived value.
df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived' : predictions} )

In [52]:
# Preview the first rows of the submission table
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
