## Building Predictive Model

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
#set path of processed data
processed_data_path = os.path.join(os.path.pardir,'data','processed')
train_file_path = os.path.join(processed_data_path,'train.csv')
test_file_path = os.path.join(processed_data_path,'test.csv')

In [3]:
train_df = pd.read_csv(train_file_path,index_col='PassengerId')
test_df = pd.read_csv(test_file_path,index_col='PassengerId')

In [4]:
train_df.info()
#train_df.Survived.value_counts()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 44 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_capt            891 non-null int64
Title_col             891 non-null int64
Title_don             891 non-null int64
Title_dona            891 non-null int64
Title_dr              891 non-

In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 43 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_capt            418 non-null int64
Title_col             418 non-null int64
Title_don             418 non-null int64
Title_dona            418 non-null int64
Title_dr              418 non-null int64
Title_jonkheer        418 n

## Data Preparation

In [6]:
X = train_df.loc[:,'Age':].as_matrix().astype('float')
y = train_df['Survived'].ravel() #ravel will create one dimension array

  """Entry point for launching an IPython kernel.


In [7]:
print X.shape, y.shape # as good practice use capital leter for matrix and small leter for one dimention array

(891L, 43L) (891L,)


In [8]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print X_train.shape, y_train.shape
print X_test.shape, y_test.shape

(712L, 43L) (712L,)
(179L, 43L) (179L,)


In [9]:
#avg survival in train and test 
print 'Mean survival in train : {0:.3f}'.format(np.mean(y_train))
print 'Mean survival in train : {0:.3f}'.format(np.mean(y_test))
#positive cases should evenly distributed in train and test data
#we have only 39% of positive cases, we have some imbalance between +ve & -ve calss 

Mean survival in train : 0.383
Mean survival in train : 0.385


### Check Scikit-Learn Version

In [10]:
import sklearn

In [11]:
sklearn.__version__

'0.20.3'

## Baseline Model

In [12]:
#import function
from sklearn.dummy import DummyClassifier

In [13]:
#create model
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)

In [14]:
#train model
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [15]:
print 'score for baseline model : {0:.2f}'.format(model_dummy.score(X_test, y_test))

score for baseline model : 0.61


In [16]:
np.unique(y_test,return_counts=True)
#110.0/(110+69)

(array([0, 1], dtype=int64), array([110,  69], dtype=int64))

In [17]:
#performance metrics 
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [18]:
#accuracy score
print 'Accuracy score for baseline model : {0:.2f}'.format(accuracy_score(y_test, model_dummy.predict(X_test)))

Accuracy score for baseline model : 0.61


In [19]:
#confusion matrix
print 'Confusion matrix for baseline model : \n {0}'.format(confusion_matrix(y_test, model_dummy.predict(X_test)))

Confusion matrix for baseline model : 
 [[110   0]
 [ 69   0]]


In [20]:
#precision & recall score
print 'Precision score for baseline model : {0:.2f}'.format(precision_score(y_test, model_dummy.predict(X_test)))
print 'Recall score for baseline model : {0:.2f}'.format(recall_score(y_test, model_dummy.predict(X_test)))

Precision score for baseline model : 0.00
Recall score for baseline model : 0.00


  'precision', 'predicted', average, warn_for)


## First Kaggle Submission

In [21]:
#converting to matrix
#this is actual test data
test_X = test_df.as_matrix().astype('float')

  This is separate from the ipykernel package so we can avoid doing imports until


In [22]:
#get preditions for actual test data 
predictions = model_dummy.predict(test_X)

In [23]:
#np.unique(predictions) #we got all values zeros, not sure if it is correct
np.unique(model_dummy.predict(X_test)) #here also we got all zeros X_test is 20% test data from 
#guess its expected as baseline model alway takes high probability value, which is zero, in our data we have more zeros then 1

array([0], dtype=int64)

In [24]:
df_sumbission = pd.DataFrame({'PassengerId':test_df.index, 'Survived' : predictions})

In [25]:
df_sumbission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [26]:
submission_data_path = os.path.join(os.path.pardir,'data','external')
submission_file_path = os.path.join(submission_data_path,'01_dummy.csv')

In [27]:
df_sumbission.to_csv(submission_file_path, index=False)

In [28]:
def get_submission_file(model,filename):
    #converting to matrix
    #this is actual test data
    test_X = test_df.as_matrix().astype('float')
    #get preditions for actual test data 
    predictions = model.predict(test_X)
    df_sumbission = pd.DataFrame({'PassengerId':test_df.index, 'Survived' : predictions})
    #sumbision file
    submission_data_path = os.path.join(os.path.pardir,'data','external')
    submission_file_path = os.path.join(submission_data_path,filename)
    #writing to file
    df_sumbission.to_csv(submission_file_path, index=False)

In [29]:
get_submission_file(model_dummy,'01_dummy.csv')

  after removing the cwd from sys.path.


## Logistic Regression Model

In [30]:
#import function
from sklearn.linear_model import LogisticRegression

In [31]:
#create model
model_lr_1 = LogisticRegression(random_state=0)

In [32]:
#train model
model_lr_1.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
#evaluate model
print 'score for logistic regression - version 1 : {0:.2f}'.format(model_lr_1.score(X_test,y_test))

score for logistic regression - version 1 : 0.83


In [34]:
#perfromance matrix
#accuracy
print 'Accuary Score for logistic regression - version 1 : {0:.2f}'.format(accuracy_score(y_test,model_lr_1.predict(X_test)))
#confusion matrix
print 'Confusion Matrix for logistic regression - version 1 : \n {0}'.format(confusion_matrix(y_test,model_lr_1.predict(X_test)))
#precision
print 'Precision Score for logistic regression - version 1 : {0:.2f}'.format(precision_score(y_test,model_lr_1.predict(X_test)))
#recall
print 'Recall Score for logistic regression - version 1 : {0:.2f}'.format(recall_score(y_test,model_lr_1.predict(X_test)))

Accuary Score for logistic regression - version 1 : 0.83
Confusion Matrix for logistic regression - version 1 : 
 [[94 16]
 [15 54]]
Precision Score for logistic regression - version 1 : 0.77
Recall Score for logistic regression - version 1 : 0.78


In [35]:
#model cofficients
#model weights or model parameters 
model_lr_1.coef_

array([[-0.02849415,  0.00459629, -0.49964282,  0.61878442, -0.93433872,
         0.07399421, -0.14986016, -0.37629548,  0.53485454,  1.11247042,
         0.4143235 , -0.175893  , -0.27778581,  0.98561851,  0.50525411,
        -0.3350644 , -0.20473023,  0.2728421 ,  0.        ,  0.        ,
        -0.23038769,  0.        ,  0.17552238, -0.01556939,  1.20222128,
         0.41640948,  0.15942102,  0.07306854, -1.46980823,  0.92465925,
         0.22870026, -0.79632829,  0.30742171,  0.112366  ,  0.17464401,
         0.27015997,  0.29095487,  0.42004936,  0.50108526,  0.48779117,
         0.16693178,  0.40261812,  0.75319009]])

### Second Kaggle submition

In [36]:
get_submission_file(model_lr_1,'02_lr.csv')

  after removing the cwd from sys.path.


### Let try using whole train data insted of 80-20

In [37]:
#splip train data 
#train_df.info()
#test_df.info()
#print 'score for logistic regression - version 1 : {0:.2f}'.format(model_lr_1.score(X_train,y_train))
#print 'score for logistic regression - version 1 : {0:.2f}'.format(model_lr_1.score(X_test,y_test))
train_ws_df = train_df.loc[:,'Age':]
train_os_df = train_df.loc[:,'Survived']

In [38]:
ar_train_ws_df = train_ws_df.as_matrix().astype('float')
ar_train_os_df = train_os_df.ravel()

  """Entry point for launching an IPython kernel.


In [39]:
print ar_train_ws_df.shape, ar_train_os_df.shape

(891L, 43L) (891L,)


In [40]:
ar_train_ws_df

array([[22.    ,  7.25  ,  2.    , ...,  1.    ,  1.    ,  0.    ],
       [38.    , 71.2833,  2.    , ...,  0.    ,  1.    ,  0.    ],
       [26.    ,  7.925 ,  1.    , ...,  1.    ,  1.    ,  0.    ],
       ...,
       [22.    , 23.45  ,  4.    , ...,  1.    ,  1.    ,  0.    ],
       [26.    , 30.    ,  1.    , ...,  0.    ,  1.    ,  0.    ],
       [32.    ,  7.75  ,  1.    , ...,  0.    ,  1.    ,  0.    ]])

In [41]:
model_lr_1_1 = LogisticRegression(random_state=0)
model_lr_1_1.fit(ar_train_ws_df,ar_train_os_df)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [42]:
print 'Score : {0}'.format(model_lr_1_1.score(ar_train_ws_df,ar_train_os_df))

Score : 0.840628507295


In [43]:
print 'Acuracy : {0}'.format(accuracy_score(ar_train_os_df,model_lr_1_1.predict(ar_train_ws_df)))
print 'Confusion Matrix : \n {0}'.format(confusion_matrix(ar_train_os_df,model_lr_1_1.predict(ar_train_ws_df)))
print 'Presicion : {0}'.format(precision_score(ar_train_os_df,model_lr_1_1.predict(ar_train_ws_df)))
print 'Recall : {0}'.format(recall_score(ar_train_os_df,model_lr_1_1.predict(ar_train_ws_df)))

Acuracy : 0.840628507295
Confusion Matrix : 
 [[485  64]
 [ 78 264]]
Presicion : 0.80487804878
Recall : 0.771929824561


In [44]:
model_lr_1_1.coef_

array([[-0.0269891 ,  0.00387927, -0.52346826,  0.2491599 , -1.255066  ,
         0.05122185,  0.2671486 , -0.10706347,  0.75401299,  1.01532903,
         0.22992585, -0.5458219 , -0.49139716,  0.73951017,  0.64941032,
        -0.21556471, -0.22178155,  0.05959299, -0.41276555,  0.        ,
        -0.19734359, -0.25111728,  0.16675634, -0.01086003,  1.54202324,
         0.4601965 ,  0.1153404 ,  0.04828753, -1.05355673,  1.14506143,
         0.26551431, -0.91358635,  0.35175708,  0.07983706, -0.01170116,
         0.06867141,  0.44123347,  0.67515206,  0.55219064,  0.46546789,
         0.15569725,  0.41749172,  0.75586406]])

In [45]:
get_submission_file(model_lr_1_1,'03_lr.csv')

  after removing the cwd from sys.path.


In [46]:
train_df.head(20)

Unnamed: 0_level_0,Survived,Age,Fare,FamilySize,IsMother,IsMale,Deck_A,Deck_B,Deck_C,Deck_D,...,Title_the countess,Fare_Bin_very_low,Fare_Bin_low,Fare_Bin_high,Fare_Bin_very_hig,Embarked_C,Embarked_Q,Embarked_S,AgeState_Adult,AgeState_Child
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,22.0,7.25,2,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0
2,1,38.0,71.2833,2,0,0,0,0,1,0,...,0,0,0,0,1,1,0,0,1,0
3,1,26.0,7.925,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
4,1,35.0,53.1,2,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,1,0
5,0,35.0,8.05,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
6,0,29.0,8.4583,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
7,0,54.0,51.8625,1,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
8,0,2.0,21.075,5,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
9,1,27.0,11.1333,3,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
10,1,14.0,30.0708,2,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1


## Part2

### Hyperparameter Optimization

In [47]:
#base model
model_lr = LogisticRegression(random_state=0)

In [48]:
from sklearn.model_selection import GridSearchCV

In [49]:
parameters = {'C':[0.5,0.8,1.0,10.0,50.0,100.0,1000.0], 'penalty':['l1','l2']}
clf = GridSearchCV(model_lr,parameters,cv=3)

In [50]:
clf

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.5, 0.8, 1.0, 10.0, 50.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [51]:
clf.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.5, 0.8, 1.0, 10.0, 50.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [52]:
clf.best_params_

{'C': 0.8, 'penalty': 'l2'}

In [53]:
print 'best score : {0}'.format(clf.best_score_)

best score : 0.834269662921


In [54]:
#evaluate mode
print 'score for logistic regression v2 : {0}'.format(clf.score(X_test,y_test))

score for logistic regression v2 : 0.826815642458


### Making third submission

In [55]:
# get submission file
get_submission_file(clf,'04_lr.csv')

  after removing the cwd from sys.path.


## Feature Normalization and Standardization

In [56]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#### Feature Normalization

In [57]:
# Feature Normalization
scalar = MinMaxScaler()
X_train_scaled = scalar.fit_transform(X_train)

In [58]:
X_train_scaled[:,0].min(), X_train_scaled[:,0].max()

(0.0, 1.0)

In [59]:
#normaliz test data
X_test_scaled = scalar.transform(X_test)

#### Feature Standardization

In [60]:
# Feature standardization
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)
#X_train_scaled = scalar.fit_transform(X_train_scaled)
#X_test_scaled = scalar.transform(X_test_scaled)

#### Create model after Standardization 

In [61]:
model_lr = LogisticRegression()
parameters = {'C':[0.5,0.8,1.0,10.0,50.0,100.0,1000.0], 'penalty':['l1','l2']}
clf = GridSearchCV(model_lr,parameters,cv=3)
clf.fit(X_train_scaled,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.5, 0.8, 1.0, 10.0, 50.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [62]:
clf.best_score_, clf.best_params_

(0.8230337078651685, {'C': 0.5, 'penalty': 'l1'})

In [63]:
print 'Score for logistic regression : {0}'.format(clf.score(X_test_scaled, y_test))

Score for logistic regression : 0.826815642458


In [64]:
get_submission_file(clf,'05_lr.csv')

  after removing the cwd from sys.path.


## Model Persistence

In [65]:
#import pickle library
import pickle 

In [66]:
#create file paths 
model_file_path = os.path.join(os.path.pardir,'models','lr_model.pki')
scaler_file_path = os.path.join(os.path.pardir,'models','lr_scaler.pki')

In [67]:
#open the file to write
model_file_pickle = open(model_file_path,'wb')
scaler_file_pickle = open(scaler_file_path,'wb')

In [68]:
#persist the model and scaler
pickle.dump(clf,model_file_pickle)
pickle.dump(scalar,scaler_file_pickle)

In [69]:
model_file_pickle.close()
scaler_file_pickle.close()

### load the persisted file

In [70]:
# open file in read mode 
model_file_pickle = open(model_file_path,'r')
scaler_file_pickle = open(scaler_file_path,'r')
# load files 
clf_loaded = pickle.load(model_file_pickle)
scaler_loaded = pickle.load(scaler_file_pickle)
# close files
model_file_pickle.close()
scaler_file_pickle.close()

In [71]:
clf_loaded

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.5, 0.8, 1.0, 10.0, 50.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [72]:
scaler_loaded

StandardScaler(copy=True, with_mean=True, with_std=True)

In [73]:
# transform the test data using loaded scaler object 
X_test_scaled = scaler_loaded.transform(X_test)
# calculate the score using the loaded model object 
print 'score for persisted logistic regression : {0}'.format(clf_loaded.score(X_test_scaled, y_test))

score for persisted logistic regression : 0.826815642458
