In [4]:
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
import pickle
import flask
import os

In [5]:
# Pre-processed Titanic CSVs read by the loading cell (paths relative to the notebook).
trainFile, testFile = 'trainProcessed.csv', 'testProcessed.csv'

In [6]:
# Read both CSVs the same way, using the 'PassengerId' column as the row index.
traindf, testdf = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (trainFile, testFile)
)

In [7]:
# Column names, dtypes and non-null counts of the training frame.
traindf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [8]:
# Column names, dtypes and non-null counts of the test frame.
# NOTE(review): the recorded output lists Fare with 417 non-null values out of
# 418 rows, so one Fare appears to be missing -- confirm and impute it before
# using testdf for prediction.
testdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  417 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Officier        418 n

In [9]:
# Feature matrix: every column from 'Age' through the last column, cast to float.
# DataFrame.as_matrix() is deprecated (it emits a FutureWarning and was removed
# in pandas 1.0); `.values` is the replacement the warning itself recommends.
X = traindf.loc[:, 'Age':].values.astype('float')
# Target vector: the 'Survived' column as a 1-D NumPy array.
# `.values` already yields a 1-D array, so the deprecated Series.ravel() is not needed.
y = traindf['Survived'].values

  """Entry point for launching an IPython kernel.


In [10]:
# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

(891, 32) (891,)


In [11]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Shapes of the training split, then of the held-out split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

(712, 32) (712,)
(179, 32) (179,)


In [13]:
# Share of positive labels in each split; similar values mean the random split
# did not skew the class balance.
print("Mean survival in train: " + str(y_train.mean()))
print("Mean survival in test: " + str(y_test.mean()))

Mean survival in train: 0.38342696629213485
Mean survival in test: 0.3854748603351955


In [14]:
# Baseline classifier that always predicts the most frequent training label.
dummyModel = DummyClassifier(strategy='most_frequent', random_state=0)

In [15]:
# Fit the baseline on the training split (the fitted estimator is displayed).
dummyModel.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [16]:
# Baseline score on the held-out split.
baseline_score = dummyModel.score(X_test, y_test)
print('Score for base model: ' + str(baseline_score))

Score for base model: 0.6145251396648045


In [17]:
# Same figure computed explicitly through accuracy_score on the baseline predictions.
baseline_accuracy = accuracy_score(y_test, dummyModel.predict(X_test))
print('Accuracy for base model: ' + str(baseline_accuracy))

Accuracy for base model: 0.6145251396648045


In [18]:
# Confusion matrix of the baseline: rows are true labels, columns are predicted labels.
baseline_confusion = confusion_matrix(y_test, dummyModel.predict(X_test))
print('Confusion matrix for base model: ' + str(baseline_confusion))

Confusion matrix for base model: [[110   0]
 [ 69   0]]


In [19]:
# Precision and recall of the baseline on the held-out split.
# The recorded confusion matrix shows no positive predictions, which is why
# precision is reported as 0.0 together with an undefined-metric warning.
baseline_pred = dummyModel.predict(X_test)
print('Precision for base model: ' + str(precision_score(y_test, baseline_pred)))
print('Recall for base model: ' + str(recall_score(y_test, baseline_pred)))

Precision for base model: 0.0
Recall for base model: 0.0


  'precision', 'predicted', average, warn_for)


In [20]:
# First logistic-regression model: library defaults with a fixed seed.
lrModel1 = LogisticRegression(random_state=0)

In [21]:
# Train on the unscaled training split (the fitted estimator is displayed).
lrModel1.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
# Held-out score of the default logistic regression, for comparison with the baseline.
lr_v1_score = lrModel1.score(X_test, y_test)
print('Score for logisitic regression V1: ' + str(lr_v1_score))

Score for logisitic regression V1: 0.8324022346368715


In [23]:
# Fresh, unfitted estimator to be tuned by the grid search that follows.
lrModel = LogisticRegression(random_state=0)

In [24]:
# Candidate values for C, searched with 3-fold cross-validation.
parameters = {
    'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
}
clf = GridSearchCV(lrModel, param_grid=parameters, cv=3)

In [25]:
# Run the grid search on the unscaled training split (the fitted search object is displayed).
clf.fit(X_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [26]:
# Parameter setting selected by the grid search.
clf.best_params_

{'C': 1.0}

In [27]:
# Cross-validated score achieved by the selected parameter setting.
best_cv_score = clf.best_score_
print("Best score: " + str(best_cv_score))

Best score: 0.8286516853932584


In [28]:
# Learn per-feature min/max on the training split only, then rescale that split.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [29]:
# Check the first feature column now spans the expected [0, 1] range.
first_col = X_train_scaled[:, 0]
print(first_col.min(), first_col.max())

0.0 1.0


In [30]:
# Apply the scaling learned on the training split to the held-out split (no refit).
X_test_scaled = scaler.transform(X_test)

In [31]:
# Switch to standardisation. This rebinds `scaler`, `X_train_scaled` and
# `X_test_scaled`, replacing the min-max versions created earlier.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # statistics come from the training split only
X_test_scaled = scaler.transform(X_test)        # reuse those statistics for the held-out split

In [32]:
# Repeat the grid search over C, this time on the standardised features.
# NOTE(review): unlike the earlier logistic-regression cells, no random_state is
# passed here -- confirm whether that is intentional.
lrModel = LogisticRegression()
parameters = {
    'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
}
clf = GridSearchCV(lrModel, param_grid=parameters, cv=3)
clf.fit(X_train_scaled, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [33]:
# Held-out score of the tuned model trained on standardised features.
lr_v2_score = clf.score(X_test_scaled, y_test)
print('Score for logisitic regression V2: ' + str(lr_v2_score))

Score for logisitic regression V2: 0.8435754189944135


In [34]:
# Output paths for the pickled model and the pickled scaler.
modelFile, scalerFile = 'lr_model.pkl', 'lr_scaler.pkl'

In [35]:
# Persist the fitted grid-search model and the fitted scaler.
# Opening, writing and closing previously lived in three separate cells, so a
# failed or skipped cell left the handles open and the files possibly unflushed.
# `with` closes each file as soon as its dump finishes, even if pickling raises.
with open(modelFile, 'wb') as modelPickleFile:
    pickle.dump(clf, modelPickleFile)

with open(scalerFile, 'wb') as scalerPickleFile:
    pickle.dump(scaler, scalerPickleFile)

In [38]:
# Reload the persisted model and scaler from disk.
# `with` guarantees the handles are closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook or another trusted source.
with open(modelFile, 'rb') as modelPickleFile:
    clfLoaded = pickle.load(modelPickleFile)

with open(scalerFile, 'rb') as scalerPickleFile:
    scalerLoaded = pickle.load(scalerPickleFile)

In [39]:
# Display the reloaded grid-search object to compare with the one that was saved.
clfLoaded

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [40]:
# Display the reloaded scaler to confirm it round-tripped through pickle.
scalerLoaded

StandardScaler(copy=True, with_mean=True, with_std=True)

In [41]:
# Re-scale the held-out split with the scaler that was read back from disk.
X_test_scaled = scalerLoaded.transform(X_test)

In [42]:
# Verify the persistence round-trip by scoring the model loaded from disk.
# The original scored the in-memory `clf`, so the check passed no matter what
# had been pickled; `clfLoaded` is the object under test here.
persisted_score = clfLoaded.score(X_test_scaled, y_test)
print('Score for persisted Logistic regression should be 0.84: ' + str(persisted_score))

Score for persisted Logistic regression should be 0.84: 0.8435754189944135
