# Building predictive models

In [1]:
#libraries
import pandas as pd
import numpy as np
import os

In [2]:
processed_path = os.path.join(os.path.pardir,'data','processed')
train_path = os.path.join(processed_path,'train.csv')
test_path = os.path.join(processed_path,'test.csv')

In [3]:
train_df = pd.read_csv(train_path, index_col='PassengerId')
test_df = pd.read_csv(test_path, index_col='PassengerId')

In [4]:
train_df.shape

(891, 32)

In [48]:
train_df.columns

Index(['Survived', 'Age', 'Fare', 'FamilySize', 'isMom', 'Deck_A', 'Deck_B',
       'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_Z', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Title_Lady', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Sir',
       'Fare_Bin_very_low', 'Fare_Bin_low', 'Fare_Bin_high',
       'Fare_Bin_very_high', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'AgeState_Adult', 'AgeState_Child'],
      dtype='object')

In [5]:
test_df.shape

(418, 31)

# Data Preparation

In [6]:
X = train_df.loc[:,'Age':].to_numpy().astype('float')
y = train_df['Survived'].ravel()

In [7]:
print(X.shape, y.shape)

(891, 31) (891,)


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(712, 31) (712,)
(179, 31) (179,)


In [9]:
print('avg survival in train: ' + str(np.mean(y_train)))
print('avg survival in test: ' + str(np.mean(y_test)))

avg survival in train: 0.38342696629213485
avg survival in test: 0.3854748603351955


import sklearn

In [10]:
import sklearn
sklearn.__version__

'0.24.2'

## Baseline Model

In [11]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score,recall_score

In [12]:
model_dummy = DummyClassifier(strategy='most_frequent',random_state=0)

In [13]:
model_dummy.fit(X_train,y_train)

DummyClassifier(random_state=0, strategy='most_frequent')

In [14]:
print('Evaluation: ' + str(model_dummy.score(X_test,y_test)))

Evaluation: 0.6145251396648045


In [15]:
print('Confusion: ')
print(str(confusion_matrix(y_test,model_dummy.predict(X_test))))

Confusion: 
[[110   0]
 [ 69   0]]


In [16]:
print('precision: '+ str(precision_score(y_test,model_dummy.predict(X_test))))
print('recall: '+ str(recall_score(y_test,model_dummy.predict(X_test))))

precision: 0.0
recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


## Logistic regression model

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
model_lr_1 = LogisticRegression(random_state=0)

In [19]:
model_lr_1.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=0)

In [20]:
print('score: ' + str(model_lr_1.score(X_test,y_test)))

score: 0.8324022346368715


In [21]:
print('precision: '+ str(precision_score(y_test,model_lr_1.predict(X_test))))
print('recall: '+ str(recall_score(y_test,model_lr_1.predict(X_test))))

precision: 0.782608695652174
recall: 0.782608695652174


In [22]:
print('Confusion: ')
print(str(confusion_matrix(y_test,model_lr_1.predict(X_test))))

Confusion: 
[[95 15]
 [15 54]]


## Hyperparameter optimization

In [23]:
model_lr = LogisticRegression(random_state=0)

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
parameters = {'C':[1.0,10.0,50.0,100.0,1000.0],'penalty':['l1','l2']}
clf = GridSearchCV(model_lr,param_grid=parameters,cv=3)

In [26]:
clf.fit(X_train,y_train)

Traceback (most recent call last):
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l

GridSearchCV(cv=3, estimator=LogisticRegression(random_state=0),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']})

In [27]:
clf.best_params_

{'C': 1.0, 'penalty': 'l2'}

In [28]:
clf.best_score_

0.8272819676393764

## Normalization & Standartization

In [29]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [30]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [31]:
X_train_scaled[:,0].min(),X_train_scaled[:,0].max()

(0.0, 1.0)

In [32]:
X_train_scaled

array([[0.43274928, 0.02975782, 0.2       , ..., 0.        , 1.        ,
        0.        ],
       [0.38232699, 0.02049464, 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.38232699, 0.07222739, 0.2       , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.35711585, 0.0150944 , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.44535485, 0.03396254, 0.1       , ..., 1.        , 1.        ,
        0.        ],
       [0.74788857, 0.07612293, 0.2       , ..., 1.        , 1.        ,
        0.        ]])

In [33]:
X_test_scaled = scaler.transform(X_test)

In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Create model after standardization

In [35]:
model_lr = LogisticRegression()
parameters = {'C':[1.0,10.0,50.0,100.0,1000.0],'penalty':['l1','l2']}
clf = GridSearchCV(model_lr,param_grid=parameters, cv=3)
clf.fit(X_train_scaled, y_train)

Traceback (most recent call last):
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\halil.taylan\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']})

In [36]:
clf.best_score_

0.8089978607476747

## Model persistence

In [37]:
import pickle

In [38]:
model_file_path = os.path.join(os.path.pardir,'Models','lr_model.pkl')
scaler_file_path = os.path.join(os.path.pardir,'Models','lr_scaler.pkl')

In [39]:
model_file_pickle = open(model_file_path,'wb')
scaler_file_pickle = open(scaler_file_path,'wb')
pickle.dump(clf,model_file_pickle)
pickle.dump(scaler,scaler_file_pickle)
model_file_pickle.close()
scaler_file_pickle.close()

In [40]:
model_file_pickle = open(model_file_path,'rb')
scaler_file_pickle = open(scaler_file_path,'rb')

In [41]:
clf_loaded = pickle.load(model_file_pickle)
scaler_loaded = pickle.load(scaler_file_pickle)
model_file_pickle.close()
scaler_file_pickle.close()

In [42]:
clf_loaded

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']})

In [43]:
scaler_loaded

StandardScaler()

In [44]:
X_test_scaled = scaler_loaded.transform(X_test)

In [45]:
clf_loaded.score(X_test_scaled,y_test)

0.8435754189944135