## Import packages

In [1]:
import pandas as pd
import numpy as np
import os

### Import data

In [2]:
# set path of the data
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_data_path = os.path.join(processed_data_path, 'train.csv')
test_data_path = os.path.join(processed_data_path, 'test.csv')

In [3]:
train_df = pd.read_csv(train_data_path, index_col = 'PassengerId')
test_df = pd.read_csv(test_data_path, index_col = 'PassengerId')

## Prepare data

In [12]:
train_df.drop('Age_State', axis=1, inplace = True)

In [111]:
train_df.columns

Index(['Survived', 'Age', 'Fare', 'Family_size', 'Is_mother', 'Is_male',
       'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G',
       'Deck_Z', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Title_Lady',
       'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer',
       'Title_Sir', 'Fare_Bin_Very_Low', 'Fare_Bin_Low', 'Fare_Bin_High',
       'Fare_Bin_Very_High', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Is_adult'],
      dtype='object')

**Survived** is the target variable

In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 418 non-null    float64
 1   Fare                418 non-null    float64
 2   Age_State           418 non-null    object 
 3   Family_size         418 non-null    int64  
 4   Is_mother           418 non-null    int64  
 5   Is_male             418 non-null    int64  
 6   Deck_A              418 non-null    int64  
 7   Deck_B              418 non-null    int64  
 8   Deck_C              418 non-null    int64  
 9   Deck_D              418 non-null    int64  
 10  Deck_E              418 non-null    int64  
 11  Deck_F              418 non-null    int64  
 12  Deck_G              418 non-null    int64  
 13  Deck_Z              418 non-null    int64  
 14  Pclass_1            418 non-null    int64  
 15  Pclass_2            418 non-null    int64  
 16  Pclas

We are to predict survival for passengers in test data

In [18]:
# matrix x contains traiing features while array y has the target variable
X = train_df.loc[:, 'Age':].to_numpy().astype('float')
y = train_df.Survived.ravel()

In [19]:
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (891, 31)
Shape of y: (891,)


In [20]:
# split X into train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size=.2)

In [22]:
print(f"The training data has {X_train.shape[0]} observations \nThe test data has {X_test.shape[0]} observations")

The training data has 712 observations 
The test data has 179 observations


In [23]:
y_train.shape

(712,)

In [24]:
np.mean(y_train)

0.37640449438202245

In [25]:
np.mean(y_test)

0.4134078212290503

Check out class imbalance

In [26]:
from sklearn.dummy import DummyClassifier

In [27]:
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)

In [28]:
model_dummy.fit(X_train, y_train)

DummyClassifier(random_state=0, strategy='most_frequent')

In [30]:
print(f"The score of the dummy model is {round((model_dummy.score(X_test, y_test)),2)}")

The score of the dummy model is 0.59


The baseline model's accuracy is **59%**

In [32]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

In [39]:
print(f"The accuracy score is {round(accuracy_score(y_test, model_dummy.predict(X_test)), 2)}")

The accuracy score is 0.59


## Kaggle submission preparation

In [41]:
test_df.drop('Age_State', axis=1, inplace = True)

In [42]:
test_X = test_df.to_numpy().astype('float')

In [44]:
# get predictions
predictions = model_dummy.predict(test_X)

In [45]:
submission_df = pd.DataFrame({"PassengerId": test_df.index, "Survived": predictions})

In [46]:
submission_df.head(3)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0


In [47]:
submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
submission_file_path = os.path.join(submission_data_path, '01_dummy.csv')

In [48]:
submission_df.to_csv(submission_file_path, index=False)

In [61]:
def get_submission_file(model, filename):
    test_X = test_df.to_numpy().astype('float')
    # make predictions
    predictions = model.predict(test_X)
    # submission dataframe
    submission_df = pd.DataFrame({"PassengerId": test_df.index, "Survived": predictions})
    # submission file
    submission_data_path = os.path.join(os.path.pardir, 'data', 'external')
    submission_file_path = os.path.join(submission_data_path, filename)
    # write to the file
    submission_df.to_csv(submission_file_path, index=False)

## Logistic Regression

In [62]:
from sklearn.linear_model import LogisticRegression

In [63]:
model_1 = LogisticRegression(max_iter = 1000)

In [64]:
model_1.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [65]:
 model_1.score(X_test, y_test)

0.8100558659217877

In [66]:
y_pred = model_1.predict(X_test)

In [67]:
print(f"The accuracy of the model is {round(accuracy_score(y_test, model_1.predict(X_test)), 2)}")

The accuracy of the model is 0.81


In [68]:
print(f"The precision is {round(precision_score(y_test, y_pred), 2)}")
print(f"The recall is {round(recall_score(y_test, y_pred))}")

The precision is 0.76
The recall is 1


In [72]:
model_1.coef_

array([[-0.02086975,  0.00334154, -0.58929946,  0.45322337, -1.12228116,
        -0.22858427,  0.30154714, -0.13916748,  0.68197746,  1.16696075,
        -0.03812752, -0.97158619, -0.75197977,  0.16712195,  0.43866723,
        -0.58474906,  0.20151674,  1.26456923,  0.17644021, -1.24086113,
         0.76536434, -0.16556176, -0.98042751, -0.07082388, -0.09639174,
         0.01938581,  0.16886992,  0.24216511,  0.11946429, -0.34058928,
        -0.21691574]])

In [69]:
get_submission_file(model_1,'02_lr.csv')

## Hyperparameter optimization

In [77]:
model_2 = LogisticRegression(random_state=0, max_iter = 1500)

In [78]:
from sklearn.model_selection import GridSearchCV

In [87]:
parameters = {'C':[1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l2']}
clf = GridSearchCV(model_2, param_grid=parameters, cv=3)

In [88]:
clf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=1500, random_state=0),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l2']})

Fix the above error and generate the 3rd kaggle submission

In [89]:
get_submission_file(clf, '3_lr2.csv')

## Feature normalization and standardization

In [91]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#### Feature normalization

In [92]:
scaler = MinMaxScaler()

In [93]:
X_train_scaled = scaler.fit_transform(X_train)

In [94]:
X_train_scaled[:,0].min(), X_train_scaled[:,0].max()

(0.0, 1.0)

In [95]:
#normalize test data
X_test_scaled = scaler.fit_transform(X_test)

#### Feature standardization

In [96]:
scaler = StandardScaler()

In [97]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

### Create model after standardization

In [98]:
model_4 = LogisticRegression(random_state = 0, max_iter = 1500)
parameters = {'C':[1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l2']}
clf = GridSearchCV(model_2, param_grid=parameters, cv=3)
clf.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=1500, random_state=0),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l2']})

In [99]:
clf.best_score_

0.8188667872212175

In [100]:
clf.score(X_test_scaled, y_test)

0.7821229050279329

## Model persistence

In [101]:
# import pickle library
import pickle

In [102]:
# create file paths
model_file_path = os.path.join(os.path.pardir, 'models', 'lr_model.pkl')
scaler_file_path = os.path.join(os.path.pardir, 'models', 'lr_scaler.pkl')

In [103]:
# open the files to write
model_file_pickle = open(model_file_path, 'wb')
scaler_file_pickle = open(scaler_file_path, 'wb')

In [104]:
# persist the model and the scaler
pickle.dump(clf, model_file_pickle)
pickle.dump(scaler, scaler_file_pickle)

In [105]:
# close the file
model_file_pickle.close()
scaler_file_pickle.close()

## Load the persisted files

In [107]:
# open file in read mode 
model_file_pickle = open(model_file_path, 'rb')
scaler_file_pickle = open(scaler_file_path, 'rb')
clf_loaded = pickle.load(model_file_pickle)
scaler_loaded = pickle.load(scaler_file_pickle)
# close the files
model_file_pickle.close()
scaler_file_pickle.close()

In [108]:
clf_loaded

GridSearchCV(cv=3, estimator=LogisticRegression(max_iter=1500, random_state=0),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l2']})

In [109]:
scaler_loaded

StandardScaler()

In [110]:
# transform test data using loaded scaler object
X_test_scaled = scaler_loaded.fit_transform(X_test)
clf_loaded.score(X_test_scaled, y_test)

0.7821229050279329