In [1]:
#default_exp core

# TITLE

## INFO
By: Kaegan Casey

## TODO:

* try cross_validate with return_estimator=True

## Import Packages

In [2]:
# auto-reload magic
%load_ext autoreload
%autoreload 2

In [3]:
#export
import pandas as pd
pd.options.display.max_rows = 120
pd.options.display.max_columns = 120

import numpy as np

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# mlflow
import mlflow

# models
from xgboost import XGBClassifier

# helper libraries
from sklearn.model_selection import cross_validate, train_test_split

# custom packages
import sys
sys.path.append('../')
import titanic.preprocessing as prpr
import titanic.training as trn

## Global Variables

### Constants

In [4]:
#export
LOG_MLFLOW = False
EXP_NAME = 'Test_03-14'

TARGET = 'Survived'

STATE = 9

### Parameters

In [5]:
#export
n_estimators = int(sys.argv[1]) if len(sys.argv) > 1 else 100
max_depth = int(sys.argv[2]) if len(sys.argv) > 2 else 6
learning_rate = float(sys.argv[3]) if len(sys.argv) > 3 else 0.3
features = str(sys.argv[4]) if len(sys.argv) > 4 else 'ALL'

## Set Up MLFlow Experiment

In [6]:
#export
if LOG_MLFLOW:
    EX_ID = mlflow.set_experiment(EXP_NAME)

## Data

In [7]:
#export
raw_train = pd.read_csv('../data/1_raw/train.csv')

In [8]:
#export
raw_test = pd.read_csv('../data/1_raw/test.csv')
raw_test.shape

(418, 11)

## Preprocess Data

In [9]:
#export
df_list = [raw_train, raw_test]
drop_features = ['PassengerId', 'Name', 'Ticket', 'Cabin']

proc_df_list = prpr.run(df_list, drop_features)
proc_train = proc_df_list[0]
proc_test = proc_df_list[1]

In [10]:
proc_train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown
0,0,3,22.0,1,0,7.25,0,1,0,0,1,0,0,0,0,0,0,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0,0,0,1,0,0,0,0,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1,0,0,0,0,0,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1,0,0,1,0,0,0,0,0,0
4,0,3,35.0,0,0,8.05,0,1,0,0,1,0,0,0,0,0,0,0,0,1


In [11]:
proc_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_Unknown
0,3,34.5,0,0,7.8292,0,1,0,1,0,0,0,0,0,0,0,0,1
1,3,47.0,1,0,7.0,1,0,0,0,1,0,0,0,0,0,0,0,1
2,2,62.0,0,0,9.6875,0,1,0,1,0,0,0,0,0,0,0,0,1
3,3,27.0,0,0,8.6625,0,1,0,0,1,0,0,0,0,0,0,0,1
4,3,22.0,1,1,12.2875,1,0,0,0,1,0,0,0,0,0,0,0,1


In [12]:
#export
prpr.save_interim_data(proc_train, 'interim_train')
prpr.save_interim_data(proc_test, 'interim_test')

True

## Split

In [13]:
#export
if isinstance(features, str):
    if features == 'ALL':
        features = proc_train.columns
    else:
        raise ValueError("features param is a string but does not take on the value 'ALL'.")
elif isinstance(features, list):
    features.insert(0, TARGET)
else:
    raise ValueError("features param is not a list or string.")

In [14]:
# drop_me = ['SibSp', 'Embarked_S', 'Fare', 'Deck_E', 'Deck_C', 'Age', 'Embarked_C', 
#  'Parch', 'Embarked_Q', 'Deck_D', 'Deck_A', 'Sex_male', 'Deck_F', 'Deck_G', 'Deck_T']
#drop_me = ['Embarked_C', 'Deck_E', 'Deck_C', 'Deck_B', 'Parch', 'Embarked_Q', 'Deck_D', 'Deck_A', 'Sex_male', 'Deck_F', 'Deck_G', 'Deck_T']
#proc_train.drop(drop_me, axis=1, inplace=True)

In [16]:
#export
proc_train = proc_train[features].copy()

In [17]:
#export
X_train, y_train = trn.seperate_xy(proc_train, TARGET)

In [18]:
X_train.shape, y_train.shape

((891, 2), (891,))

In [19]:
X_train

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


In [20]:
#help(XGBClassifier)

In [21]:
#export
clf = XGBClassifier(n_estimators = n_estimators,
                    use_label_encoder = False,
                    max_depth = max_depth,
                    learning_rate = learning_rate,
                    random_state = STATE, 
                    eval_metric = 'logloss')




val_info = cross_validate(
    clf, 
    X_train, 
    y_train, 
    scoring='accuracy', 
    return_estimator=True
)

score_mean = val_info['test_score'].mean()
score_std = val_info['test_score'].std()
print(f'{score_mean} accuracy with a standard deviation of {score_std}')

0.7867365513778168 accuracy with a standard deviation of 0.018667207932566335


In [22]:
#export
clf = val_info['estimator'][0]

In [23]:
#export
importance_df = pd.DataFrame({'feature':X_train.columns, 'importance': clf.feature_importances_}).sort_values('importance', ascending=False)

In [24]:
#export
importance_df.to_html('../output/data/feature_importance.html', index=False)

In [25]:
#export
if LOG_MLFLOW:
    with mlflow.start_run(experiment_id=EX_ID):
        mlflow.log_param('num_features', X_train.shape[1])
        mlflow.log_param('n_estimators', clf.get_params()['n_estimators'])
        mlflow.log_param('max_depth', clf.get_params()['max_depth'])
        mlflow.log_param('learning_rate', clf.get_params()['learning_rate'])
        mlflow.log_param('booster', clf.get_params()['booster'])
        
        mlflow.log_metric('mean_accuracy', score_mean)
        mlflow.log_metric('std_accuracy', score_std)
        
        mlflow.log_artifact('../output/data/feature_importance.html')