In [None]:
import numpy as np
import sys
import yaml
import os
import datetime
import pandas as pd
from shutil import copyfile
from sklearn.model_selection import StratifiedKFold, KFold
from utils import generate_model_config, generate_gridsearch_configs, train_model, get_metrics

# Environment-specific workaround: put this conda env's site-packages first on
# sys.path so that `import xgboost` below resolves. Probably not needed in other
# environments -- NOTE(review): hard-coded absolute path; remove or adapt when
# running elsewhere.
sys.path.insert(0,'/anaconda3/envs/model_search/lib/python3.7/site-packages')
import xgboost

### Import data

I'm using a breast cancer dataset that comes pre-loaded with sklearn. To use your own dataset, replace these cells with code that loads your dataset (assuming that the test set has already been split out) and arranges it into a features matrix `X` (rows are observations, columns are features) and a labels/ground-truth vector `y` (one entry per observation).

In [None]:
# Load the breast cancer dataset that ships with scikit-learn.
# The returned object is indexed by the keys 'data' and 'target' in the next cell.
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [None]:
# Pull the feature matrix and the label array out of the loaded dataset,
# then report their shapes as a quick sanity check.
X, y = data['data'], data['target']

print(f'shape of features matrix: {X.shape}')
print(f'shape of labels matrix: {y.shape}')

### Import configuration yaml

This imports the configuration that defines what models will be trained and what metrics will be recorded. 

In [None]:
# Which experiment to run: 'classification' or 'regression'.
# This also selects the configuration file, config_<model_type>.yaml.
model_type = 'classification'

# Parse the experiment configuration.
# yaml.safe_load is used instead of a bare yaml.load(stream): calling yaml.load
# without an explicit Loader emits a warning on PyYAML 5.1+ and raises a
# TypeError on PyYAML 6+, and the legacy default loader can construct arbitrary
# Python objects from the file.
# NOTE(review): if the config relies on Python-specific YAML tags, switch to
# yaml.load(stream, Loader=yaml.FullLoader) instead.
with open(f"config_{model_type}.yaml", 'r') as stream:
    config_experiment = yaml.safe_load(stream)

# Expose the 'model_search' section under the 'model_params' key as well, and
# record which model type this run uses so it travels with the config dict.
config_experiment['model_params'] = config_experiment['model_search']
config_experiment['model_type'] = model_type

### Setup results folder

Make a folder (named with the current datetime) to store results in. So that results aren't committed, the 'results' folder should be added to the .gitignore file.

In [None]:
# Results for this run go under ../results/<model_type>/<YYYYmmdd_HHMM>.
run_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')
experiment_folder = os.path.join(f'../results/{model_type}', run_stamp)

if os.path.exists(experiment_folder):
    # Two runs started within the same minute share a folder name.
    print(f'Folder {experiment_folder} already exists - overwriting contents.')
else:
    print(f'Saving experiment output to {experiment_folder}')
    os.makedirs(experiment_folder)

# Keep a snapshot of the inputs next to the results: first the yaml used to
# set up the experiment, then this notebook.
config_copy_path = experiment_folder+f"/config_{model_type}.yaml"
copyfile(f"./config_{model_type}.yaml", config_copy_path)

notebook_copy_path = experiment_folder+"/grid_search_classification.ipynb"
copyfile('./grid_search_classification.ipynb', notebook_copy_path)

### Grid search

Set up the k-fold cross-validation splitter

In [None]:
# Choose the cross-validation splitter for the configured model type.
# Classification uses StratifiedKFold; regression uses plain KFold.
num_folds = config_experiment['cross_validation']['num_folds']

if model_type == 'classification':
    skf = StratifiedKFold(n_splits=num_folds)
elif model_type == 'regression':
    skf = KFold(n_splits=num_folds)
else:
    # Fail loudly here: otherwise `skf` would be undefined (or left over from a
    # previous run of this cell) and the grid search would break or silently
    # use the wrong splitter.
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected 'classification' or 'regression'."
    )

Run grid search

In [None]:
# Grid search: for every (class_path, parameters) pair yielded by
# generate_gridsearch_configs, train one model per cross-validation fold and
# record train/validation metrics as one row of results_df.
grid_config = generate_model_config(config_experiment)
index = 0       # running row label in results_df: one row per (model, fold)
model_num = 0   # number of parameter combinations processed so far
results_df = pd.DataFrame()
fold_rows = []  # single-row frames, one per completed fold

for class_path, parameters in generate_gridsearch_configs(grid_config):
    print(f"{index}. Training {class_path}")
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        # setup log
        log = dict()
        log['model_num'] = model_num
        log['fold'] = fold

        # setup data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # train model
        trained_model, log = train_model(X_train, y_train, class_path, parameters, log)

        # predict using trained model
        # train set
        log = get_metrics(X_train, y_train, trained_model, log, pred_type='train', config = config_experiment)
        # validation set
        log = get_metrics(X_test, y_test, trained_model, log, pred_type='val', config = config_experiment)

        # log results
        # DataFrame.append was removed in pandas 2.0, so the per-fold rows are
        # collected in a list and combined with pd.concat instead.
        fold_rows.append(pd.DataFrame(log, columns=log.keys(), index=[index]))
        results_df = pd.concat(fold_rows, sort=False)
        # Checkpoint after every fold so an interrupted search keeps its results.
        results_df.to_pickle(f"{experiment_folder}/results_df.pkl")
        index += 1

    model_num += 1