## AWS usage examples

In [None]:
%pylab inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import sklearn
import xgboost
from sklearn import metrics
from collections import Counter
from time import gmtime, strftime    
from sklearn import metrics

import tarfile
import pickle as pkl

import os
import boto3
import re
import sagemaker
from sagemaker import get_execution_role
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.amazon.amazon_estimator import get_image_uri

## AWS client setup

In [None]:
%%time

# Notebook-level AWS configuration shared by the cells below.
# `role` and `container` are read as globals by create_xgboost_obj() and
# train_model(); `bucket` and `prefix` are read by the file-upload cell.
role = get_execution_role()
region = boto3.Session().region_name

# Training image for the 'xgboost' algorithm, looked up for the session's region.
container = get_image_uri(region, 'xgboost')
# SageMaker client; not referenced by any other cell visible in this notebook.
smclient = boto3.Session().client('sagemaker')

# Placeholder values: replace with the real bucket name and key prefix before running.
bucket = 'bucket_name'
# NOTE(review): bucket_path is not used by any cell visible here - confirm it is still needed.
bucket_path = 'http://bucket_path.s3.amazonaws.com/'
prefix = 'prefix_to_directory'

## File upload

In [None]:
local_location = <local_path>
# Timestamp (day-hour-minute-second, UTC) used to give every upload its own S3 folder.
# Only gmtime/strftime are imported from the `time` module above, so binding the
# name `time` here does not hide a module that the visible cells rely on.
time =  strftime("%d-%H-%M-%S", gmtime())
location = 's3://{}/{}/{}'.format(bucket, prefix, time)

# Make sure the local staging directory exists before writing the CSV files.
if not os.path.exists(local_location): 
    os.makedirs(local_location)
    

# `train`, `valid` and `test` are expected to exist already (they are not defined
# in the visible cells; presumably pandas DataFrames - TODO confirm).
# The files are written without a header row and without the index column.
train.to_csv(local_location + '/train.csv', header=False, index=False)
valid.to_csv(local_location + '/valid.csv', header=False, index=False)
test.to_csv(local_location + '/test.csv', header=False, index=False)

# Upload each split to <prefix>/<time>/<split>/<split>.csv in the bucket.
# NOTE(review): os.path.join builds the S3 key with the local OS separator;
# that yields '/' on POSIX hosts only - confirm this never runs on Windows.
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, time, 'train/train.csv')).upload_file(local_location+'/train.csv') 
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, time, 'test/test.csv')).upload_file(local_location+'/test.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, time, 'valid/valid.csv')).upload_file(local_location+'/valid.csv')

# S3 folder URIs of the three splits; model output goes to the run's root folder.
train_data_location = '{}/{}'.format(location, 'train')
test_data_location = '{}/{}'.format(location,'test')
valid_data_location = '{}/{}'.format(location, 'valid')
s3_output_location = location

# Wrap each folder as a CSV input for SageMaker.
s3_input_train = sagemaker.session.s3_input(train_data_location, content_type='text/csv')
s3_input_test = sagemaker.session.s3_input(test_data_location, content_type='text/csv')
s3_input_valid = sagemaker.session.s3_input(valid_data_location, content_type='text/csv')

# Only the train and validation inputs are passed on as channels;
# s3_input_test is created above but is not part of this mapping.
data_channels = {'train': s3_input_train, 'validation': s3_input_valid}

## Simple Model

In [None]:
def create_xgboost_obj(data_channels, location, diag, scale_pos_weight, diag_perc, alpha, colsample_bytree, max_depth, min_child_weight, subsample, num_round):
    """Train a single XGBoost model on SageMaker and return the fitted estimator.

    Relies on the notebook-level globals ``container`` (training image) and
    ``role`` (execution role).

    Args:
        data_channels: Mapping of channel name to training input, passed
            unchanged to ``Estimator.fit``.
        location: S3 URI used as the estimator's ``output_path``.
        diag: Label for the run; its first 10 characters start the job name.
        scale_pos_weight: XGBoost ``scale_pos_weight``; also appended to the
            job name.
        diag_perc: Accepted for call compatibility; not used by this function.
        alpha, colsample_bytree, max_depth, min_child_weight, subsample,
        num_round: Forwarded as XGBoost hyperparameters of the same name.

    Returns:
        The ``sagemaker.estimator.Estimator`` after ``fit`` has been called.
        Earlier versions returned ``None``, which left the caller with no
        handle on the trained model.
    """
    # The job name is built from the label, a UTC timestamp and the class
    # weight; '_' and '.' are swapped for '-' before it is sent to SageMaker.
    job_name = "{}-{}-{}".format(
        diag[:10], strftime("%d-%H-%M-%S", gmtime()), scale_pos_weight
    ).replace('_', '-').replace('.', '-')

    xgb_model = sagemaker.estimator.Estimator(
        container,
        role,
        train_instance_count=1,
        train_instance_type='ml.m4.xlarge',  # remote training instance
        train_volume_size=5,
        output_path=location,
        # Spot training: the wait limit (3600) must cover the run limit (3000).
        train_use_spot_instances=True,
        train_max_wait=3600,
        train_max_run=3000,
        sagemaker_session=sagemaker.Session())

    xgb_model.set_hyperparameters(
        alpha=alpha,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
        eta=0.2,
        gamma=4,
        min_child_weight=min_child_weight,
        subsample=subsample,
        silent=0,
        objective='binary:logistic',
        scale_pos_weight=scale_pos_weight,
        num_round=num_round,
        eval_metric='auc')

    xgb_model.fit(data_channels, job_name=job_name)

    # Hand the fitted estimator back, matching what train_model() does.
    return xgb_model

## HyperparameterTuner Example

In [None]:
def train_model(data_channels, location, scale_pos_weight=1):
    """Launch a random-search hyperparameter tuning job for XGBoost.

    Relies on the notebook-level globals ``container`` (training image) and
    ``role`` (execution role).

    Args:
        data_channels: Mapping of channel name to training input, passed
            unchanged to ``HyperparameterTuner.fit``.
        location: S3 URI used as the estimator's ``output_path``.
        scale_pos_weight: Fixed XGBoost ``scale_pos_weight`` for every trial.

    Returns:
        The base ``sagemaker.estimator.Estimator`` the tuner was built from
        (not the tuner itself).
    """
    estimator = sagemaker.estimator.Estimator(
        container,
        role,
        train_instance_count=1,
        train_instance_type='ml.m4.4xlarge',  # remote training instance
        train_volume_size=5,
        output_path=location,
        sagemaker_session=sagemaker.Session(),
    )

    # Hyperparameters held constant across all tuning trials.
    fixed_params = {
        'eta': 0.2,
        'gamma': 4,
        'silent': 0,
        'objective': 'binary:logistic',
        'scale_pos_weight': scale_pos_weight,
        'eval_metric': 'auc',
    }
    estimator.set_hyperparameters(**fixed_params)

    # Ranges the tuner samples from.
    search_space = {
        'lambda': ContinuousParameter(0.01, 10),
        'max_depth': IntegerParameter(3, 9),
        'colsample_bytree': ContinuousParameter(0.01, 0.5),
        'alpha': ContinuousParameter(0.01, 0.1),
        'num_round': IntegerParameter(50, 2000),
        'subsample': ContinuousParameter(0.6, 1),
        'min_child_weight': ContinuousParameter(10, 100),
    }

    # Trials are ranked on the AUC reported for the 'validation' channel.
    tuner = HyperparameterTuner(
        estimator,
        'validation:auc',
        search_space,
        max_jobs=100,
        max_parallel_jobs=3,
        strategy='Random',
    )

    # The tuning job is named after the current UTC day-hour-minute-second.
    tuning_job_name = strftime("%d-%H-%M-%S", gmtime()).replace('_', '-')

    tuner.fit(
        data_channels,
        include_cls_metadata=False,
        job_name=tuning_job_name,
        wait=True,
    )

    return estimator

## Some utility functions

In [None]:
def upload_json_dump(bucket, dir_path, collection, timestamp, incremental_id, json_dump, log=None):
    """Upload one JSON dump to a bucket directory, exiting the process on failure.

    The object is named ``<collection>-<timestamp>-<incremental_id>.json``.
    ``create_bucket_directory`` and ``upload_to_bucket`` are helpers that must
    be defined elsewhere in the notebook.

    Args:
        bucket: Bucket handle or name, forwarded to the helpers.
        dir_path: Target directory inside the bucket.
        collection: Collection name used in the file name.
        timestamp: Timestamp used in the file name and the progress message.
        incremental_id: Counter used in the file name and the progress message.
        json_dump: Payload handed to ``upload_to_bucket``.
        log: Logger-like object with ``info`` and ``error``; when omitted, the
            module logger from the standard ``logging`` package is used.

    Raises:
        SystemExit: With exit code 1 if any step fails (after logging the
            traceback).
    """
    # Imported locally so the function does not depend on the notebook's
    # import cell, which imports neither module.
    import logging
    import traceback

    # The default used to be dereferenced directly, so both the progress
    # message and the error handler crashed with AttributeError on None.
    if log is None:
        log = logging.getLogger(__name__)

    try:
        create_bucket_directory(bucket, dir_path)
        log.info('processed {} records on {}'.format(incremental_id, timestamp))
        file_name = '{}-{}-{}.json'.format(collection, timestamp, incremental_id)
        upload_to_bucket(bucket, dir_path, file_name, json_dump)
    except Exception:
        # format_exc() returns the traceback text; print_exc() returns None,
        # which is what previously ended up in the log.
        log.error(traceback.format_exc())
        # Same effect as the builtin exit(1) in a plain interpreter.
        raise SystemExit(1)
        
def download_model(bucket_name, general_path, prefix, job_name):
    """Download a training job's ``model.tar.gz`` from S3 (example of fetching any S3 file).

    The S3 key is ``general_path + prefix + job_name + '/output/model.tar.gz'``
    and the local target is ``prefix + job_name + '.tar.gz'``. The pieces are
    plain string concatenations, so callers supply any separators themselves.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        general_path: Leading part of the S3 key.
        prefix: Middle part of the S3 key; also used as the local directory
            (and file-name prefix) for the download.
        job_name: Training job name.

    Returns:
        The local path the archive was written to.
    """
    # bucket_name and general_path used to be overwritten with hard-coded
    # placeholders here, which made both arguments dead; they are honoured now.
    file_name = '/output/model.tar.gz'
    download_name = prefix + job_name + '.tar.gz'

    # An empty prefix means "current directory"; os.makedirs('') would raise.
    if prefix:
        os.makedirs(prefix, exist_ok=True)

    s3 = boto3.client('s3')
    s3.download_file(bucket_name, general_path + prefix + job_name + file_name, download_name)

    return download_name

def read_file(bucketname, itemname):
    """Read an S3 object straight into memory instead of downloading it to disk.

    Args:
        bucketname: Name of the S3 bucket.
        itemname: Key of the object inside the bucket.

    Returns:
        The object's content as returned by reading the response ``Body``.
        Earlier versions read the body and discarded it, returning ``None``.
    """
    s3 = boto3.resource('s3')
    obj = s3.Object(bucketname, itemname)
    body = obj.get()['Body'].read()
    return body
    
def load_model(cols_input, model_location='model.tar.gz'):
    """Unpack a model archive and load the pickled model with readable feature names.

    The archive is extracted into the current working directory and the file
    named ``xgboost-model`` is unpickled from there.

    Args:
        cols_input: Column names to assign to the model's features, in the
            same order as ``model.feature_names``.
        model_location: Path to the ``.tar.gz`` archive.

    Returns:
        The unpickled model with ``feature_names`` replaced by ``cols_input``.
    """
    # SECURITY: extracting an archive and unpickling its content can execute
    # arbitrary code; only use this on archives you produced yourself.
    # Context managers close the archive and the file even if a step fails.
    with tarfile.open(model_location) as tar:
        tar.extractall()

    with open('xgboost-model', 'rb') as model_file:
        model = pkl.load(model_file)

    # Pair each existing feature name with its replacement, then keep the
    # replacements in order.
    map_names = dict(zip(model.feature_names, cols_input))
    model.feature_names = list(map_names.values())
    return model

def get_test_data(bucket_name, general_path, prefix, job_name):
    """Download the uploaded test split (``test/test.csv``) from S3.

    The S3 key is ``general_path + prefix + 'test/test.csv'`` and the local
    target is ``prefix + 'test.csv'``. Both are plain string concatenations,
    so callers supply any separators themselves.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        general_path: Leading part of the S3 key.
        prefix: Middle part of the S3 key; also prefixes the local file name.
        job_name: Accepted for call compatibility; not used by this function.

    Returns:
        The local path the CSV was written to.
    """
    file_name = 'test/test.csv'
    download_name = prefix + 'test.csv'
    s3 = boto3.client('s3')
    # The key used to be built from `prefix_diag`, a name this function never
    # defines; the `prefix` argument is what the local file name already uses.
    s3.download_file(bucket_name, general_path + prefix + file_name, download_name)
    return download_name
