In [1]:
# import pandas as pd

# import boto3
# import re
# import os
# import sagemaker
# from sagemaker import get_execution_role

# bucket = sagemaker.Session().default_bucket()        
# region = boto3.Session().region_name    
# role = get_execution_role()
# bucket

In [1]:
import pandas as pd
import numpy as np
from tsfresh import select_features
import xgboost
import seaborn as sns
import matplotlib.pyplot as plt

import math, datetime 


from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
# from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from feature_selection import FeatureSelector

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

%matplotlib

Using matplotlib backend: Qt5Agg


In [4]:
# inputBucket='cmg-kudzu-text-analysis' 

# data_key = 'glue/test/exhaustive_extracted_features.pkl'
# exhaustive_extracted_features_s3file = '{}/{}'.format(inputBucket, data_key)

# data_key = 'glue/test/y_df_for_feature_selection.pkl'
# y_df_for_feature_selection_s3file = '{}/{}'.format(inputBucket, data_key)

# data_key = 'glue/test/full_features.csv'
# original_full_features_s3file = '{}/{}'.format(inputBucket, data_key)

# print('Location {} {}'.format(exhaustive_extracted_features_s3file,y_df_for_feature_selection_s3file))

In [5]:
# import s3fs
# import pandas as pd
# fs = s3fs.S3FileSystem(anon=False)

# # Pickle
# with fs.open(exhaustive_extracted_features_s3file) as f:
#     full_features = pd.read_pickle(f)

# with fs.open(y_df_for_feature_selection_s3file) as f:
#     y_df = pd.read_pickle(f)

# # with fs.open(original_full_features_s3file) as f:
# #     original_full_features = pd.read_csv(f)


In [2]:
# all extracted features from tsfresh
# the extracted features are the same for all models
full_features = pd.read_pickle('/Users/Rohil/Documents/iGEM/yemen/exhaustive_extracted_features_new.pkl')

# test values for different time frames (in a nice format)
y_df = pd.read_pickle('/Users/Rohil/Documents/iGEM/yemen/y_df_for_feature_selection_new.pkl')

In [3]:
full_features = pd.concat([full_features, pd.get_dummies(full_features.gov_iso)], axis=1)

In [4]:
# ignoring anything that happens before Jul 1, as there's not enough data for feature calculation
full_features = full_features[full_features.date>='2017-07-01']
y_df = y_df[y_df.date>='2017-07-01']

In [5]:
ESTIMATORS = 750
EARLY_STOPPING_ROUNDS = 50

In [6]:
def scale_features(scaler, X):

    continuous_cols = [col for col in X.columns if 'YE-' not in col]
    
    X_scaled = X
    
    X_scaled[continuous_cols] = scaler.transform(X[continuous_cols])
    
    return (X_scaled)

In [7]:
def train_cv_test_split(data, cv_split_date='2017-09-17', test_split_date='2017-11-06'):
    
    data_train, data_cv, data_test = data[(data.date<cv_split_date)], data[(data.date<test_split_date) & (data.date>=cv_split_date)], data[ (data.date>=test_split_date) ]
    
    return (data_train.drop('date', axis=1), data_cv.drop('date', axis=1), data_test.drop('date', axis=1))

In [8]:
HOLD_OUT_WINDOW = 100
ROLLING_WINDOW_SIZE = 60
WINDOW = round(ROLLING_WINDOW_SIZE/3)

def getMaxDate():
    return y_df.date.max() - datetime.timedelta(days=HOLD_OUT_WINDOW)

def getMinDate():
    return y_df.date.min()

delta = getMaxDate() - getMinDate()
print('Start {} Stop {} No of days {}'.format(getMinDate(), getMaxDate(), delta.days))

number_rolling_windows = math.ceil(round(delta.days/WINDOW))  - 1
number_rolling_windows


Start 2017-07-01 00:00:00 Stop 2017-11-10 00:00:00 No of days 132


6

In [9]:
def getHoldOutDate():
    start = getMaxDate() + datetime.timedelta(days=1)
    end   = y_df.date.max() 
    
    return (start, end)

In [10]:
def getRollingWindowDates(idx):
    maxDate = getMaxDate()  
    minDate = getMinDate() 
    
    trainStart = minDate
    trainStop  = minDate + datetime.timedelta(days=WINDOW*idx)
    
    validationStart  = trainStop + datetime.timedelta(days=1)
    validationStop   = validationStart + datetime.timedelta(days=WINDOW)
    
    testStart  = validationStop + datetime.timedelta(days=1)
    testStop   = testStart + datetime.timedelta(days=WINDOW)
    
    if (maxDate - testStop).days < WINDOW:
        print('Rolling window to end date')
        testStop = maxDate
        
    print('Train [{} {}] Val [{} {}] Test [{} {}]'.format(trainStart.date(), trainStop.date(), 
                                                       validationStart.date(), validationStop.date(), 
                                                       testStart.date(), testStop.date()))
    
    return (trainStart, trainStop, validationStart, validationStop, testStart, testStop)

In [11]:
def getRollingWindow(data, trainStart, trainStop, validationStart, validationStop, testStart, testStop):
    
    train = data[(data.date >= trainStart) & (data.date <= trainStop)]
    val   = data[(data.date >= validationStart) & (data.date <= validationStop)]
    test  = data[(data.date >= testStart) & (data.date <= testStop)] 
    
    print('Window Train/Val/Test shape {} {} {}'.format(train.shape, val.shape, test.shape))
    
    return (train.drop('date', axis=1), val.drop('date', axis=1), test.drop('date', axis=1))

In [12]:
def getHoldOutData(data):
    minDate = getMinDate() 
    start, end = getHoldOutDate()

    train = data[(data.date >= minDate) & (data.date < start)]
    test =  data[(data.date >= start) & (data.date <= end)]
    
    return (train.drop('date', axis=1), test.drop('date', axis=1))

In [13]:
number_rolling_windows, round((ROLLING_WINDOW_SIZE)/3), getHoldOutDate()

(6, 20, (Timestamp('2017-11-11 00:00:00'), Timestamp('2018-02-18 00:00:00')))

In [14]:
class TsFresh():
    def __init__(self):
        pass
    
    def postProcessor(self, X, y, dateSeries):
        self.selected_features = select_features(X, y, fdr_level=0.001)
        print('Selected features {}'.format(self.selected_features.shape))      
        self.selected_features = pd.concat([dateSeries, self.selected_features], axis=1)

In [15]:
class OptimizeFeatures():
    def __init__(self):
        pass
        
    def selectFeatures(self, X, y):
        self.X  = X
        self.y  = y
        self.fs = FeatureSelector(data = X, labels = y)        
    
    def identifyCollinearFeatures(self, correlation_threshold=0.975):
        self.fs.identify_collinear(correlation_threshold)
    
    def collinerFeaturesColumnsToKeep(self):
        return self.fs.ops['collinear']
    
    def removeCollinerFeatures(self):
        self.cols_to_keep = set(self.X.columns) - set(self.fs.ops['collinear'])
        self.corr_selected_features = self.X[list(self.cols_to_keep)]


In [16]:
class ModelCustomRegressor():
    
    def __init__(self):
        pass
    
    def extract(self, model_params, X_train, X_cv, X_test, y_train, y_cv, y_test):
        print('Creating baseline model to extract features')
        
        X_train_cv = X_train.append(X_cv)
        y_train_cv = y_train.append(y_cv)
        
        scaler = StandardScaler()
        continuous_cols = [col for col in X_train_cv.columns if 'YE-' not in col]
        scaler.fit(X_train_cv[continuous_cols])

        X_train_cv, X_test = scale_features(scaler, X_train_cv), scale_features(scaler, X_test)
        
        print('all features {}'.format(X_train_cv.shape))
            
        eval_set = [(X_test, y_test)]
    
        self.feature_importance_df = pd.DataFrame(index = X_train.columns)
        
        self.regressor = xgboost.XGBRegressor(**model_params)                                      
        
        self.regressor.fit(X_train_cv, y_train_cv, eval_metric='rmse', 
                                  eval_set=eval_set, 
                                  early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)

        preds = self.regressor.predict((X_test))
        
        self.feature_importance_df['threshold'] = self.regressor.feature_importances_
        
        self.thresholds = np.unique(self.regressor.feature_importances_)
        self.thresholds.sort();
        self.thresholds = self.thresholds[::-1][:35]
        
        print('Thresholds are {} {} {}'.format(len(self.regressor.feature_importances_), len(self.thresholds), self.thresholds))

In [17]:
class RollingWindowCrossValidation():
    
    def __init__(self, corr_selected_features_bi_week, corr_selected_features_bi_week_y, preselect_params):
        self.corr_selected_features_bi_week = corr_selected_features_bi_week
        self.corr_selected_features_bi_week_y   = corr_selected_features_bi_week_y
        self.preselect_params = preselect_params
        pass
    
    def extract(self, regressor, thresh):
        
        mse_list = []
        
        for idx in range(3, number_rolling_windows):
            print('CV - Window {}'.format(idx))
            trainStart, trainStop, validationStart, validationStop, testStart, testStop = getRollingWindowDates(idx)
            
            X_train, X_cv, X_test = getRollingWindow(self.corr_selected_features_bi_week, 
                                                     trainStart, trainStop, 
                                                     validationStart, validationStop, testStart, testStop)
            y_train, y_cv, y_test = getRollingWindow(self.corr_selected_features_bi_week_y, 
                                                     trainStart, trainStop, 
                                                     validationStart, validationStop, testStart, testStop)
            
            X_train_cv = X_train.append(X_cv)
            y_train_cv = y_train.append(y_cv)
            
            scaler = StandardScaler()
            continuous_cols = [col for col in X_train_cv.columns if 'YE-' not in col]
            scaler.fit(X_train_cv[continuous_cols])

            X_train_cv, X_test = scale_features(scaler, X_train_cv), scale_features(scaler, X_test)
        
            print('X_train_cv  {}'.format(X_train_cv.shape))
            
            # select features using threshold
            selection = SelectFromModel(regressor, threshold=thresh, prefit=True)
            select_X_train_cv = selection.transform(X_train_cv)
            
            select_X_test = selection.transform(X_test)
            
            eval_set = [(select_X_test, y_test)]
            
            # train model
            selection_model = xgboost.XGBRegressor(**self.preselect_params)
            
            selection_model.fit(select_X_train_cv, y_train_cv, eval_metric='rmse', 
                                      eval_set=eval_set, 
                                      early_stopping_rounds=50,
                                      verbose=False)
            # eval model
            y_pred = selection_model.predict(select_X_test)
           
            mse = mean_squared_error(y_test, y_pred)

            print("%d Thresh=%.5f, n=%d, mse: %.3f, mse: %.4f" % (idx, thresh, select_X_train_cv.shape[1], mse))
            
            mse_list.append(mse)
            ev_list.append(ev)
        
        lst = [thresh, select_X_train_cv.shape[1]] + mse_list
        return_dict = dict(zip(['threshold', 'num_features', 'mse4', 'mse5', 'mse6', 'mse7', 'mse8'], lst))
        print (return_dict)
        return (return_dict)
        

In [18]:
class RollingCustomFeatureExtractor():
    
    def __init__(self, corr_selected_features_bi_week, corr_selected_features_bi_week_y, preselect_params):
        self.corr_selected_features_bi_week  = corr_selected_features_bi_week
        self.corr_selected_features_bi_week_y   = corr_selected_features_bi_week_y
        self.preselect_params = preselect_params
        pass
    
    def extract(self):
        X = self.corr_selected_features_bi_week 
        y = self.corr_selected_features_bi_week_y 
        
        X_train, X_cv, X_test = train_cv_test_split(X)
        y_train, y_cv, y_test = train_cv_test_split(y)
        
        self.mcr = ModelCustomRegressor()
        self.mcr.extract(self.preselect_params, X_train, X_cv, X_test, y_train, y_cv, y_test) 
            
        thresholds = self.mcr.thresholds        
    
        self.rwcv = RollingWindowCrossValidation(self.corr_selected_features_bi_week, 
                                                 self.corr_selected_features_bi_week_y,
                                                 self.preselect_params)
        
        self.summary = pd.DataFrame(columns = ['threshold', 'num_features', 'mse3', 'mse4', 'mse5', 'ev3', 'ev4', 'ev5' ])
        
        for thresh in thresholds:
            return_dict = self.rwcv.extract(self.mcr.regressor, thresh)
            
            self.summary = self.summary.append(return_dict, ignore_index = True)
            
            print("\n")
            
        print(self.summary.head(20))


In [19]:
def set_style(color):
    plt.style.use(['seaborn-' + color, 'seaborn-paper'])

In [20]:
def plot_pred_against_actual(pred_crosstab_dict, true_crosstab, y_label):
    
    set_style('white')
    
    fig, ax = plt.subplots(21,1,figsize = (6,15), sharex=True, sharey = False)
    
    cols = true_crosstab.columns
    
    minDate = getMinDate() 
    start, end = getHoldOutDate()
    
    train_pred_crosstab = {}
    test_pred_crosstab = {}
    
    for key, value in pred_crosstab_dict.items():

        train_pred_crosstab[key] = pred_crosstab_dict[key].loc[(pred_crosstab_dict[key].index >= minDate) & (pred_crosstab_dict[key].index < start)]
        test_pred_crosstab[key] =  pred_crosstab_dict[key].loc[(pred_crosstab_dict[key].index >= start) & (pred_crosstab_dict[key].index <= end)]
    
    for i in range(0,21):
        
        true_crosstab[cols[i]].plot(kind='line', ax = ax[i], label = 'true_val', legend = True, color = 'red')

        ax[i].set_prop_cycle('color', ['seagreen', 'blue', 'plum', 'magenta'])
        
        for key, value in pred_crosstab_dict.items():
                    
            train_pred_crosstab[key][cols[i]].plot(kind='line', ax = ax[i], label= key + ' train-pred',linestyle= '-.', legend = True)
        
            test_pred_crosstab[key][cols[i]].plot(kind='line', ax = ax[i], label= key + ' test-pred', linestyle= '-.',legend = True)   

          
        ax[i].legend().set_visible(False)
        ax[i].set_ylabel(cols[i])
        ax[i].yaxis.set_label_position('right')
        ax[i].spines['right'].set_visible(False)
        ax[i].spines['top'].set_visible(False)
        ax[i].spines['bottom'].set_visible(True)
        
    ax[10].legend().set_visible(True)
    ax[10].legend(fontsize=10, loc='center left', bbox_to_anchor=(1.05, 0.5))
 
    fig.subplots_adjust(hspace = .2)    
        
    fig.savefig('/Users/Rohil/Documents/iGEM/yemen/' + y_label + '_deployed.png', dpi = 500, bbox_inches = 'tight')
    
    plt.close('all')

In [55]:
class BayesianOptimizer():

    def __init__(self, corr_selected_features_bi_week, corr_selected_features_bi_week_y, max_evals):
        self.corr_selected_features_bi_week  = corr_selected_features_bi_week
        self.corr_selected_features_bi_week_y   = corr_selected_features_bi_week_y
        self.max_evals = max_evals
    
    def objective(self, space):
    
        mse_list = []

        for idx in range(3, number_rolling_windows):

            trainStart, trainStop, validationStart, validationStop, testStart, testStop = getRollingWindowDates(idx)

            X_train, X_cv, X_test = getRollingWindow(self.corr_selected_features_bi_week, 
                                                     trainStart, trainStop, 
                                                     validationStart, validationStop, testStart, testStop)
            y_train, y_cv, y_test = getRollingWindow(self.corr_selected_features_bi_week_y, 
                                                     trainStart, trainStop, 
                                                     validationStart, validationStop, testStart, testStop)

            X_train_cv = X_train.append(X_cv)
            y_train_cv = y_train.append(y_cv)

            scaler = StandardScaler()
            continuous_cols = [col for col in X_train_cv.columns if 'YE-' not in col]
            scaler.fit(X_train_cv[continuous_cols])

            X_train_cv, X_test = scale_features(scaler, X_train_cv), scale_features(scaler, X_test)

            xgb = xgboost.XGBRegressor(n_estimators = space['n_estimators'],
                           max_depth = space['max_depth'],
                           min_child_weight = space['min_child_weight'],
                           subsample = space['subsample'],
                           learning_rate = space['learning_rate'],
                           gamma = space['gamma'],
                           colsample_bytree = space['colsample_bytree'],
                           objective='reg:linear', n_jobs = -1
                           )    

            xgb.fit(X_train_cv ,y_train_cv, eval_metric = 'rmse')

            # eval model
            y_pred = xgb.predict(X_test)

            mse = mean_squared_error(y_test, y_pred)    

            mse_list.append(mse)
            
        print(mse_list)    

        return_dict = dict(zip(['mse3', 'mse4', 'mse5'], mse_list))

        return_dict['loss'] = np.mean(mse_list)
        return_dict['status'] = STATUS_OK

        print ("mean mse:", return_dict['loss'])

        return (return_dict)

    def run(self):
        
        self.space ={'max_depth': hp.choice('max_depth', np.arange(3, 12, dtype=int)),
                'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
                'subsample':  hp.quniform('subsample', 0.3, 1, 0.05),
                'n_estimators' : hp.quniform('n_estimators', 50, 750, 50),
                'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.025),
                'gamma' : hp.quniform('gamma', 0, 5, 0.5),
                'colsample_bytree' : hp.quniform('colsample_bytree', 0.3, 1, 0.05)}
        
        self.trials = Trials()
        self.best = fmin(fn=self.objective,
                    space=self.space,
                    algo=tpe.suggest,
                    max_evals=self.max_evals, 
                    trials=self.trials)



In [22]:
class DeployRegressor():
    
    def __init__(self):
        pass
    
    def execute(self, model_params, X_train, X_test, y_train, y_test, y_to_plot, y_bi_week_label):
        print('Creating baseline model to train on final features')
        
        continuous_cols = [col for col in X_train.columns if 'YE-' not in col]

        scaler = StandardScaler()
        scaler.fit(X_train[continuous_cols])

        X_train, X_test = scale_features(scaler, X_train), scale_features(scaler, X_test)
        
        self.xgb = xgboost.XGBRegressor(**model_params)
        print (self.xgb)
        
        self.linreg = LinearRegression()
        
        self.xgb.fit(X_train, y_train, eval_metric='rmse', verbose=False)
        self.linreg.fit(X_train,y_train)
        
        print (self.xgb)

        y_pred_xgb = self.xgb.predict((X_test))
        y_pred_all_xgb = self.xgb.predict(X_train.append(X_test))
        
        y_pred_linreg = self.linreg.predict((X_test))
        y_pred_all_linreg = self.linreg.predict(X_train.append(X_test))
        
        y_test_merged = y_test.merge(y_to_plot, how='left', left_index=True, right_index=True, copy=False)
        
        y_pred_all_xgb = pd.DataFrame(y_pred_all_xgb, columns=[y_bi_week_label], index = y_to_plot.index)
        y_pred_all_xgb[['gov_iso', 'date']] = y_to_plot[['gov_iso', 'date']]
        
        y_pred_all_linreg = pd.DataFrame(y_pred_all_linreg, columns=[y_bi_week_label], index = y_to_plot.index)
        y_pred_all_linreg[['gov_iso', 'date']] = y_to_plot[['gov_iso', 'date']]
        
        fig, ax = plt.subplots(figsize=(6,15))
        
        true_val_pivot = y_to_plot.pivot_table(index = 'date', columns = 'gov_iso', values = y_bi_week_label, aggfunc='sum')
        xgb_pred_val_pivot = y_pred_all_xgb.pivot_table(index = 'date', columns = 'gov_iso', values = y_bi_week_label, aggfunc='sum')
        linreg_pred_val_pivot = y_pred_all_linreg.pivot_table(index = 'date', columns = 'gov_iso', values = y_bi_week_label, aggfunc='sum')
 
        pred_crosstab_dict = {'xgboost':xgb_pred_val_pivot}#, 'linreg':linreg_pred_val_pivot}

        plot_pred_against_actual(pred_crosstab_dict, true_val_pivot, y_bi_week_label)
        
        fig1, ax1 = plt.subplots()
        xgboost.plot_importance(self.xgb, ax=ax1)
        fig1.savefig('/Users/Rohil/Documents/iGEM/yemen/' + y_bi_week_label + '_deployed_feature_importances.png', dpi = 300, bbox_inches = 'tight')
        plt.close('all')
        
        mse_xgb = mean_squared_error(y_test, y_pred_xgb)
        ev_xgb = explained_variance_score(y_test, y_pred_xgb)
        
        mse_linreg = mean_squared_error(y_test, y_pred_linreg)
        ev_linreg = explained_variance_score(y_test, y_pred_linreg)

        print('y-test mean {}, y-test std {}'.format(np.mean(y_test.values), np.std(y_test.values)))
        print('xgb mse {}, xgb ev {}'.format(mse_xgb, ev_xgb))
        print('linreg mse {}, linreg ev {}'.format(mse_linreg, ev_linreg))

        

In [23]:
class Orchestrator():
    
    def __init__(self, full_data_bi_week, y_bi_week_label):
        self.full_data_bi_week = full_data_bi_week
        self.y_bi_week_label   = y_bi_week_label
        pass
    
    def runTsFresh(self):
        print('Running TSFresh....')
        
        X_ts = self.full_data_bi_week.drop(columns=['date', 'gov_iso', y_bi_week_label])
        y_ts = self.full_data_bi_week[self.y_bi_week_label]
        dateSeries_ts = self.full_data_bi_week.date
        
        self.tf = TsFresh()
        self.tf.postProcessor(X_ts, y_ts, dateSeries_ts)
        
        print('Finished running TSFresh....')
        
    def runOptimizeFeatures(self):
        print('Running Feature Selection module ....')
        
        y = self.full_data_bi_week[self.y_bi_week_label]
            
        self.op = OptimizeFeatures()
        self.op.selectFeatures(self.tf.selected_features, y)
        self.op.identifyCollinearFeatures(0.975)
        
        self.op.removeCollinerFeatures()
        print('Original {} and after {}'.format(self.op.X.shape, self.op.corr_selected_features.shape))
        
        print('Finished running Feature Selection ....')
        
    def performHyperparameterOptimization(self, X, max_evals):
        self.bo = BayesianOptimizer(X, self.full_data_bi_week[['date', self.y_bi_week_label]], max_evals)
        self.bo.run()
    
    def runRollingCustomFeatureExtractor(self, preselect_params, corr_selected_features):
        print('Running custom feature selection module ....')
        
        # user can specify their corr_selected_features if they please (if the job has been batched)
        if corr_selected_features is None:
            self.corr_selected_features_bi_week = self.op.corr_selected_features
        else:
            self.corr_selected_features_bi_week = corr_selected_features
        
        self.corr_selected_features_bi_week_y = self.full_data_bi_week[['date', self.y_bi_week_label]]

        #corr_selected_features_4_6 = pd.concat([full_data_4_6.date, corr_selected_features_4_6], axis=1)
        #corr_selected_features_4_6.date.head()

        """
        X = corr_selected_features_4_6 
        y = corr_selected_features_4_6_y 
        
        X_train, X_cv, X_test = train_cv_test_split(X)
        y_train, y_cv, y_test = train_cv_test_split(y)
        """

        self.rcfe = RollingCustomFeatureExtractor(self.corr_selected_features_bi_week, 
                                                  self.corr_selected_features_bi_week_y, preselect_params)
        self.rcfe.extract()

    def deploy(self, threshold, model_params):
        
        #selected_features_from_threshold = list(self.rcfe.mcr.feature_importance_df[self.rcfe.mcr.feature_importance_df.threshold >= 0.003145].index)
        selected_features_from_threshold = list(self.rcfe.mcr.feature_importance_df[self.rcfe.mcr.feature_importance_df.threshold >= threshold].index)
        selected_features_from_threshold.append('date')

        X = self.corr_selected_features_bi_week[selected_features_from_threshold]
        y = self.corr_selected_features_bi_week_y 
        
        X['days_from'] = self.full_data_bi_week.days_from
        categorical_cols =  [col for col in self.full_data_bi_week.columns if 'YE-' in col]
        
        X_train, X_hold_test = getHoldOutData(X)
        y_train, y_hold_test = getHoldOutData(y)

        y_to_plot = self.corr_selected_features_bi_week_y.merge(full_features, on = 'date', left_index=True, right_index=True, how = 'left')[[self.y_bi_week_label, 'gov_iso', 'date']]
                
        self.dr = DeployRegressor()
        self.dr.execute(model_params, X_train, X_hold_test, y_train, y_hold_test, y_to_plot, self.y_bi_week_label)
        
        

In [24]:
print (y_df.date.min())
print (y_df.date.max())

2017-07-01 00:00:00
2018-02-18 00:00:00


In [25]:
y1_2 = y_df[['date', 'gov_iso', 'week_1_to_2_cases']]
y2_4 = y_df[['date', 'gov_iso', 'week_2_to_4_cases']]
y4_6 = y_df[['date', 'gov_iso', 'week_4_to_6_cases']]
y6_8 = y_df[['date', 'gov_iso', 'week_6_to_8_cases']]

In [26]:
# instead of creating copies here, will run these lines directly in the object instantiation
# full_data_1_2 = y1_2.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date')
# full_data_2_4 = y2_4.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date')
# full_data_4_6 = y4_6.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date')
# full_data_6_8 = y6_8.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date')

In [29]:
# running feature selection for 1-2 week model

In [38]:
# orchestrator12 = Orchestrator(full_features, full_data_1_2, 'week_1_to_2_cases' )

In [39]:
# orchestrator12.runTsFresh()

Running TSFresh....
Selected features (4599, 15252)
Finished running TSFresh....


In [40]:
# orchestrator12.runOptimizeFeatures()

Running Feature Selection module ....
4825 features with a correlation magnitude greater than 0.97.

Original (4599, 15253) and after (4599, 10428)
Finished running Feature Selection ....


In [62]:
# X_12_preselect = orchestrator12.op.corr_selected_features
# y_12_preselect = orchestrator12.full_data_bi_week[['date', orchestrator12.y_bi_week_label]]

In [63]:
# X_12_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_12_preselect.csv')
# y_12_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_12_preselect.csv')

In [36]:
orchestrator12 = Orchestrator(y1_2.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date'), 'week_1_to_2_cases' )

In [37]:
X_12_preselect = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_12_preselect.csv', index_col = 0)
X_12_preselect.date = pd.to_datetime(X_12_preselect.date, format = '%Y-%m-%d')

In [38]:
orchestrator12.performHyperparameterOptimization(X_12_preselect, 15)

Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 10428) (441, 10428) (441, 10428)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)
Rolling window to end date
Train [2017-07-01 2017-09-19] Val [2017-09-20 2017-10-10] Test [2017-10-11 2017-11-10]
Window Train/Val/Test shape (1701, 10428) (441, 10428) (651, 10428)
Window Train/Val/Test shape (1701, 2) (441, 2) (651, 2)
Rolling window to end date
Train [2017-07-01 2017-10-09] Val [2017-10-10 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2121, 10428) (441, 10428) (231, 10428)
Window Train/Val/Test shape (2121, 2) (441, 2) (231, 2)
[158.14729219482425, 36.276595075232485, 12.067594368179682]
mean mse: 68.83049387941213
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 10428) (441, 10428) (441, 10428)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)
Rolling windo

Window Train/Val/Test shape (1701, 2) (441, 2) (651, 2)
Rolling window to end date
Train [2017-07-01 2017-10-09] Val [2017-10-10 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2121, 10428) (441, 10428) (231, 10428)
Window Train/Val/Test shape (2121, 2) (441, 2) (231, 2)
[130.70873316698732, 51.32581777659676, 8.837462121635722]
mean mse: 63.62400435507326
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 10428) (441, 10428) (441, 10428)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)
Rolling window to end date
Train [2017-07-01 2017-09-19] Val [2017-09-20 2017-10-10] Test [2017-10-11 2017-11-10]
Window Train/Val/Test shape (1701, 10428) (441, 10428) (651, 10428)
Window Train/Val/Test shape (1701, 2) (441, 2) (651, 2)
Rolling window to end date
Train [2017-07-01 2017-10-09] Val [2017-10-10 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2121, 10428) (441, 10428) (

In [None]:
orchestrator12.runRollingCustomFeatureExtractor()

In [None]:
orchestrator12.rcfe.summary[['mean_mse', 'mean_ev'] ]= pd.concat([orchestrator12.rcfe.summary[['mse3', 'mse4', 'mse5']].mean(axis=1), orchestrator12.rcfe.summary[['ev3', 'ev4', 'ev5']].mean(axis=1)], axis=1)

In [None]:
orchestrator12.rcfe.summary

In [41]:
# running feature selection for 2-4 week model

In [43]:
#orchestrator24.runTsFresh()

Running TSFresh....
Selected features (4305, 15146)
Finished running TSFresh....


In [44]:
#orchestrator24.runOptimizeFeatures()

Running Feature Selection module ....
4810 features with a correlation magnitude greater than 0.97.

Original (4305, 15147) and after (4305, 10337)
Finished running Feature Selection ....


In [64]:
# X_24_preselect = orchestrator24.op.corr_selected_features
# y_24_preselect = orchestrator24.full_data_bi_week[['date', orchestrator24.y_bi_week_label]]

In [65]:
# X_24_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_24_preselect.csv')
# y_24_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_24_preselect.csv')

In [33]:
orchestrator24 = Orchestrator(y2_4.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date'), 'week_2_to_4_cases' )

In [34]:
X_24_preselect = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_24_preselect.csv', index_col = 0)
X_24_preselect.date = pd.to_datetime(X_24_preselect.date, format = '%Y-%m-%d')

In [48]:
orchestrator24.performHyperparameterOptimization(X_24_preselect, 15)

NameError: name 'selected_features24' is not defined

In [46]:
orchestrator24.bo.trials.results

[{'mse3': 155.04661677842435,
  'mse4': 48.39547453326726,
  'mse5': 16.40732532428958,
  'loss': 73.28313887866041,
  'status': 'ok'},
 {'mse3': 90.59251910226182,
  'mse4': 39.77314941415267,
  'mse5': 19.462097068701862,
  'loss': 49.94258852837211,
  'status': 'ok'},
 {'mse3': 118.10273345619753,
  'mse4': 30.99137676337501,
  'mse5': 16.27247995375718,
  'loss': 55.122196724443235,
  'status': 'ok'},
 {'mse3': 125.74905921200056,
  'mse4': 49.61496894541222,
  'mse5': 18.907734017562326,
  'loss': 64.75725405832503,
  'status': 'ok'},
 {'mse3': 129.7935257717194,
  'mse4': 28.34819732021623,
  'mse5': 21.45706009585455,
  'loss': 59.86626106259673,
  'status': 'ok'},
 {'mse3': 93.35559516784028,
  'mse4': 52.31376863824956,
  'mse5': 17.072143802279985,
  'loss': 54.24716920278994,
  'status': 'ok'},
 {'mse3': 99.04239002646067,
  'mse4': 41.23309796099031,
  'mse5': 15.4869799107443,
  'loss': 51.920822632731756,
  'status': 'ok'},
 {'mse3': 128.43775647920617,
  'mse4': 40.56352

In [31]:
preselect_params_24 = {"alpha":0.370220603,
                        "colsample_bytree":0.313402452,
                        "early_stopping_rounds":100,
                        "eta":0.010141415,
                        "gamma":4,
                        "max_depth":10,
                        "min_child_weight":7.460119372,
                        "num_round":666,
                        "objective":"reg:linear",
                        "rate_drop":0.3,
                        "silent":1,
                        "subsample":0.617982585,
                        "tweedie_variance_power":1.4}

In [37]:
orchestrator24.runRollingCustomFeatureExtractor(preselect_params_24, X_24_preselect)

Running custom feature selection module ....
Creating baseline model to extract features
all features (2688, 10336)
Thresholds are 10336 12 [0.00473244 0.00400437 0.0032763  0.00291227 0.00254823 0.0021842
 0.00182017 0.00145613 0.0010921  0.00072807 0.00036403 0.        ]
CV - Window 3
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 10337) (441, 10337) (441, 10337)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)
X_train_cv  (1722, 10336)
3 Thresh=0.00473, n=1, mae: 6.714, mae: 0.5205
CV - Window 4
Rolling window to end date
Train [2017-07-01 2017-09-19] Val [2017-09-20 2017-10-10] Test [2017-10-11 2017-11-10]
Window Train/Val/Test shape (1701, 10337) (441, 10337) (651, 10337)
Window Train/Val/Test shape (1701, 2) (441, 2) (651, 2)
X_train_cv  (2142, 10336)
4 Thresh=0.00473, n=1, mae: 4.789, mae: 0.6182
CV - Window 5
Rolling window to end date
Train [2017-07-01 2017-10-09] Val [2017-10-10 2017-10-30] Test

In [38]:
orchestrator24.rcfe.summary[['mean_mse', 'mean_ev'] ]= pd.concat([orchestrator24.rcfe.summary[['mse3', 'mse4', 'mse5']].mean(axis=1), orchestrator24.rcfe.summary[['ev3', 'ev4', 'ev5']].mean(axis=1)], axis=1)

In [39]:
orchestrator24.rcfe.summary

Unnamed: 0,threshold,num_features,mae3,mae4,mae5,ev3,ev4,ev5,mean_mae,mean_ev
0,0.004732,1.0,6.713857,4.78851,3.058837,0.520485,0.618186,0.737413,4.853735,0.625361
1,0.004004,7.0,6.066353,4.482788,2.960974,0.594197,0.684869,0.788092,4.503371,0.689053
2,0.003276,9.0,6.1502,4.579348,2.617321,0.595059,0.692122,0.842587,4.448956,0.709923
3,0.002912,11.0,5.195972,4.187902,2.098032,0.689446,0.728749,0.872849,3.827302,0.763681
4,0.002548,14.0,5.29008,4.125579,2.330331,0.655798,0.744098,0.865379,3.91533,0.755092
5,0.002184,19.0,4.849784,3.881546,2.153589,0.664814,0.758317,0.897309,3.628306,0.77348
6,0.00182,28.0,5.307311,3.750501,2.223029,0.613516,0.81137,0.88707,3.76028,0.770652
7,0.001456,61.0,5.241932,3.443652,2.113019,0.616067,0.843697,0.889245,3.599534,0.783003
8,0.001092,121.0,4.635046,3.351,1.719289,0.718803,0.851212,0.944432,3.235112,0.838149
9,0.000728,448.0,4.597799,3.295822,1.849746,0.726613,0.844408,0.921553,3.247789,0.830858


In [294]:
selected_features24 = ['date'] + list(orchestrator24.rcfe.mcr.feature_importance_df[orchestrator24.rcfe.mcr.feature_importance_df.threshold >= 0.002384].index)

In [295]:
orchestrator24.corr_selected_features_bi_week[selected_features24].to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_2_4_postselect.csv')

In [296]:
orchestrator24.corr_selected_features_bi_week_y.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_2_4.csv')

In [297]:
orchestrator24.deploy(0.002384)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Creating baseline model to train on final features
y-test mean 5.756598093022273, y-test std 6.644771694378367
xgb mae 2.6350936469411717, xgb ev 0.6847313632395204
linreg mae 7.18744640746357, linreg ev -0.43011188106644305


In [45]:
# running feature selection for 4-6 week model

In [46]:
#orchestrator46 = Orchestrator(full_features, full_data_4_6, 'week_4_to_6_cases' )

In [47]:
# orchestrator46.runTsFresh()

Running TSFresh....
Selected features (4011, 15156)
Finished running TSFresh....


In [48]:
# orchestrator46.runOptimizeFeatures()

Running Feature Selection module ....
4773 features with a correlation magnitude greater than 0.97.

Original (4011, 15157) and after (4011, 10384)
Finished running Feature Selection ....


In [66]:
# X_46_preselect = orchestrator46.op.corr_selected_features
# y_46_preselect = orchestrator46.full_data_bi_week[['date', orchestrator46.y_bi_week_label]]

In [67]:
# X_46_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_46_preselect.csv')
# y_46_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_46_preselect.csv')

In [27]:
orchestrator46 = Orchestrator(y4_6.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date'), 'week_4_to_6_cases' )

In [51]:
X_46_preselect = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_46_postselect.csv', index_col = 0)
X_46_preselect.date = pd.to_datetime(X_46_preselect.date, format = '%Y-%m-%d')

In [56]:
orchestrator46.performHyperparameterOptimization(X_46_preselect, 5)

Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 38) (441, 38) (441, 38)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)


TypeError: 'float' object cannot be interpreted as an integer

In [53]:
orchestrator46.bo.best

{'colsample_bytree': 0.75,
 'gamma': 1.5,
 'learning_rate': 0.07500000000000001,
 'max_depth': 7,
 'min_child_weight': 7.0,
 'n_estimators': 9,
 'subsample': 0.65}

In [128]:
preselect_params_46 = {"alpha":1.871528645,
                        "colsample_bytree":0.735780303,
                        "eta":0.447389973,
                        "gamma":5,
                        "max_depth":3,
                        "min_child_weight":5.308204269,
                        "n_estimators":245,
                        "subsample":0.700554693}

In [129]:
orchestrator46.runRollingCustomFeatureExtractor(preselect_params_46, X_46_preselect)

Running custom feature selection module ....
Creating baseline model to extract features
all features (2688, 10383)
Thresholds are 10383 8 [0.00988701 0.00847458 0.00706215 0.00564972 0.00423729 0.00282486
 0.00141243 0.        ]
CV - Window 3
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 10384) (441, 10384) (441, 10384)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)
X_train_cv  (1722, 10383)
3 Thresh=0.00989, n=1, mae: 8.358, mae: 0.1810
CV - Window 4
Rolling window to end date
Train [2017-07-01 2017-09-19] Val [2017-09-20 2017-10-10] Test [2017-10-11 2017-11-10]
Window Train/Val/Test shape (1701, 10384) (441, 10384) (651, 10384)
Window Train/Val/Test shape (1701, 2) (441, 2) (651, 2)
X_train_cv  (2142, 10383)
4 Thresh=0.00989, n=1, mae: 5.631, mae: 0.2471
CV - Window 5
Rolling window to end date
Train [2017-07-01 2017-10-09] Val [2017-10-10 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Te

In [130]:
orchestrator46.rcfe.summary[['mean_mse', 'mean_ev'] ]= pd.concat([orchestrator46.rcfe.summary[['mse3', 'mse4', 'mse5']].mean(axis=1), orchestrator46.rcfe.summary[['ev3', 'ev4', 'ev5']].mean(axis=1)], axis=1)

In [131]:
orchestrator46.rcfe.summary

Unnamed: 0,threshold,num_features,mae3,mae4,mae5,ev3,ev4,ev5,mean_mae,mean_ev
0,0.009887,1.0,8.358064,5.630944,4.78763,0.18103,0.247126,0.302143,6.258879,0.243433
1,0.008475,3.0,6.395759,3.649424,2.657888,0.517343,0.712385,0.843598,4.234357,0.691108
2,0.007062,8.0,5.553512,3.425821,2.363195,0.516421,0.761176,0.862259,3.780843,0.713285
3,0.00565,15.0,4.89293,3.832818,1.700311,0.667814,0.67099,0.950006,3.475353,0.762937
4,0.004237,37.0,3.470011,3.511399,1.543845,0.866206,0.754164,0.930907,2.841752,0.850426
5,0.002825,105.0,3.920425,2.472226,1.692473,0.821133,0.898222,0.923454,2.695041,0.880936
6,0.001412,539.0,4.621823,2.674335,1.502465,0.787615,0.884923,0.918445,2.932874,0.863661
7,0.0,10383.0,5.119204,3.085985,1.603084,0.708015,0.796815,0.920167,3.269424,0.808332


In [64]:
# orchestrator462 = Orchestrator(y4_6.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date'), 'week_4_to_6_cases')
# orchestrator462.tf = orchestrator46.tf
# orchestrator462.op = orchestrator46.op
# orchestrator462.corr_selected_features_bi_week = orchestrator46.corr_selected_features_bi_week
# orchestrator462.corr_selected_features_bi_week_y = orchestrator46.corr_selected_features_bi_week_y
# orchestrator462.rcfe = orchestrator46.rcfe

In [133]:
selected_features46 = ['date'] + list(orchestrator46.rcfe.mcr.feature_importance_df[orchestrator46.rcfe.mcr.feature_importance_df.threshold >= 0.002825].index)

In [134]:
orchestrator46.corr_selected_features_bi_week[selected_features46].to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_4_6_postselect.csv')

In [300]:
#orchestrator462.corr_selected_features_bi_week_y.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_4_6.csv')

In [143]:
# postselect_params_46 = {"alpha":0.575274916,

#                         "colsample_bytree":0.332949988,

#                         "eta":0.037464608,

#                         "gamma":0,

#                         "max_depth":12,

#                         "min_child_weight":1.356638629,

#                         "n_estimators":278,

#                         "rate_drop":0.3,

#                         "subsample":0.863770325}

postselect_params_46 = {"alpha":1.847137286,
                        "colsample_bytree":0.87622179,
                        "eta":0.329094107,
                        "gamma":4,
                        "max_depth":8,
                        "min_child_weight":5.439424743,
                        "n_estimators":160,
                        "subsample":0.644658011
                        }

In [144]:
orchestrator46.deploy(0.002825, postselect_params_46)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Creating baseline model to train on final features
XGBRegressor(alpha=1.847137286, base_score=0.5, booster='gbtree',
       colsample_bylevel=1, colsample_bytree=0.87622179, eta=0.329094107,
       gamma=4, learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=5.439424743, missing=None, n_estimators=160,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.644658011)
XGBRegressor(alpha=1.847137286, base_score=0.5, booster='gbtree',
       colsample_bylevel=1, colsample_bytree=0.87622179, eta=0.329094107,
       gamma=4, learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=5.439424743, missing=None, n_estimators=160,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.644658011)
y-test mean 5.3818261683246185, y-test st

In [49]:
# running feature selection for 6-8 week model

In [50]:
#orchestrator68 = Orchestrator(full_features, full_data_6_8, 'week_6_to_8_cases' )

In [145]:
# orchestrator68 = Orchestrator(full_features, full_data_6_8, 'week_6_to_8_cases' )
# orchestrator68.tf = orchestrator682.tf
# orchestrator68.op = orchestrator682.op
# orchestrator68.corr_selected_features_bi_week = orchestrator682.corr_selected_features_bi_week
# orchestrator68.corr_selected_features_bi_week_y = orchestrator682.corr_selected_features_bi_week_y
# orchestrator68.rcfe = orchestrator682.rcfe

In [51]:
#orchestrator68.runTsFresh()

Running TSFresh....
Selected features (3717, 15087)
Finished running TSFresh....


In [52]:
#orchestrator68.runOptimizeFeatures()

Running Feature Selection module ....
4719 features with a correlation magnitude greater than 0.97.

Original (3717, 15088) and after (3717, 10369)
Finished running Feature Selection ....


In [68]:
# X_68_preselect = orchestrator68.op.corr_selected_features
# y_68_preselect = orchestrator68.full_data_bi_week[['date', orchestrator68.y_bi_week_label]]

In [69]:
# X_68_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_68_preselect.csv')
# y_68_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_68_preselect.csv')

In [30]:
orchestrator68 = Orchestrator(y6_8.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date'), 'week_6_to_8_cases' )

In [31]:
X_68_preselect = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_68_preselect.csv', index_col = 0)
X_68_preselect.date = pd.to_datetime(X_68_preselect.date, format = '%Y-%m-%d')

In [32]:
orchestrator68.performHyperparameterOptimization(X_68_preselect, 15)

Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 10369) (441, 10369) (441, 10369)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)
Rolling window to end date
Train [2017-07-01 2017-09-19] Val [2017-09-20 2017-10-10] Test [2017-10-11 2017-11-10]
Window Train/Val/Test shape (1701, 10369) (441, 10369) (651, 10369)
Window Train/Val/Test shape (1701, 2) (441, 2) (651, 2)
Rolling window to end date
Train [2017-07-01 2017-10-09] Val [2017-10-10 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2121, 10369) (441, 10369) (231, 10369)
Window Train/Val/Test shape (2121, 2) (441, 2) (231, 2)
[46.56327138889367, 36.34992473023281, 13.653276627690333]
mean mse: 32.188824248938936
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 10369) (441, 10369) (441, 10369)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)
Rolling window

Rolling window to end date
Train [2017-07-01 2017-10-09] Val [2017-10-10 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2121, 10369) (441, 10369) (231, 10369)
Window Train/Val/Test shape (2121, 2) (441, 2) (231, 2)
[70.72986385907501, 30.869673772583013, 24.713711219752593]
mean mse: 42.10441628380354
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 10369) (441, 10369) (441, 10369)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)
Rolling window to end date
Train [2017-07-01 2017-09-19] Val [2017-09-20 2017-10-10] Test [2017-10-11 2017-11-10]
Window Train/Val/Test shape (1701, 10369) (441, 10369) (651, 10369)
Window Train/Val/Test shape (1701, 2) (441, 2) (651, 2)
Rolling window to end date
Train [2017-07-01 2017-10-09] Val [2017-10-10 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2121, 10369) (441, 10369) (231, 10369)
Window Train/Val/Test shape (2121, 2) (441,

In [43]:
orchestrator68.bo.best

{'colsample_bytree': 0.65,
 'gamma': 0.5,
 'learning_rate': 0.1,
 'max_depth': 0,
 'min_child_weight': 11.0,
 'n_estimators': 2,
 'subsample': 0.9500000000000001}

In [48]:
preselect_params_68 = {"alpha":1.170269464,
                        "colsample_bytree":0.677068393,
                        "early_stopping_rounds":100,
                        "eta":0.041417645,
                        "gamma":4,
                        "max_depth":12,
                        "min_child_weight":1.090424326,
                        "num_round":497,
                        "rate_drop":0.3,
                        "subsample":0.883495678,
                        "tweedie_variance_power":1.4 }

In [49]:
orchestrator68.runRollingCustomFeatureExtractor(preselect_params_68, X_68_preselect)

Running custom feature selection module ....
Creating baseline model to extract features
all features (2688, 10368)
Thresholds are 10368 16 [0.00951987 0.00827815 0.00620861 0.0057947  0.00496689 0.00413907
 0.00372517 0.00331126 0.00289735 0.00248344 0.00206954 0.00165563
 0.00124172 0.00082781 0.00041391 0.        ]
CV - Window 3
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 10369) (441, 10369) (441, 10369)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)
X_train_cv  (1722, 10368)
3 Thresh=0.00952, n=1, mae: 8.738, mae: -0.0589
CV - Window 4
Rolling window to end date
Train [2017-07-01 2017-09-19] Val [2017-09-20 2017-10-10] Test [2017-10-11 2017-11-10]
Window Train/Val/Test shape (1701, 10369) (441, 10369) (651, 10369)
Window Train/Val/Test shape (1701, 2) (441, 2) (651, 2)
X_train_cv  (2142, 10368)
4 Thresh=0.00952, n=1, mae: 6.254, mae: -0.0137
CV - Window 5
Rolling window to end date
Train [2017-07

5 Thresh=0.00083, n=411, mae: 1.692, mae: 0.9037
{'threshold': 0.00082781457, 'num_features': 411, 'mae3': 4.127124130699362, 'mae4': 2.2995449462444704, 'mae5': 1.6920062896004426, 'ev3': 0.6082938895315184, 'ev4': 0.8110630134996691, 'ev5': 0.9037477929345253}


CV - Window 3
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-20] Test [2017-09-21 2017-10-11]
Window Train/Val/Test shape (1281, 10369) (441, 10369) (441, 10369)
Window Train/Val/Test shape (1281, 2) (441, 2) (441, 2)
X_train_cv  (1722, 10368)
3 Thresh=0.00041, n=1618, mae: 3.583, mae: 0.7643
CV - Window 4
Rolling window to end date
Train [2017-07-01 2017-09-19] Val [2017-09-20 2017-10-10] Test [2017-10-11 2017-11-10]
Window Train/Val/Test shape (1701, 10369) (441, 10369) (651, 10369)
Window Train/Val/Test shape (1701, 2) (441, 2) (651, 2)
X_train_cv  (2142, 10368)
4 Thresh=0.00041, n=1618, mae: 2.378, mae: 0.7847
CV - Window 5
Rolling window to end date
Train [2017-07-01 2017-10-09] Val [2017-10-10 2017-10-30] Test [2

In [50]:
orchestrator68.rcfe.summary[['mean_mse', 'mean_ev'] ]= pd.concat([orchestrator68.rcfe.summary[['mse3', 'mse4', 'mse5']].mean(axis=1), orchestrator68.rcfe.summary[['ev3', 'ev4', 'ev5']].mean(axis=1)], axis=1)

In [51]:
orchestrator68.rcfe.summary

Unnamed: 0,threshold,num_features,mae3,mae4,mae5,ev3,ev4,ev5,mean_mae,mean_ev
0,0.00952,1.0,8.738368,6.253639,6.049932,-0.058877,-0.013705,-0.02115,7.01398,-0.031244
1,0.008278,2.0,6.207214,5.775639,3.723473,0.456673,0.122991,0.51579,5.235442,0.365151
2,0.006209,3.0,4.06612,3.337662,3.575648,0.681538,0.559312,0.562568,3.65981,0.601139
3,0.005795,4.0,5.298627,3.501593,4.122421,0.504861,0.562068,0.519792,4.307547,0.528907
4,0.004967,5.0,4.892083,3.615827,2.827301,0.60365,0.544887,0.680071,3.778404,0.609536
5,0.004139,7.0,4.050899,3.437836,2.494554,0.665115,0.535847,0.801111,3.327763,0.667358
6,0.003725,9.0,3.539681,2.687224,1.987645,0.719657,0.748594,0.8716,2.738184,0.77995
7,0.003311,10.0,3.505623,2.697839,1.974624,0.717092,0.693033,0.869288,2.726028,0.759804
8,0.002897,16.0,2.785635,2.179557,2.033932,0.801186,0.804622,0.838561,2.333041,0.81479
9,0.002483,33.0,3.086556,2.162669,2.014989,0.791466,0.791566,0.82842,2.421405,0.803817


In [290]:
selected_features68 = ['date'] + list(orchestrator68.rcfe.mcr.feature_importance_df[orchestrator68.rcfe.mcr.feature_importance_df.threshold >= 0.002284].index)

In [291]:
orchestrator68.corr_selected_features_bi_week[selected_features68].to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_6_8.csv')

In [292]:
orchestrator68.corr_selected_features_bi_week_y.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_6_8.csv')

In [293]:
orchestrator68.deploy(0.002284)

Creating baseline model to train on final features
y-test mean 4.671585825506454, y-test std 5.5720761674820904
xgb mae 3.1162140401978666, xgb ev 0.5350739582444819
linreg mae 4.027808065496075, linreg ev 0.1196188209004242


In [104]:
orchestrator68.corr_selected_features_bi_week_y.sort_values(by='date')

Unnamed: 0,date,week_6_to_8_cases
2456,2017-06-19,7.143574
2441,2017-06-19,49.379311
2444,2017-06-19,17.993787
2443,2017-06-19,44.854502
2442,2017-06-19,3.873987
2446,2017-06-19,22.996410
2439,2017-06-19,28.036947
2438,2017-06-19,70.382174
2437,2017-06-19,10.251431
2445,2017-06-19,10.268290


In [None]:
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
from sagemaker.predictor import csv_serializer    # Converts strings for HTTP POST requests on inference

from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

prefix = 'sagemaker/xgboost-ts'

In [None]:
deployRegressor = True
if deployRegressor == True:
    X = corr_selected_features_4_6[selected_features]
    y = corr_selected_features_4_6_y 
else:
    X = corr_selected_features_4_6 
    y = corr_selected_features_4_6_y 
    
X_train, X_cv, X_test = train_cv_test_split(X)
y_train, y_cv, y_test = train_cv_test_split(y)

In [35]:
X_train.head()

In [None]:
y_train.head()

In [None]:
pd.concat([y_train['week_4_to_6_cases'], X_train], axis=1).to_csv('train.csv', index=False, header=False)
pd.concat([y_cv['week_4_to_6_cases'], X_cv], axis=1).to_csv('validation.csv', index=False, header=False)
pd.concat([y_test['week_4_to_6_cases'], X_test], axis=1).to_csv('test.csv', index=False, header=False)


In [None]:
# Upload for Sagemaker jobs to pickup
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')

In [None]:
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')
print('s3://{}/{}/train'.format(bucket, prefix))

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

sess = sagemaker.Session()

container = get_image_uri(region, 'xgboost')

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)

In [None]:
# Static parameters
# 'auc'
# eval_metric: Metric 'multi:softmax' is not supported. 
# Parameter 'eval_metric' should be one of these options:'rmse', 'mae', 'logloss', 'error', 'merror', 'mlogloss', 'auc', 'ndcg', 'map', 'ndcg-', 'map-', 'poisson-nloglik', 'gamma-nloglik', 'gamma-deviance', 'tweedie-nloglik'.
xgb.set_hyperparameters(eval_metric='mae', 
                        max_depth=3,
                        eta=0.1,
                        gamma=4,
                        min_child_weight=5,
                        subsample=0.9,
                        colsample_bytree=0.8,
                        silent=0,
                        objective='reg:linear',
                        num_round=500,
                        early_stopping_rounds=100,
                        rate_drop=0.3,
                        tweedie_variance_power=1.4)

#max_depth=[3,5,7]
#subsample=[0.7,0.8,0.9],
#colsample_bytree=[0.7,0.8,0.9],
hyperparameter_ranges = {
    'eta': ContinuousParameter(0.0, 1.0),
    'min_child_weight': ContinuousParameter(1.0, 10.0),
    'alpha': ContinuousParameter(0, 2),
    'max_depth': IntegerParameter(3, 12),
    'subsample': ContinuousParameter(0.3, 0.9),
    'colsample_bytree': ContinuousParameter(0.3, 0.9),
    'num_round': IntegerParameter(50, 1000)
}

In [None]:
#https://sagemaker.readthedocs.io/en/latest/tuner.html
tuner = HyperparameterTuner(xgb,
                            objective_metric_name = 'validation:mae',
                            objective_type = 'Minimize',
                            strategy='Bayesian',
                            hyperparameter_ranges = hyperparameter_ranges,
                            max_jobs=20,
                            max_parallel_jobs=5)

In [None]:
# Fit with train & validation data sets
tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [None]:
#Let's just run a quick check of the hyperparameter tuning jobs status to make sure it started successfully.
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

In [None]:
#Deploy the best trained model
xgb_predictor = tuner.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

In [None]:
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

In [None]:
def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

predictions = predict(X_test.as_matrix()[:, 1:])
len(predictions)

In [None]:
print('Test sample X {} y {} Predictions {}'.format(X_test.shape, y_test.shape, len(predictions)))

In [None]:
mean_squared_error(y_test['week_4_to_6_cases'], predictions)

In [None]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

In [190]:
bo.run()

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 38) (336, 38) (336, 38)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 38) (336, 38) (336, 38)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 38) (336, 38) (336, 38)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 38) (336, 38) (546, 38)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 38) (336, 38) (231