In [1]:
# import pandas as pd

# import boto3
# import re
# import os
# import sagemaker
# from sagemaker import get_execution_role

# bucket = sagemaker.Session().default_bucket()        
# region = boto3.Session().region_name    
# role = get_execution_role()
# bucket

In [2]:
import pandas as pd
import numpy as np
from tsfresh import select_features
import xgboost
import seaborn as sns
import matplotlib.pyplot as plt

import math, datetime 


from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
# from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from feature_selection import FeatureSelector

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

%matplotlib

Using matplotlib backend: Qt5Agg


In [4]:
# inputBucket='cmg-kudzu-text-analysis' 

# data_key = 'glue/test/exhaustive_extracted_features.pkl'
# exhaustive_extracted_features_s3file = '{}/{}'.format(inputBucket, data_key)

# data_key = 'glue/test/y_df_for_feature_selection.pkl'
# y_df_for_feature_selection_s3file = '{}/{}'.format(inputBucket, data_key)

# data_key = 'glue/test/full_features.csv'
# original_full_features_s3file = '{}/{}'.format(inputBucket, data_key)

# print('Location {} {}'.format(exhaustive_extracted_features_s3file,y_df_for_feature_selection_s3file))

In [5]:
# import s3fs
# import pandas as pd
# fs = s3fs.S3FileSystem(anon=False)

# # Pickle
# with fs.open(exhaustive_extracted_features_s3file) as f:
#     full_features = pd.read_pickle(f)

# with fs.open(y_df_for_feature_selection_s3file) as f:
#     y_df = pd.read_pickle(f)

# # with fs.open(original_full_features_s3file) as f:
# #     original_full_features = pd.read_csv(f)


In [3]:
# all extracted features from tsfresh
# the extracted features are the same for all models
full_features = pd.read_pickle('/Users/Rohil/Documents/iGEM/yemen/exhaustive_extracted_features_new.pkl')

# test values for different time frames (in a nice format)
y_df = pd.read_pickle('/Users/Rohil/Documents/iGEM/yemen/y_df_for_feature_selection_new.pkl')

In [4]:
full_features = pd.concat([full_features, pd.get_dummies(full_features.gov_iso)], axis=1)

In [5]:
# ignoring anything that happens before Jul 1, as there's not enough data for feature calculation
full_features = full_features[full_features.date>='2017-07-01'].reset_index(drop=True)
y_df = y_df[y_df.date>='2017-07-01'].reset_index(drop=True)

In [6]:
ESTIMATORS = 750
EARLY_STOPPING_ROUNDS = 50

In [7]:
def scale_features(scaler, X):

    continuous_cols = [col for col in X.columns if 'YE-' not in col]
    
    X_scaled = X
    
    X_scaled[continuous_cols] = scaler.transform(X[continuous_cols])
    
    return (X_scaled)

In [8]:
def train_cv_test_split(data, cv_split_date='2017-09-17', test_split_date='2017-11-06'):
    
    data_train, data_cv, data_test = data[(data.date<cv_split_date)], data[(data.date<test_split_date) & (data.date>=cv_split_date)], data[ (data.date>=test_split_date) ]
    
    return (data_train.drop('date', axis=1), data_cv.drop('date', axis=1), data_test.drop('date', axis=1))

In [9]:
HOLD_OUT_WINDOW = 100
ROLLING_WINDOW_SIZE = 45
WINDOW = round(ROLLING_WINDOW_SIZE/3)

def getMaxDate():
    return y_df.date.max() - datetime.timedelta(days=HOLD_OUT_WINDOW)

def getMinDate():
    return y_df.date.min()

delta = getMaxDate() - getMinDate()
print('Start {} Stop {} No of days {}'.format(getMinDate(), getMaxDate(), delta.days))

number_rolling_windows = math.ceil(round(delta.days/WINDOW))  - 1
number_rolling_windows


Start 2017-07-01 00:00:00 Stop 2017-11-10 00:00:00 No of days 132


8

In [10]:
def getHoldOutDate():
    start = getMaxDate() + datetime.timedelta(days=1)
    end   = y_df.date.max() 
    
    return (start, end)

In [11]:
def getRollingWindowDates(idx):
    maxDate = getMaxDate()  
    minDate = getMinDate() 
    
    trainStart = minDate
    trainStop  = minDate + datetime.timedelta(days=WINDOW*idx)
    
    validationStart  = trainStop + datetime.timedelta(days=1)
    validationStop   = validationStart + datetime.timedelta(days=WINDOW)
    
    testStart  = validationStop + datetime.timedelta(days=1)
    testStop   = testStart + datetime.timedelta(days=WINDOW)
    
    if (maxDate - testStop).days < WINDOW:
        print('Rolling window to end date')
        testStop = maxDate
        
    print('Train [{} {}] Val [{} {}] Test [{} {}]'.format(trainStart.date(), trainStop.date(), 
                                                       validationStart.date(), validationStop.date(), 
                                                       testStart.date(), testStop.date()))
    
    return (trainStart, trainStop, validationStart, validationStop, testStart, testStop)

In [12]:
def getRollingWindow(data, trainStart, trainStop, validationStart, validationStop, testStart, testStop):
    
    train = data[(data.date >= trainStart) & (data.date <= trainStop)]
    val   = data[(data.date >= validationStart) & (data.date <= validationStop)]
    test  = data[(data.date >= testStart) & (data.date <= testStop)] 
    
    print('Window Train/Val/Test shape {} {} {}'.format(train.shape, val.shape, test.shape))
    
    return (train.drop('date', axis=1), val.drop('date', axis=1), test.drop('date', axis=1))

In [13]:
def getHoldOutData(data):
    minDate = getMinDate() 
    start, end = getHoldOutDate()

    train = data[(data.date >= minDate) & (data.date < start)]
    test =  data[(data.date >= start) & (data.date <= end)]
    
    return (train.drop('date', axis=1), test.drop('date', axis=1))

In [14]:
number_rolling_windows, round((ROLLING_WINDOW_SIZE)/3), getHoldOutDate()

(8, 15, (Timestamp('2017-11-11 00:00:00'), Timestamp('2018-02-18 00:00:00')))

In [15]:
class TsFresh():
    def __init__(self):
        pass
    
    def postProcessor(self, X, y, dateSeries):
        self.selected_features = select_features(X, y, fdr_level=0.001)
        print('Selected features {}'.format(self.selected_features.shape))      
        self.selected_features = pd.concat([dateSeries, self.selected_features], axis=1)

In [16]:
class OptimizeFeatures():
    def __init__(self):
        pass
        
    def selectFeatures(self, X, y):
        self.X  = X
        self.y  = y
        self.fs = FeatureSelector(data = X, labels = y)        
    
    def identifyCollinearFeatures(self, correlation_threshold=0.975):
        self.fs.identify_collinear(correlation_threshold)
    
    def collinerFeaturesColumnsToKeep(self):
        return self.fs.ops['collinear']
    
    def removeCollinerFeatures(self):
        self.cols_to_keep = set(self.X.columns) - set(self.fs.ops['collinear'])
        self.corr_selected_features = self.X[list(self.cols_to_keep)]


In [17]:
class ModelCustomRegressor():
    
    def __init__(self):
        pass
    
    def extract(self, model_params, X_train, X_cv, X_test, y_train, y_cv, y_test):
        print('Creating baseline model to extract features')
        
        X_train_cv = X_train.append(X_cv)
        y_train_cv = y_train.append(y_cv)
        
        scaler = StandardScaler()
        continuous_cols = [col for col in X_train_cv.columns if 'YE-' not in col]
        scaler.fit(X_train_cv[continuous_cols])

        X_train_cv, X_test = scale_features(scaler, X_train_cv), scale_features(scaler, X_test)
        
        print('all features {}'.format(X_train_cv.shape))
            
        eval_set = [(X_test, y_test)]
    
        self.feature_importance_df = pd.DataFrame(index = X_train.columns)
        
        self.regressor = xgboost.XGBRegressor(**model_params)                                      
        
        self.regressor.fit(X_train_cv, y_train_cv, eval_metric='rmse', 
                                  eval_set=eval_set, 
                                  early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)

        preds = self.regressor.predict((X_test))
        
        self.feature_importance_df['threshold'] = self.regressor.feature_importances_
        
        self.thresholds = np.unique(self.regressor.feature_importances_)
        self.thresholds.sort();
        print (len(self.thresholds))
        self.thresholds = self.thresholds[::-1][:50]
        
        print('# features {} # thresholds {}; thresholds:{}'.format(len(self.regressor.feature_importances_), len(self.thresholds), self.thresholds))

In [18]:
class RollingWindowCrossValidation():
    
    def __init__(self, corr_selected_features_bi_week, corr_selected_features_bi_week_y, preselect_params):
        self.corr_selected_features_bi_week = corr_selected_features_bi_week
        self.corr_selected_features_bi_week_y   = corr_selected_features_bi_week_y
        self.preselect_params = preselect_params
        pass
    
    def extract(self, regressor, thresh):
        
        mse_list = []
        
        for idx in range(3, number_rolling_windows):
            print('CV - Window {}'.format(idx))
            trainStart, trainStop, validationStart, validationStop, testStart, testStop = getRollingWindowDates(idx)
            
            X_train, X_cv, X_test = getRollingWindow(self.corr_selected_features_bi_week, 
                                                     trainStart, trainStop, 
                                                     validationStart, validationStop, testStart, testStop)
            y_train, y_cv, y_test = getRollingWindow(self.corr_selected_features_bi_week_y, 
                                                     trainStart, trainStop, 
                                                     validationStart, validationStop, testStart, testStop)
            
            X_train_cv = X_train.append(X_cv)
            y_train_cv = y_train.append(y_cv)
            
            scaler = StandardScaler()
            continuous_cols = [col for col in X_train_cv.columns if 'YE-' not in col]
            scaler.fit(X_train_cv[continuous_cols])

            X_train_cv, X_test = scale_features(scaler, X_train_cv), scale_features(scaler, X_test)
        
            print('X_train_cv  {}'.format(X_train_cv.shape))
            
            # select features using threshold
            selection = SelectFromModel(regressor, threshold=thresh, prefit=True)
            select_X_train_cv = selection.transform(X_train_cv)
            
            select_X_test = selection.transform(X_test)
            
            eval_set = [(select_X_test, y_test)]
            
            # train model
            selection_model = xgboost.XGBRegressor(**self.preselect_params)
            
            selection_model.fit(select_X_train_cv, y_train_cv, eval_metric='rmse', 
                                      eval_set=eval_set, 
                                      early_stopping_rounds=50,
                                      verbose=False)
            # eval model
            y_pred = selection_model.predict(select_X_test)
           
            mse = mean_squared_error(y_test, y_pred)

            print("%d Thresh=%.5f, n=%d, mse: %.3f" % (idx, thresh, select_X_train_cv.shape[1], mse))
            
            mse_list.append(mse)
        
        return_dict = {'threshold':thresh, 'num_features':select_X_train_cv.shape[1], 'mse_list':mse_list, 'mean_mse':np.mean(mse_list)}
        print (return_dict)
        return (return_dict)
        

In [19]:
class RollingCustomFeatureExtractor():
    
    def __init__(self, corr_selected_features_bi_week, corr_selected_features_bi_week_y, preselect_params):
        self.corr_selected_features_bi_week  = corr_selected_features_bi_week
        self.corr_selected_features_bi_week_y   = corr_selected_features_bi_week_y
        self.preselect_params = preselect_params
        pass
    
    def extract(self):
        X = self.corr_selected_features_bi_week 
        y = self.corr_selected_features_bi_week_y 
        
        X_train, X_cv, X_test = train_cv_test_split(X)
        y_train, y_cv, y_test = train_cv_test_split(y)
        
        self.mcr = ModelCustomRegressor()
        self.mcr.extract(self.preselect_params, X_train, X_cv, X_test, y_train, y_cv, y_test) 
            
        thresholds = self.mcr.thresholds        
    
        self.rwcv = RollingWindowCrossValidation(self.corr_selected_features_bi_week, 
                                                 self.corr_selected_features_bi_week_y,
                                                 self.preselect_params)
        
        self.summary = pd.DataFrame(columns = ['threshold', 'num_features', 'mse_list', 'mean_mse'])
        
        for thresh in thresholds:
            return_dict = self.rwcv.extract(self.mcr.regressor, thresh)
            
            self.summary = self.summary.append(return_dict, ignore_index = True)
            
            print("\n")
            
        print(self.summary.head(20))


In [20]:
def set_style(color):
    plt.style.use(['seaborn-' + color, 'seaborn-paper'])

In [68]:
class BayesianOptimizer():

    def __init__(self, corr_selected_features_bi_week, corr_selected_features_bi_week_y, max_evals):
        self.corr_selected_features_bi_week  = corr_selected_features_bi_week
        self.corr_selected_features_bi_week_y   = corr_selected_features_bi_week_y
        self.max_evals = max_evals
    
    def objective(self, space):
    
        mse_list = []

        for idx in range(3, number_rolling_windows):

            trainStart, trainStop, validationStart, validationStop, testStart, testStop = getRollingWindowDates(idx)

            X_train, X_cv, X_test = getRollingWindow(self.corr_selected_features_bi_week, 
                                                     trainStart, trainStop, 
                                                     validationStart, validationStop, testStart, testStop)
            y_train, y_cv, y_test = getRollingWindow(self.corr_selected_features_bi_week_y, 
                                                     trainStart, trainStop, 
                                                     validationStart, validationStop, testStart, testStop)

            X_train_cv = X_train.append(X_cv)
            y_train_cv = y_train.append(y_cv)

            scaler = StandardScaler()
            continuous_cols = [col for col in X_train_cv.columns if 'YE-' not in col]
            scaler.fit(X_train_cv[continuous_cols])

            X_train_cv, X_test = scale_features(scaler, X_train_cv), scale_features(scaler, X_test)

            xgb = xgboost.XGBRegressor(n_estimators = int(space['n_estimators']),
                           max_depth = int(space['max_depth']),
                           min_child_weight = space['min_child_weight'],
                           subsample = space['subsample'],
                           learning_rate = space['learning_rate'],
                           gamma = space['gamma'],
                           colsample_bytree = space['colsample_bytree'],
                           objective='reg:linear', n_jobs = -1
                           )    

            xgb.fit(X_train_cv ,y_train_cv, eval_metric = 'rmse')

            # eval model
            y_pred = xgb.predict(X_test)

            mse = mean_squared_error(y_test, y_pred)    

            mse_list.append(mse)
            
        print(mse_list)    

        return_dict = {'mse_list':mse_list}

        return_dict['loss'] = np.mean(mse_list)
        return_dict['status'] = STATUS_OK

        print ("mean mse:", return_dict['loss'])

        return (return_dict)

    def run(self):
        
        self.space ={'max_depth': hp.quniform('max_depth', 4, 12, 1),
                'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
                'subsample':  hp.quniform('subsample', 0.5, 1, 0.05),
                'n_estimators' : hp.quniform('n_estimators', 50, 750, 50),
                'learning_rate' : hp.quniform('learning_rate', 0.01, 0.3, 0.025),
                'gamma' : hp.quniform('gamma', 0, 5, 0.5),
                'colsample_bytree' : hp.quniform('colsample_bytree', 0.3, 1, 0.05)}
        
        self.trials = Trials()
        self.best = fmin(fn=self.objective,
                    space=self.space,
                    algo=tpe.suggest,
                    max_evals=self.max_evals, 
                    trials=self.trials)



In [175]:
def plot_pred_against_actual(cv_pred_crosstab, test_pred_crosstab, true_crosstab, y_label, cv_mse, holdout_mse, sharey):
    
    set_style('white')
    
    fig, ax = plt.subplots(21,1,figsize = (6,15), sharex=True, sharey = sharey)
    
    cols = true_crosstab.columns
        
    for i in range(0,21):
        
        true_crosstab[cols[i]].plot(kind='line', ax = ax[i], label = 'true_val', legend = True, color = 'red')

        ax[i].set_prop_cycle('color', ['seagreen', 'blue'])
                            
        #train_pred_crosstab[cols[i]].plot(kind='line', ax = ax[i], label= 'xgboost train-pred',linestyle= '-.', legend = True)
        
        cv_pred_crosstab[cols[i]].plot(kind='line', ax = ax[i], label= 'xgboost cv-pred;\nmean error: %.3f'%(np.sqrt(cv_mse)) ,linestyle= '-.', legend = True)

        test_pred_crosstab[cols[i]].plot(kind='line', ax = ax[i], label= 'xgboost holdout-pred;\nmean error: %.3f'%(np.sqrt(holdout_mse)), linestyle= '-.', legend = True)   
          
        ax[i].legend().set_visible(False)
        ax[i].set_ylabel(cols[i], rotation=0)
        ax[i].yaxis.set_label_position('right')
        ax[i].spines['right'].set_visible(False)
        ax[i].spines['top'].set_visible(False)
        ax[i].spines['bottom'].set_visible(True)
        
    ax[10].legend().set_visible(True)
    ax[10].legend(fontsize=10, loc='center left', bbox_to_anchor=(1.05, 0.5))
 
    fig.subplots_adjust(hspace = .2)    
        
    if sharey:
    
        fig.savefig('/Users/Rohil/Documents/iGEM/yemen/final_results/' + y_label + '_deployed_sharey.png', dpi = 500, bbox_inches = 'tight')
    
    else:
        
        fig.savefig('/Users/Rohil/Documents/iGEM/yemen/final_results/' + y_label + '_deployed.png', dpi = 500, bbox_inches = 'tight')

    plt.close('all')

In [176]:
def plot_pred_against_actual_filtered(cv_pred_crosstab, test_pred_crosstab, true_crosstab, y_label, cv_mse, holdout_mse, sharey):
    
    set_style('white')
    
    fig, ax = plt.subplots(15,1,figsize = (6,15), sharex=True, sharey = sharey)
    
    cols = list(set(true_crosstab.columns) - set(['YE-HD-AL', 'YE-MR', 'YE-SH', 'YE-TA', 'YE-MA', 'YE-SD']))

        
    for i in range(0,15):
        
        true_crosstab[cols[i]].plot(kind='line', ax = ax[i], label = 'true_val', legend = True, color = 'red')

        ax[i].set_prop_cycle('color', ['seagreen', 'blue'])
                            
        #train_pred_crosstab[cols[i]].plot(kind='line', ax = ax[i], label= 'xgboost train-pred',linestyle= '-.', legend = True)
        
        cv_pred_crosstab[cols[i]].plot(kind='line', ax = ax[i], label= 'xgboost cv-pred;\nmean error: %.3f'%(np.sqrt(cv_mse)) ,linestyle= '-.', legend = True)

        test_pred_crosstab[cols[i]].plot(kind='line', ax = ax[i], label= 'xgboost holdout-pred;\nmean error: %.3f'%(np.sqrt(holdout_mse)), linestyle= '-.', legend = True)   
          
        ax[i].legend().set_visible(False)
        ax[i].set_ylabel(cols[i], rotation=0)
        ax[i].yaxis.set_label_position('right')
        ax[i].spines['right'].set_visible(False)
        ax[i].spines['top'].set_visible(False)
        ax[i].spines['bottom'].set_visible(True)
        
    ax[7].legend().set_visible(True)
    ax[7].legend(fontsize=10, loc='center left', bbox_to_anchor=(1.05, 0.5))
 
    fig.subplots_adjust(hspace = .2)    
        
    if sharey:
    
        fig.savefig('/Users/Rohil/Documents/iGEM/yemen/final_results/' + y_label + '_deployed_sharey_filtered.png', dpi = 500, bbox_inches = 'tight')
    
    else:
        
        fig.savefig('/Users/Rohil/Documents/iGEM/yemen/final_results/' + y_label + '_deployed_filtered.png', dpi = 500, bbox_inches = 'tight')

    plt.close('all')

In [172]:
class DeployRegressor():
    
    def __init__(self):
        pass
    
    def execute(self, model_params, X, y, y_to_plot, y_bi_week_label):

        X_train_all, X_hold_test = getHoldOutData(X)
        y_train_all, y_hold_test = getHoldOutData(y)
        
        mse_list = []
        y_cv_preds = []
        y_cv_all = pd.DataFrame(columns = [y_bi_week_label])

        for idx in range(3, number_rolling_windows):

            trainStart, trainStop, validationStart, validationStop, testStart, testStop = getRollingWindowDates(idx)

            X_train, X_cv, X_test = getRollingWindow(X, 
                                                     trainStart, trainStop, 
                                                     validationStart, validationStop, testStart, testStop)
            y_train, y_cv, y_test = getRollingWindow(y, 
                                                     trainStart, trainStop, 
                                                     validationStart, validationStop, testStart, testStop)

            X_train_cv = X_train.append(X_cv)
            y_train_cv = y_train.append(y_cv)
            
#             if idx == 3:
#                 X_base_train = X_train_cv
#                 y_base_train = y_train_cv
            
            scaler = StandardScaler()
            continuous_cols = [col for col in X_train_cv.columns if 'YE-' not in col]
            scaler.fit(X_train_cv[continuous_cols])

            X_train_cv, X_test = scale_features(scaler, X_train_cv), scale_features(scaler, X_test)

            xgb = xgboost.XGBRegressor(**model_params)    

            xgb.fit(X_train_cv ,y_train_cv, eval_metric = 'rmse')

            # eval model
            y_pred = xgb.predict(X_test)
            
            y_cv_preds.extend(list(y_pred))
            y_cv_all = y_cv_all.append(y_test)

        
        end_xgb = xgboost.XGBRegressor(**model_params)    

        end_xgb.fit(X_train_all, y_train_all, eval_metric = 'rmse')
        
        y_holdout_preds = end_xgb.predict(X_hold_test)
        
#         y_base_train_preds = end_xgb.predict(X_base_train)
#         base_train_mse = mean_squared_error(y_base_train.values, y_base_train_preds)
        
        cv_mse = mean_squared_error(y_cv_all.values, y_cv_preds)
        holdout_mse = mean_squared_error(y_hold_test.values, y_holdout_preds)
                        
        print('y-test mean {}, y-test std {}'.format(np.mean(y_test.values), np.std(y_test.values)))
        print('cv mse {}, holdout mse{}'.format(cv_mse, holdout_mse))
        
#         y_base_train_pred_df = pd.DataFrame(y_base_train_preds, columns=[y_bi_week_label], index = y_base_train.index)
#         y_base_train_pred_df = y_base_train_pred_df.merge(y_to_plot[['gov_iso', 'date']],  how = 'left',left_index = True, right_index = True)
#         y_base_train_pred_crosstab = y_base_train_pred_df.pivot_table(index = 'date', columns = 'gov_iso', values = y_bi_week_label, aggfunc='sum')

        
        y_cv_pred_df = pd.DataFrame(y_cv_preds, columns=[y_bi_week_label], index = y_cv_all.index)
        y_cv_pred_df = y_cv_pred_df.merge(y_to_plot[['gov_iso', 'date']],  how = 'left',left_index = True, right_index = True)
        y_cv_pred_crosstab = y_cv_pred_df.pivot_table(index = 'date', columns = 'gov_iso', values = y_bi_week_label, aggfunc='mean')
        
        y_holdout_pred_df = pd.DataFrame(y_holdout_preds, columns=[y_bi_week_label], index = y_hold_test.index)
        y_holdout_pred_df = y_holdout_pred_df.merge(y_to_plot[['gov_iso', 'date']], how = 'left', left_index = True, right_index = True)
        y_holdout_pred_crosstab = y_holdout_pred_df.pivot_table(index = 'date', columns = 'gov_iso', values = y_bi_week_label, aggfunc='mean')
                
        true_val_pivot = y_to_plot.pivot_table(index = 'date', columns = 'gov_iso', values = y_bi_week_label, aggfunc='mean')
        
        #print (y_base_train_pred_df.gov_iso.nunique(), y_cv_pred_df, y_holdout_pred_df)

        
        plot_pred_against_actual(y_cv_pred_crosstab, y_holdout_pred_crosstab, true_val_pivot, y_bi_week_label, cv_mse, holdout_mse, sharey = True)
        plot_pred_against_actual(y_cv_pred_crosstab, y_holdout_pred_crosstab, true_val_pivot, y_bi_week_label, cv_mse, holdout_mse, sharey = False)
        
        plot_pred_against_actual_filtered(y_cv_pred_crosstab, y_holdout_pred_crosstab, true_val_pivot, y_bi_week_label, cv_mse, holdout_mse, sharey = False)
        plot_pred_against_actual_filtered(y_cv_pred_crosstab, y_holdout_pred_crosstab, true_val_pivot, y_bi_week_label, cv_mse, holdout_mse, sharey = False)
        
        y_cv_pred_crosstab.append(y_holdout_pred_crosstab).to_csv('/Users/Rohil/Documents/iGEM/yemen/' + y_bi_week_label + '_deployed_cv_holdout_preds.csv')
        
        fig1, ax1 = plt.subplots(figsize = (5,8))
        xgboost.plot_importance(end_xgb, ax=ax1)
        fig1.savefig('/Users/Rohil/Documents/iGEM/yemen/' + y_bi_week_label + '_deployed_feature_importances.png', dpi = 300, bbox_inches = 'tight')
        plt.close('all')


    


In [24]:
class Orchestrator():
    
    def __init__(self, full_data_bi_week, y_bi_week_label):
        self.full_data_bi_week = full_data_bi_week
        self.y_bi_week_label   = y_bi_week_label
        pass
    
    def runTsFresh(self):
        print('Running TSFresh....')
        
        X_ts = self.full_data_bi_week.drop(columns=['date', 'gov_iso', y_bi_week_label])
        y_ts = self.full_data_bi_week[self.y_bi_week_label]
        dateSeries_ts = self.full_data_bi_week.date
        
        self.tf = TsFresh()
        self.tf.postProcessor(X_ts, y_ts, dateSeries_ts)
        
        print('Finished running TSFresh....')
        
    def runOptimizeFeatures(self):
        print('Running Feature Selection module ....')
        
        y = self.full_data_bi_week[self.y_bi_week_label]
            
        self.op = OptimizeFeatures()
        self.op.selectFeatures(self.tf.selected_features, y)
        self.op.identifyCollinearFeatures(0.975)
        
        self.op.removeCollinerFeatures()
        print('Original {} and after {}'.format(self.op.X.shape, self.op.corr_selected_features.shape))
        
        print('Finished running Feature Selection ....')
        
    def performHyperparameterOptimization(self, X, max_evals):
        self.bo = BayesianOptimizer(X, self.full_data_bi_week[['date', self.y_bi_week_label]], max_evals)
        self.bo.run()
    
    def runRollingCustomFeatureExtractor(self, preselect_params, corr_selected_features):
        print('Running custom feature selection module ....')
        
        # user can specify their corr_selected_features if they please (if the job has been batched)
        if corr_selected_features is None:
            self.corr_selected_features_bi_week = self.op.corr_selected_features
        else:
            self.corr_selected_features_bi_week = corr_selected_features
        
        self.corr_selected_features_bi_week_y = self.full_data_bi_week[['date', self.y_bi_week_label]]

        #corr_selected_features_4_6 = pd.concat([full_data_4_6.date, corr_selected_features_4_6], axis=1)
        #corr_selected_features_4_6.date.head()

        """
        X = corr_selected_features_4_6 
        y = corr_selected_features_4_6_y 
        
        X_train, X_cv, X_test = train_cv_test_split(X)
        y_train, y_cv, y_test = train_cv_test_split(y)
        """

        self.rcfe = RollingCustomFeatureExtractor(self.corr_selected_features_bi_week, 
                                                  self.corr_selected_features_bi_week_y, preselect_params)
        self.rcfe.extract()

    def deploy(self, threshold, model_params):
        
        #selected_features_from_threshold = list(self.rcfe.mcr.feature_importance_df[self.rcfe.mcr.feature_importance_df.threshold >= 0.003145].index)
        selected_features_from_threshold = list(self.rcfe.mcr.feature_importance_df[self.rcfe.mcr.feature_importance_df.threshold >= threshold].index)
        selected_features_from_threshold.append('date')

        self.X_final = self.corr_selected_features_bi_week[selected_features_from_threshold]
        y = self.corr_selected_features_bi_week_y 
        
        self.X_final['days_from'] = self.full_data_bi_week.days_from

        y_to_plot = self.corr_selected_features_bi_week_y.merge(full_features, on = 'date', left_index=True, right_index=True, how = 'left')[[self.y_bi_week_label, 'gov_iso', 'date']]
                
        self.dr = DeployRegressor()
        self.dr.execute(model_params, self.X_final, y, y_to_plot, self.y_bi_week_label)
        

In [25]:
print (y_df.date.min())
print (y_df.date.max())

2017-07-01 00:00:00
2018-02-18 00:00:00


In [26]:
y1_2 = y_df[['date', 'gov_iso', 'week_1_to_2_cases']]
y2_4 = y_df[['date', 'gov_iso', 'week_2_to_4_cases']]
y4_6 = y_df[['date', 'gov_iso', 'week_4_to_6_cases']]
y6_8 = y_df[['date', 'gov_iso', 'week_6_to_8_cases']]

In [27]:
# instead of creating copies here, will run these lines directly in the object instantiation
# full_data_1_2 = y1_2.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date')
# full_data_2_4 = y2_4.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date')
# full_data_4_6 = y4_6.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date')
# full_data_6_8 = y6_8.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date')

In [28]:
# running feature selection for 1-2 week model

In [38]:
# orchestrator12 = Orchestrator(full_features, full_data_1_2, 'week_1_to_2_cases' )

In [39]:
# orchestrator12.runTsFresh()

Running TSFresh....
Selected features (4599, 15252)
Finished running TSFresh....


In [40]:
# orchestrator12.runOptimizeFeatures()

Running Feature Selection module ....
4825 features with a correlation magnitude greater than 0.97.

Original (4599, 15253) and after (4599, 10428)
Finished running Feature Selection ....


In [62]:
# X_12_preselect = orchestrator12.op.corr_selected_features
# y_12_preselect = orchestrator12.full_data_bi_week[['date', orchestrator12.y_bi_week_label]]

In [63]:
# X_12_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_12_preselect.csv')
# y_12_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_12_preselect.csv')

In [29]:
orchestrator12 = Orchestrator(y1_2.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date'), 'week_1_to_2_cases' )

In [32]:
X_12_preselect = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_12_preselect.csv', index_col = 0)
X_12_preselect.date = pd.to_datetime(X_12_preselect.date, format = '%Y-%m-%d')

In [170]:
orchestrator12.performHyperparameterOptimization(X_12_preselect, 25)

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10428) (336, 10428) (336, 10428)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)


KeyboardInterrupt: 

In [30]:
preselect_params_12 = {'colsample_bytree': 0.60,
                        'gamma': 2.0,
                        'learning_rate': 0.05,
                        'max_depth': 11,
                        'min_child_weight': 10.0,
                        'n_estimators': 50,
                        'subsample': 0.8}


In [33]:
orchestrator12.runRollingCustomFeatureExtractor(preselect_params_12, X_12_preselect)

Running custom feature selection module ....
Creating baseline model to extract features
all features (2688, 10427)
19
# features 10427 # thresholds 19; thresholds:[0.01544021 0.01051248 0.00722733 0.00689882 0.00591327 0.00492773
 0.0042707  0.00361367 0.00328515 0.00295664 0.00262812 0.00229961
 0.00197109 0.00164258 0.00131406 0.00098555 0.00065703 0.00032852
 0.        ]
CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10428) (336, 10428) (336, 10428)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
X_train_cv  (1302, 10427)
3 Thresh=0.01544, n=2, mse: 149.084
CV - Window 4
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10428) (336, 10428) (336, 10428)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
X_train_cv  (1617, 10427)
4 Thresh=0.01544, n=2, mse: 124.931
CV - Window 5
Train [2017-07-01 2017-09-14] Val [2

X_train_cv  (2247, 10427)
6 Thresh=0.00591, n=6, mse: 14.961
CV - Window 7
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10428) (336, 10428) (231, 10428)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
X_train_cv  (2562, 10427)
7 Thresh=0.00591, n=6, mse: 9.915
{'threshold': 0.005913272, 'num_features': 6, 'mse_list': [143.68633116899045, 124.36296975381886, 43.580654344093254, 14.961303468884507, 9.914548359752263], 'mean_mse': 67.30116141910787}


CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10428) (336, 10428) (336, 10428)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
X_train_cv  (1302, 10427)
3 Thresh=0.00493, n=7, mse: 146.995
CV - Window 4
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10428) (336, 1

Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 10428) (336, 10428) (546, 10428)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
X_train_cv  (2247, 10427)
6 Thresh=0.00296, n=19, mse: 26.690
CV - Window 7
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10428) (336, 10428) (231, 10428)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
X_train_cv  (2562, 10427)
7 Thresh=0.00296, n=19, mse: 32.084
{'threshold': 0.002956636, 'num_features': 19, 'mse_list': [134.0550972030669, 84.78690099572086, 32.25120946837106, 26.68973445366726, 32.08396458228555], 'mean_mse': 61.973381340622325}


CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10428) (336, 10428) (336, 10428)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)


Window Train/Val/Test shape (1596, 10428) (336, 10428) (336, 10428)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
X_train_cv  (1932, 10427)
5 Thresh=0.00131, n=97, mse: 34.498
CV - Window 6
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 10428) (336, 10428) (546, 10428)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
X_train_cv  (2247, 10427)
6 Thresh=0.00131, n=97, mse: 20.733
CV - Window 7
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10428) (336, 10428) (231, 10428)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
X_train_cv  (2562, 10427)
7 Thresh=0.00131, n=97, mse: 13.617
{'threshold': 0.0013140604, 'num_features': 97, 'mse_list': [134.37292251067038, 78.93178654972893, 34.49822091036245, 20.732943041742697, 13.617278377993225], 'mean_mse': 56.43063027809

18  61.021351  


In [34]:
orchestrator12.rcfe.summary

Unnamed: 0,threshold,num_features,mse_list,mean_mse
0,0.01544,2,"[149.083527043454, 124.93147620655877, 49.9980...",71.315811
1,0.010512,3,"[149.52148992973295, 125.67727159292792, 45.85...",70.52371
2,0.007227,4,"[148.98804438037072, 123.6824049157656, 41.576...",68.614627
3,0.006899,5,"[144.75932550919464, 124.57481877299821, 41.95...",67.485668
4,0.005913,6,"[143.68633116899045, 124.36296975381886, 43.58...",67.301161
5,0.004928,7,"[146.99511150122953, 126.94747737237012, 43.87...",68.616957
6,0.004271,9,"[151.31624305439107, 133.0165576389683, 42.335...",75.743225
7,0.003614,11,"[142.57007415876288, 134.79152742615184, 41.90...",73.616776
8,0.003285,13,"[147.86202167204647, 109.87446865163263, 32.58...",69.042226
9,0.002957,19,"[134.0550972030669, 84.78690099572086, 32.2512...",61.973381


In [35]:
selected_features12 = ['date'] + list(orchestrator12.rcfe.mcr.feature_importance_df[orchestrator12.rcfe.mcr.feature_importance_df.threshold >= 0.001643].index)

In [143]:
orchestrator12.performHyperparameterOptimization(X_12_preselect[selected_features12], 100)

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 39) (336, 39) (546, 39)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 39) (336, 39) (231

Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 39) (336, 39) (231, 39)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[120.2318300453442, 75.25740480503505, 29.631467060923892, 14.999769787628592, 14.666435397476373]
mean mse: 50.95738141928162
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-

Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 39) (336, 39) (546, 39)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 39) (336, 39) (231, 39)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[116.89116065841442, 98.08878449465499, 34.082713252226355, 17.786512028511186, 7.794412211557411]
mean mse: 54.92871652907288
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 39) (336, 39) (336, 39)
Window Train/

Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 39) (336, 39) (546, 39)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 39) (336, 39) (231, 39)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[1221.4747544990453, 946.0202819161093, 504.6565290411965, 324.13823796560195, 291.370343515604]
mean mse: 657.5320293875113
Train [2017-0

[1221.4747544990453, 946.0202819161093, 504.6565290411965, 324.13823796560195, 291.370343515604]
mean mse: 657.5320293875113
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 39) (336, 39) (546, 39)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-

Window Train/Val/Test shape (1911, 39) (336, 39) (546, 39)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 39) (336, 39) (231, 39)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[1221.4747544990453, 946.0202819161093, 504.6565290411965, 324.13823796560195, 291.370343515604]
mean mse: 657.5320293875113
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 39) (33

Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 39) (336, 39) (546, 39)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 39) (336, 39) (231, 39)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[1221.4747544990453, 946.0202819161093, 504.6565290411965, 324.13823796560195, 291.370343515604]
mean mse: 657.5320293875113
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 39) (336, 39) (336, 39)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 39) (336, 39) (546, 39)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 39) (336, 39) (231

In [144]:
orchestrator12.bo.best

{'colsample_bytree': 0.75,
 'gamma': 1.0,
 'learning_rate': 0.05,
 'max_depth': 6.0,
 'min_child_weight': 1.0,
 'n_estimators': 450.0,
 'subsample': 0.9}

In [36]:
postselect_params_12 = {'colsample_bytree': 0.75,
 'gamma': 1.0,
 'learning_rate': 0.05,
 'max_depth': 6,
 'min_child_weight': 1.0,
 'n_estimators': 450,
 'subsample': 0.9}

In [66]:
orchestrator12.corr_selected_features_bi_week_y.merge(full_features, left_index=True, right_index=True, how = 'left')[[orchestrator12.y_bi_week_label, 'gov_iso', 'date']]

KeyError: "['date'] not in index"

In [68]:
y12[y12.gov_iso.isnull()].date.unique()

array(['2017-07-11T00:00:00.000000000', '2017-07-22T00:00:00.000000000',
       '2017-08-02T00:00:00.000000000', '2017-08-05T00:00:00.000000000',
       '2017-08-06T00:00:00.000000000', '2017-08-07T00:00:00.000000000',
       '2017-08-08T00:00:00.000000000', '2017-08-09T00:00:00.000000000',
       '2017-08-10T00:00:00.000000000', '2017-08-11T00:00:00.000000000',
       '2017-08-12T00:00:00.000000000', '2017-08-13T00:00:00.000000000',
       '2017-08-14T00:00:00.000000000', '2017-08-15T00:00:00.000000000',
       '2017-08-16T00:00:00.000000000', '2017-08-18T00:00:00.000000000',
       '2017-08-29T00:00:00.000000000', '2017-08-31T00:00:00.000000000',
       '2017-09-01T00:00:00.000000000', '2017-09-02T00:00:00.000000000',
       '2017-09-13T00:00:00.000000000', '2017-09-24T00:00:00.000000000',
       '2017-10-05T00:00:00.000000000', '2017-10-16T00:00:00.000000000',
       '2017-10-27T00:00:00.000000000', '2017-11-07T00:00:00.000000000',
       '2017-11-18T00:00:00.000000000', '2017-11-29

In [177]:
orchestrator12.deploy(0.001643, postselect_params_12)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 40) (336, 40) (336, 40)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 40) (336, 40) (336, 40)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 40) (336, 40) (336, 40)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 40) (336, 40) (546, 40)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 40) (336, 40) (231

In [41]:
# running feature selection for 2-4 week model

In [43]:
#orchestrator24.runTsFresh()

Running TSFresh....
Selected features (4305, 15146)
Finished running TSFresh....


In [44]:
#orchestrator24.runOptimizeFeatures()

Running Feature Selection module ....
4810 features with a correlation magnitude greater than 0.97.

Original (4305, 15147) and after (4305, 10337)
Finished running Feature Selection ....


In [64]:
# X_24_preselect = orchestrator24.op.corr_selected_features
# y_24_preselect = orchestrator24.full_data_bi_week[['date', orchestrator24.y_bi_week_label]]

In [65]:
# X_24_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_24_preselect.csv')
# y_24_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_24_preselect.csv')

In [49]:
orchestrator24 = Orchestrator(y2_4.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date'), 'week_2_to_4_cases' )

In [50]:
X_24_preselect = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_24_preselect.csv', index_col = 0)
X_24_preselect.date = pd.to_datetime(X_24_preselect.date, format = '%Y-%m-%d')

In [83]:
orchestrator24.performHyperparameterOptimization(X_24_preselect, 30)

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 10337) (336, 10337) (546, 10337)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/

Window Train/Val/Test shape (1911, 10337) (336, 10337) (546, 10337)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10337) (336, 10337) (231, 10337)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[120.1662462639382, 118.17704091344673, 36.10507768097646, 23.519708815021364, 16.024347785060304]
mean mse: 62.79848429168861
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Wind

Window Train/Val/Test shape (1281, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 10337) (336, 10337) (546, 10337)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10337) (336, 10337) (231, 10337)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[109.22358942027475, 93.45216619537848, 34.82277186778052, 20.092146486803387, 13.75400768485361]
mean mse: 54.26893633101815
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test 

KeyboardInterrupt: 

In [108]:
(orchestrator24.bo.trials.trials)

{'state': 2,
 'tid': 0,
 'spec': None,
 'result': {'mse_list': [111.61804677229122,
   108.63686685614721,
   53.073593370926204,
   28.869864997078118,
   18.623377713928964],
  'loss': 64.16434994207435,
  'status': 'ok'},
 'misc': {'tid': 0,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'colsample_bytree': [0],
   'gamma': [0],
   'learning_rate': [0],
   'max_depth': [0],
   'min_child_weight': [0],
   'n_estimators': [0],
   'subsample': [0]},
  'vals': {'colsample_bytree': [0.9],
   'gamma': [3.5],
   'learning_rate': [0.15000000000000002],
   'max_depth': [7.0],
   'min_child_weight': [4.0],
   'n_estimators': [150.0],
   'subsample': [1.0]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2018, 10, 8, 3, 0, 11, 113000),
 'refresh_time': datetime.datetime(2018, 10, 8, 3, 23, 18, 258000)}

In [95]:
bo24_results = pd.DataFrame(orchestrator24.bo.trials.trials)

In [123]:
bo24_results.result.values

array([{'mse_list': [111.61804677229122, 108.63686685614721, 53.073593370926204, 28.869864997078118, 18.623377713928964], 'loss': 64.16434994207435, 'status': 'ok'},
       {'mse_list': [125.85589131565195, 103.88363074152076, 61.34991068150878, 23.852292118844478, 26.611452893365485], 'loss': 68.31063555017829, 'status': 'ok'},
       {'mse_list': [147.0163785979502, 91.15533527256345, 50.2186958380491, 29.128293281205863, 19.460440185701177], 'loss': 67.39582863509396, 'status': 'ok'},
       {'mse_list': [126.54086698285643, 99.23713390377426, 44.11120983600989, 29.088407374018836, 14.659380566847034], 'loss': 62.72739973270129, 'status': 'ok'},
       {'mse_list': [124.69332437490482, 96.17936221280804, 41.040066790648076, 23.289188752986323, 11.765182663206648], 'loss': 59.393424958910785, 'status': 'ok'},
       {'mse_list': [140.3683612131413, 109.52284409031196, 54.47561418704822, 28.498423832780215, 17.67778675768], 'loss': 70.10860601619234, 'status': 'ok'},
       {'mse_list

In [129]:
[i for i in bo24_results.result.values][10]

{'mse_list': [118.6659095708028,
  63.067792083583385,
  41.052858203249016,
  24.44213454381191,
  14.121152314250711],
 'loss': 52.269969343139564,
 'status': 'ok'}

In [131]:
bo24_results.misc.iloc[0]

{'tid': 0,
 'cmd': ('domain_attachment', 'FMinIter_Domain'),
 'workdir': None,
 'idxs': {'colsample_bytree': [0],
  'gamma': [0],
  'learning_rate': [0],
  'max_depth': [0],
  'min_child_weight': [0],
  'n_estimators': [0],
  'subsample': [0]},
 'vals': {'colsample_bytree': [0.9],
  'gamma': [3.5],
  'learning_rate': [0.15000000000000002],
  'max_depth': [7.0],
  'min_child_weight': [4.0],
  'n_estimators': [150.0],
  'subsample': [1.0]}}

In [132]:
preselect_params_24_sagemaker = {"alpha":0.370220603,
                        "colsample_bytree":0.313402452,
                        "early_stopping_rounds":100,
                        "eta":0.010141415,
                        "gamma":4,
                        "max_depth":10,
                        "min_child_weight":7.460119372,
                        "num_round":666,
                        "objective":"reg:linear",
                        "rate_drop":0.3,
                        "silent":1,
                        "subsample":0.617982585,
                        "tweedie_variance_power":1.4}

# preselect_params_24 = {'colsample_bytree': 0.30000000000000004,
#                          'gamma': 1.5,
#                          'learning_rate': 0.125,
#                          'max_depth': 9,
#                          'min_child_weight': 3.0,
#                          'n_estimators': 700,
#                          'subsample': 0.55}

preselect_params_24 = {'colsample_bytree': 0.9,
                      'gamma': 3.5,
                      'learning_rate': 0.15,
                      'max_depth': 7,
                      'min_child_weight': 4.0,
                      'n_estimators': 150,
                      'subsample': 1.0}

In [134]:
orchestrator24.runRollingCustomFeatureExtractor(preselect_params_24, X_24_preselect)

Running custom feature selection module ....
Creating baseline model to extract features
all features (2688, 10336)
12
# features 10336 # thresholds 12; thresholds:[0.01227679 0.00892857 0.00669643 0.00613839 0.00446429 0.00390625
 0.00279018 0.00223214 0.00167411 0.00111607 0.00055804 0.        ]
CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
X_train_cv  (1302, 10336)
3 Thresh=0.01228, n=1, mse: 251.105
CV - Window 4
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
X_train_cv  (1617, 10336)
4 Thresh=0.01228, n=1, mse: 81.223
CV - Window 5
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (

Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10337) (336, 10337) (231, 10337)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
X_train_cv  (2562, 10336)
7 Thresh=0.00446, n=8, mse: 14.675
{'threshold': 0.004464286, 'num_features': 8, 'mse_list': [136.94249832492858, 86.116811940815, 33.41235906489256, 22.572414984495314, 14.67454968324519], 'mean_mse': 58.74372679967532}


CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
X_train_cv  (1302, 10336)
3 Thresh=0.00391, n=12, mse: 104.745
CV - Window 4
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
X_

Window Train/Val/Test shape (1911, 10337) (336, 10337) (546, 10337)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
X_train_cv  (2247, 10336)
6 Thresh=0.00112, n=272, mse: 17.593
CV - Window 7
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10337) (336, 10337) (231, 10337)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
X_train_cv  (2562, 10336)
7 Thresh=0.00112, n=272, mse: 7.078
{'threshold': 0.0011160715, 'num_features': 272, 'mse_list': [54.86282586406053, 48.70211355769078, 44.600514801959186, 17.59252020592548, 7.077724802636396], 'mean_mse': 34.567139846454474}


CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10337) (336, 10337) (336, 10337)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
X_train_cv  (1302, 10336)
3 Thresh=0.00056, n=1334, mse: 59.520
CV - Window 4
Train

In [135]:
orchestrator24.rcfe.summary

Unnamed: 0,threshold,num_features,mse_list,mean_mse
0,0.012277,1,"[251.10475750511748, 81.22296992591856, 58.965...",88.216409
1,0.008929,2,"[228.42751571715976, 75.16984463184995, 64.328...",83.613553
2,0.006696,4,"[223.28589980212664, 74.9333470974012, 81.6546...",86.789896
3,0.006138,6,"[231.3345894234506, 82.29263470739556, 82.7760...",92.830703
4,0.004464,8,"[136.94249832492858, 86.116811940815, 33.41235...",58.743727
5,0.003906,12,"[104.74463328628194, 54.755929280578975, 27.44...",44.963914
6,0.00279,14,"[71.06707665541056, 50.63736819605383, 26.1060...",35.153096
7,0.002232,28,"[57.1084370262155, 41.3008093922616, 20.896051...",29.246528
8,0.001674,76,"[58.40954728673179, 49.47227697274227, 30.9987...",32.208214
9,0.001116,272,"[54.86282586406053, 48.70211355769078, 44.6005...",34.56714


In [136]:
selected_features24 = ['date'] + list(orchestrator24.rcfe.mcr.feature_importance_df[orchestrator24.rcfe.mcr.feature_importance_df.threshold >= 0.002232].index)

In [137]:
orchestrator24.performHyperparameterOptimization(X_24_preselect[selected_features24], 100)

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231

[56.142475202391424, 58.54837675084619, 20.47056851539754, 19.361070664672823, 19.101768384263533]
mean mse: 34.724851903514306
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-

Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231, 29)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[52.58794517784207, 57.98000231005614, 16.746808622878852, 19.435915762141438, 20.925747278521094]
mean mse: 33.53528383028792
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-

Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231, 29)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[69.97900187573657, 48.66361775666506, 34.177243503089066, 24.380514575932068, 19.51320447092034]
mean mse: 39.34271643646862
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (3

Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231, 29)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[71.9279968603582, 55.00529555976099, 20.13116437735047, 19.129944910019894, 23.67441472401715]
mean mse: 37.97376328630134
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val

Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231, 29)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[62.20841785092426, 53.33983351534971, 20.328939782488586, 22.528209859072597, 23.22630164838344]
mean mse: 36.32634053124372
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07

Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231, 29)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[65.72708814520028, 60.449223876282176, 22.69457452856961, 20.235627202759737, 17.031850664851277]
mean mse: 37.22767288353261
Train [2017

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231

[62.49790774744759, 62.253096335004514, 19.674337295759784, 19.681664441229106, 27.85834397178694]
mean mse: 38.39306995824559
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-0

Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231, 29)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[64.6344236066613, 50.73089607128618, 23.97519325122897, 16.037535785259276, 22.323004824361814]
mean mse: 35.540210707759506
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-0

Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231, 29)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[73.05799310875769, 56.33807331722249, 25.878767707871216, 28.64052717726264, 27.252653920368136]
mean mse: 42.23360304629644
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (3

Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231, 29)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[68.99913534610971, 56.38055688737387, 23.185359882347942, 18.931627499141673, 21.405115926130268]
mean mse: 37.78035910822069
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/

Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231, 29)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[56.886426147483334, 57.41209235253646, 22.707259641597624, 21.512724452610975, 18.45300565633185]
mean mse: 35.39430165011204
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-0

Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231, 29)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[78.30403589646923, 62.768310073829376, 27.112013169910362, 21.561026166292137, 32.066108128877595]
mean mse: 44.36229868707575
Train [201

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 29) (336, 29) (336, 29)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 29) (336, 29) (546, 29)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 29) (336, 29) (231

In [138]:
orchestrator24.bo.best

{'colsample_bytree': 0.7000000000000001,
 'gamma': 2.5,
 'learning_rate': 0.2,
 'max_depth': 7.0,
 'min_child_weight': 1.0,
 'n_estimators': 550.0,
 'subsample': 0.7000000000000001}

In [139]:
# postselect_params_24 = {'colsample_bytree': 0.45,
#                          'gamma': 4.0,
#                          'learning_rate': 0.05,
#                          'max_depth': 4,
#                          'min_child_weight': 2.0,
#                          'n_estimators': 750,
#                          'subsample': 0.95}

postselect_params_24 = {'colsample_bytree': 0.70,
                         'gamma': 2.5,
                         'learning_rate': 0.2,
                         'max_depth': 7,
                         'min_child_weight': 1.0,
                         'n_estimators': 550,
                         'subsample': 0.70}

In [295]:
orchestrator24.corr_selected_features_bi_week[selected_features24].to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_2_4_postselect.csv')

In [296]:
orchestrator24.corr_selected_features_bi_week_y.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_2_4.csv')

In [178]:
orchestrator24.deploy(0.002232, postselect_params_24)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 30) (336, 30) (336, 30)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 30) (336, 30) (336, 30)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 30) (336, 30) (336, 30)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 30) (336, 30) (546, 30)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 30) (336, 30) (231

In [45]:
# running feature selection for 4-6 week model

In [46]:
#orchestrator46 = Orchestrator(full_features, full_data_4_6, 'week_4_to_6_cases' )

In [47]:
# orchestrator46.runTsFresh()

Running TSFresh....
Selected features (4011, 15156)
Finished running TSFresh....


In [48]:
# orchestrator46.runOptimizeFeatures()

Running Feature Selection module ....
4773 features with a correlation magnitude greater than 0.97.

Original (4011, 15157) and after (4011, 10384)
Finished running Feature Selection ....


In [66]:
# X_46_preselect = orchestrator46.op.corr_selected_features
# y_46_preselect = orchestrator46.full_data_bi_week[['date', orchestrator46.y_bi_week_label]]

In [67]:
# X_46_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_46_preselect.csv')
# y_46_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_46_preselect.csv')

In [54]:
orchestrator46 = Orchestrator(y4_6.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date'), 'week_4_to_6_cases' )

In [55]:
X_46_preselect = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_46_preselect.csv', index_col = 0)
X_46_preselect.date = pd.to_datetime(X_46_preselect.date, format = '%Y-%m-%d')

In [None]:
orchestrator46.performHyperparameterOptimization(X_46_preselect, 25)

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10384) (336, 10384) (336, 10384)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10384) (336, 10384) (336, 10384)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 10384) (336, 10384) (336, 10384)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 10384) (336, 10384) (546, 10384)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/

In [56]:
preselect_params_46_sagemaker = { "alpha": 1.566422688,
                        "colsample_bytree": 0.679700695,
                        "eta": 0.062136353,
                        "gamma": 4,
                        "max_depth": 10,
                        "min_child_weight": 9.214395021,
                        "n_estimators": 600,
                        "rate_drop": 0.3,
                        "subsample": 0.891818256,
                        "tweedie_variance_power": 1.4 
                        }

preselect_params_46 = {'colsample_bytree': 0.35,
                        'gamma': 4.0,
                        'learning_rate': 0.125,
                        'max_depth': 11,
                        'min_child_weight': 2.0,
                        'n_estimators': 350,
                        'subsample': 0.95}


In [57]:
orchestrator46.runRollingCustomFeatureExtractor(preselect_params_46, X_46_preselect)

Running custom feature selection module ....
Creating baseline model to extract features
all features (2688, 10383)
13
# features 10383 # thresholds 13; thresholds:[0.00783771 0.00507146 0.00461042 0.00414938 0.00368834 0.00322729
 0.00276625 0.00230521 0.00184417 0.00138313 0.00092208 0.00046104
 0.        ]
CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10384) (336, 10384) (336, 10384)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
X_train_cv  (1302, 10383)
3 Thresh=0.00784, n=1, mse: 96.011
CV - Window 4
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10384) (336, 10384) (336, 10384)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
X_train_cv  (1617, 10383)
4 Thresh=0.00784, n=1, mse: 84.528
CV - Window 5
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/T

Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10384) (336, 10384) (231, 10384)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
X_train_cv  (2562, 10383)
7 Thresh=0.00369, n=10, mse: 9.527
{'threshold': 0.0036883357, 'num_features': 10, 'mse_list': [50.332832509640696, 36.48606757250619, 37.381766917227935, 9.648404722009701, 9.527214850366652], 'mean_mse': 28.675257314350233}


CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10384) (336, 10384) (336, 10384)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
X_train_cv  (1302, 10383)
3 Thresh=0.00323, n=13, mse: 48.110
CV - Window 4
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10384) (336, 10384) (336, 10384)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2

Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 10384) (336, 10384) (546, 10384)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
X_train_cv  (2247, 10383)
6 Thresh=0.00138, n=115, mse: 8.783
CV - Window 7
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10384) (336, 10384) (231, 10384)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
X_train_cv  (2562, 10383)
7 Thresh=0.00138, n=115, mse: 5.787
{'threshold': 0.0013831259, 'num_features': 115, 'mse_list': [29.10207104571129, 44.97539924161602, 18.50317251635974, 8.783044314656212, 5.787344531037369], 'mean_mse': 21.430206329876125}


CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10384) (336, 10384) (336, 10384)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2

In [65]:
orchestrator46.rcfe.summary

Unnamed: 0,threshold,num_features,mse_list,mean_mse
0,0.007838,1,"[96.01124097957597, 84.52831216896432, 87.7036...",62.467202
1,0.005071,2,"[72.5939388033789, 77.06552081754683, 61.03384...",55.471513
2,0.00461,5,"[71.3112827201162, 60.22254825770118, 72.07660...",51.593909
3,0.004149,8,"[49.50330081621364, 45.71297779305811, 50.5905...",34.488944
4,0.003688,10,"[50.332832509640696, 36.48606757250619, 37.381...",28.675257
5,0.003227,13,"[48.10976106493251, 33.212752818892035, 28.678...",26.661191
6,0.002766,19,"[32.438236808872034, 34.92581332507123, 24.064...",22.738295
7,0.002305,25,"[28.15466641078222, 35.17952183346277, 20.7024...",20.04641
8,0.001844,49,"[26.47072905100062, 39.94320993309955, 15.0901...",19.018926
9,0.001383,115,"[29.10207104571129, 44.97539924161602, 18.5031...",21.430206


In [64]:
# orchestrator462 = Orchestrator(y4_6.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date'), 'week_4_to_6_cases')
# orchestrator462.tf = orchestrator46.tf
# orchestrator462.op = orchestrator46.op
# orchestrator462.corr_selected_features_bi_week = orchestrator46.corr_selected_features_bi_week
# orchestrator462.corr_selected_features_bi_week_y = orchestrator46.corr_selected_features_bi_week_y
# orchestrator462.rcfe = orchestrator46.rcfe

In [66]:
selected_features46 = ['date'] + list(orchestrator46.rcfe.mcr.feature_importance_df[orchestrator46.rcfe.mcr.feature_importance_df.threshold >= 0.001844].index)

In [134]:
orchestrator46.corr_selected_features_bi_week[selected_features46].to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_4_6_postselect.csv')

In [69]:
orchestrator46.performHyperparameterOptimization(X_46_preselect[selected_features46], 100)

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231

[40.947549014148784, 44.149933796481704, 9.30794213104691, 20.87795720721483, 4.773806487027536]
mean mse: 24.011437727183953
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07

Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231, 50)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[36.15453052817052, 50.58751176287239, 14.036072756515258, 15.677892501840095, 5.528601460526408]
mean mse: 24.39692180198493
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-0

Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231, 50)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[37.15578523961182, 35.26502670495269, 13.55356826750791, 13.161129920667378, 4.450677205866637]
mean mse: 20.717237467721286
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (3

Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231, 50)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[28.715759025941832, 37.933975704625674, 10.506206857269294, 14.615122373424041, 3.949711031637072]
mean mse: 19.14415499857958
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train

Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231, 50)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[36.54348281496981, 44.76538289954845, 11.538034828599491, 14.028571907391028, 3.3802531302230134]
mean mse: 22.05114511614636
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-0

Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231, 50)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[35.26516322226437, 39.218413690133595, 13.38948586832398, 17.244275002214696, 3.597068026065459]
mean mse: 21.74288116180042
Train [2017-

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231

Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[44.253360810395975, 30.93419705814174, 14.081079314819217, 13.663457691079651, 4.411130792130755]
mean mse: 21.468645133313466
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (

Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231, 50)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[31.475143665087266, 42.328136897678796, 14.649853368938869, 13.736485886015824, 3.311673990977205]
mean mse: 21.100258761739592
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [201

Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231, 50)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[27.853518092763743, 32.56248486031864, 14.502137850007026, 10.185312187699644, 4.226822152454095]
mean mse: 17.86605502864863
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (

Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231, 50)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[33.563606079047744, 39.765235716675114, 13.797361803164145, 13.816629265060605, 4.4728847245497665]
mean mse: 21.083143517699476
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Tra

Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231, 50)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[539.6409815294328, 355.77831089297484, 283.344378522231, 125.59562762197143, 105.15474783392483]
mean mse: 281.90280928010696
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-0

Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231, 50)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[539.6409815294328, 355.77831089297484, 283.344378522231, 125.59562762197143, 105.15474783392483]
mean mse: 281.90280928010696
Train [2017

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 50) (336, 50) (336, 50)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 50) (336, 50) (546, 50)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 50) (336, 50) (231

In [70]:
orchestrator46.bo.best

{'colsample_bytree': 0.6000000000000001,
 'gamma': 3.5,
 'learning_rate': 0.025,
 'max_depth': 9.0,
 'min_child_weight': 8.0,
 'n_estimators': 100.0,
 'subsample': 0.7000000000000001}

In [300]:
#orchestrator462.corr_selected_features_bi_week_y.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_4_6.csv')

In [71]:
# postselect_params_46 = {'colsample_bytree': 0.55,
#                          'gamma': 3.5,
#                          'learning_rate': 0.075,
#                          'max_depth': 6,
#                          'min_child_weight': 2.0,
#                          'n_estimators': 150,
#                          'subsample': 0.75}

# postselect_params_46 = {'colsample_bytree': 0.85,
#                          'gamma': 4.0,
#                          'learning_rate': 0.025,
#                          'max_depth': 12,
#                          'min_child_weight': 1.0,
#                          'n_estimators': 100,
#                          'subsample': 0.55}

postselect_params_46 = {'colsample_bytree': 0.60,
                         'gamma': 3.5,
                         'learning_rate': 0.025,
                         'max_depth': 9,
                         'min_child_weight': 8.0,
                         'n_estimators': 100,
                         'subsample': 0.70}


In [179]:
orchestrator46.deploy(0.001844, postselect_params_46)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 51) (336, 51) (336, 51)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 51) (336, 51) (336, 51)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 51) (336, 51) (336, 51)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 51) (336, 51) (546, 51)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 51) (336, 51) (231

In [49]:
# running feature selection for 6-8 week model

In [50]:
#orchestrator68 = Orchestrator(full_features, full_data_6_8, 'week_6_to_8_cases' )

In [145]:
# orchestrator68 = Orchestrator(full_features, full_data_6_8, 'week_6_to_8_cases' )
# orchestrator68.tf = orchestrator682.tf
# orchestrator68.op = orchestrator682.op
# orchestrator68.corr_selected_features_bi_week = orchestrator682.corr_selected_features_bi_week
# orchestrator68.corr_selected_features_bi_week_y = orchestrator682.corr_selected_features_bi_week_y
# orchestrator68.rcfe = orchestrator682.rcfe

In [51]:
#orchestrator68.runTsFresh()

Running TSFresh....
Selected features (3717, 15087)
Finished running TSFresh....


In [52]:
#orchestrator68.runOptimizeFeatures()

Running Feature Selection module ....
4719 features with a correlation magnitude greater than 0.97.

Original (3717, 15088) and after (3717, 10369)
Finished running Feature Selection ....


In [68]:
# X_68_preselect = orchestrator68.op.corr_selected_features
# y_68_preselect = orchestrator68.full_data_bi_week[['date', orchestrator68.y_bi_week_label]]

In [69]:
# X_68_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_68_preselect.csv')
# y_68_preselect.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_68_preselect.csv')

In [73]:
orchestrator68 = Orchestrator(y6_8.dropna().merge(full_features, how = 'left', on = ['gov_iso', 'date']).sort_values('date'), 'week_6_to_8_cases' )

In [74]:
X_68_preselect = pd.read_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_68_preselect.csv', index_col = 0)
X_68_preselect.date = pd.to_datetime(X_68_preselect.date, format = '%Y-%m-%d')

In [108]:
orchestrator68.performHyperparameterOptimization(X_68_preselect, 20)

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10369) (336, 10369) (336, 10369)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10369) (336, 10369) (336, 10369)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 10369) (336, 10369) (336, 10369)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 10369) (336, 10369) (546, 10369)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/

Window Train/Val/Test shape (1281, 10369) (336, 10369) (336, 10369)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 10369) (336, 10369) (336, 10369)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 10369) (336, 10369) (546, 10369)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10369) (336, 10369) (231, 10369)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[43.957924715230675, 31.25903012556706, 22.037100089478308, 11.4338612140947, 11.694577860095723]
mean mse: 24.076498800893297
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test

In [120]:
orchestrator68.bo.best

{'colsample_bytree': 0.45,
 'gamma': 3.0,
 'learning_rate': 0.15000000000000002,
 'max_depth': 7.0,
 'min_child_weight': 10.0,
 'n_estimators': 250.0,
 'subsample': 0.7000000000000001}

In [75]:
preselect_params_68_sagemaker = {"alpha":1.170269464,
                                "colsample_bytree":0.677068393,
                                "early_stopping_rounds":100,
                                "eta":0.041417645,
                                "gamma":4,
                                "max_depth":12,
                                "min_child_weight":1.090424326,
                                "num_round":497,
                                "rate_drop":0.3,
                                "subsample":0.883495678,
                                "tweedie_variance_power":1.4 }

preselect_params_68= {'colsample_bytree': 0.45,
                         'gamma': 3.0,
                         'learning_rate': 0.152,
                         'max_depth': 7,
                         'min_child_weight': 10.0,
                         'n_estimators': 250,
                         'subsample': 0.70}

In [76]:
orchestrator68.runRollingCustomFeatureExtractor(preselect_params_68, X_68_preselect)

Running custom feature selection module ....
Creating baseline model to extract features
all features (2688, 10368)
10
# features 10368 # thresholds 10; thresholds:[0.00726392 0.0066586  0.00423729 0.00363196 0.00302663 0.00242131
 0.00181598 0.00121065 0.00060533 0.        ]
CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10369) (336, 10369) (336, 10369)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
X_train_cv  (1302, 10368)
3 Thresh=0.00726, n=1, mse: 48.590
CV - Window 4
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10369) (336, 10369) (336, 10369)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
X_train_cv  (1617, 10368)
4 Thresh=0.00726, n=1, mse: 19.354
CV - Window 5
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 10369) (336, 1036

Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10369) (336, 10369) (231, 10369)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
X_train_cv  (2562, 10368)
7 Thresh=0.00303, n=12, mse: 8.174
{'threshold': 0.0030266345, 'num_features': 12, 'mse_list': [38.078019850613536, 38.322129399977506, 18.896325772319987, 11.933688321862256, 8.173906015441437], 'mean_mse': 23.080813872042942}


CV - Window 3
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 10369) (336, 10369) (336, 10369)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
X_train_cv  (1302, 10368)
3 Thresh=0.00242, n=22, mse: 24.077
CV - Window 4
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 10369) (336, 10369) (336, 10369)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
X_train_cv  (1617, 1036

Window Train/Val/Test shape (1911, 10369) (336, 10369) (546, 10369)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
X_train_cv  (2247, 10368)
6 Thresh=0.00000, n=10368, mse: 10.236
CV - Window 7
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 10369) (336, 10369) (231, 10369)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
X_train_cv  (2562, 10368)
7 Thresh=0.00000, n=10368, mse: 9.567
{'threshold': 0.0, 'num_features': 10368, 'mse_list': [21.97052110725286, 31.415998767193038, 18.556845022390483, 10.236157845186723, 9.567022355335618], 'mean_mse': 18.349309019471747}


   threshold num_features                                           mse_list  \
0   0.007264            1  [48.590345694993744, 19.35422173603673, 21.543...   
1   0.006659            2  [53.82605493473728, 30.542101385357643, 21.020...   
2   0.004237            4  [87.55425827785686, 46.82209172792178, 3

In [77]:
orchestrator68.rcfe.summary

Unnamed: 0,threshold,num_features,mse_list,mean_mse
0,0.007264,1,"[48.590345694993744, 19.35422173603673, 21.543...",28.788041
1,0.006659,2,"[53.82605493473728, 30.542101385357643, 21.020...",28.841442
2,0.004237,4,"[87.55425827785686, 46.82209172792178, 39.5530...",43.332854
3,0.003632,10,"[28.051858730132977, 27.625896577647474, 26.70...",21.533327
4,0.003027,12,"[38.078019850613536, 38.322129399977506, 18.89...",23.080814
5,0.002421,22,"[24.077313937734214, 40.704113919699125, 14.40...",18.893013
6,0.001816,60,"[26.927297564523478, 30.18431732565236, 15.715...",17.350091
7,0.001211,225,"[32.7562837260561, 28.12962199353752, 16.31458...",19.308733
8,0.000605,1310,"[26.0555038377847, 36.0719404055451, 21.780336...",20.505325
9,0.0,10368,"[21.97052110725286, 31.415998767193038, 18.556...",18.349309


In [78]:
selected_features68 = ['date'] + list(orchestrator68.rcfe.mcr.feature_importance_df[orchestrator68.rcfe.mcr.feature_importance_df.threshold >= 0.001816].index)

In [79]:
orchestrator68.performHyperparameterOptimization(X_68_preselect[selected_features68], 100)

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231

Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[42.57016470226735, 34.859884330857156, 26.030077565125822, 11.508888636542013, 8.025592883884853]
mean mse: 24.59892162373544
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (3

Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231, 23)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[33.59837991190923, 32.87462746367563, 12.762899699937266, 8.05442901661546, 6.017938526024163]
mean mse: 18.661654923632348
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09

Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231, 23)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[360.76569886710104, 290.06820967564323, 146.42957835001957, 108.64448525904967, 112.28648832670142]
mean mse: 203.63889209570297
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23

Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231, 23)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[35.14763223416115, 29.284471968457368, 18.498721966199007, 10.679904186880536, 5.794041945528128]
mean mse: 19.88095446024524
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/

Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231, 23)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[30.38981675946473, 33.16027157462385, 20.077974219253537, 7.778643997932234, 5.457233949836956]
mean mse: 19.372788100222262
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07

Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231, 23)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[38.63198685669499, 40.83694145068348, 17.150512695730804, 11.511622722327374, 6.41430643781845]
mean mse: 22.90907403265102
Train [2017-0

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231

Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[33.27005858150594, 29.422376493022803, 24.882929214888218, 14.416865129573294, 5.756095701093285]
mean mse: 21.54966502401671
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (3

Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231, 23)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[43.999462940688225, 31.3023281282369, 16.304168608049363, 10.83326487565622, 6.3214865095312085]
mean mse: 21.752142212432386
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-

Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231, 23)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[27.29564753273882, 28.610922454592778, 16.74257723619087, 8.422307632951256, 5.545386865937164]
mean mse: 17.323368344482176
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (3

Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231, 23)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[360.76569886710104, 290.06820967564323, 146.42957835001957, 108.64448525904967, 112.28648832670142]
mean mse: 203.63889209570297
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Tra

Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231, 23)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[24.603834823661447, 29.465862608330887, 15.421489425813585, 7.469325857408628, 5.229622022994627]
mean mse: 16.438026947641838
Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-

Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231, 23)
Window Train/Val/Test shape (2226, 2) (336, 2) (231, 2)
[360.76569886710104, 290.06820967564323, 146.42957835001957, 108.64448525904967, 112.28648832670142]
mean mse: 203.63889209570297
Train [2

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 23) (336, 23) (336, 23)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 23) (336, 23) (546, 23)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 23) (336, 23) (231

In [80]:
orchestrator68.bo.best

{'colsample_bytree': 0.4,
 'gamma': 0.0,
 'learning_rate': 0.025,
 'max_depth': 8.0,
 'min_child_weight': 1.0,
 'n_estimators': 150.0,
 'subsample': 0.9}

In [291]:
orchestrator68.corr_selected_features_bi_week[selected_features68].to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/X_6_8.csv')

In [292]:
orchestrator68.corr_selected_features_bi_week_y.to_csv('/Users/Rohil/Documents/iGEM/yemen/sagemaker_input/y_6_8.csv')

In [81]:
# postselect_params_68 = {'colsample_bytree': 0.45,
#                          'gamma': 4.0,
#                          'learning_rate': 0.025,
#                          'max_depth': 10,
#                          'min_child_weight': 8.0,
#                          'n_estimators': 100,
#                          'subsample': 0.8}

postselect_params_68 = {'colsample_bytree': 0.4,
                         'gamma': 0.0,
                         'learning_rate': 0.025,
                         'max_depth': 8,
                         'min_child_weight': 1.0,
                         'n_estimators': 150,
                         'subsample': 0.9}

In [180]:
orchestrator68.deploy(0.001816, postselect_params_68)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 24) (336, 24) (336, 24)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 24) (336, 24) (336, 24)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 24) (336, 24) (336, 24)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 24) (336, 24) (546, 24)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 24) (336, 24) (231

In [104]:
orchestrator68.corr_selected_features_bi_week_y.sort_values(by='date')

Unnamed: 0,date,week_6_to_8_cases
2456,2017-06-19,7.143574
2441,2017-06-19,49.379311
2444,2017-06-19,17.993787
2443,2017-06-19,44.854502
2442,2017-06-19,3.873987
2446,2017-06-19,22.996410
2439,2017-06-19,28.036947
2438,2017-06-19,70.382174
2437,2017-06-19,10.251431
2445,2017-06-19,10.268290


In [None]:
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
from sagemaker.predictor import csv_serializer    # Converts strings for HTTP POST requests on inference

from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

prefix = 'sagemaker/xgboost-ts'

In [None]:
deployRegressor = True
if deployRegressor == True:
    X = corr_selected_features_4_6[selected_features]
    y = corr_selected_features_4_6_y 
else:
    X = corr_selected_features_4_6 
    y = corr_selected_features_4_6_y 
    
X_train, X_cv, X_test = train_cv_test_split(X)
y_train, y_cv, y_test = train_cv_test_split(y)

In [35]:
X_train.head()

In [None]:
y_train.head()

In [None]:
pd.concat([y_train['week_4_to_6_cases'], X_train], axis=1).to_csv('train.csv', index=False, header=False)
pd.concat([y_cv['week_4_to_6_cases'], X_cv], axis=1).to_csv('validation.csv', index=False, header=False)
pd.concat([y_test['week_4_to_6_cases'], X_test], axis=1).to_csv('test.csv', index=False, header=False)


In [None]:
# Upload for Sagemaker jobs to pickup
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')

In [None]:
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')
print('s3://{}/{}/train'.format(bucket, prefix))

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri

sess = sagemaker.Session()

container = get_image_uri(region, 'xgboost')

xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)

In [None]:
# Static parameters
# 'auc'
# eval_metric: Metric 'multi:softmax' is not supported. 
# Parameter 'eval_metric' should be one of these options:'rmse', 'mae', 'logloss', 'error', 'merror', 'mlogloss', 'auc', 'ndcg', 'map', 'ndcg-', 'map-', 'poisson-nloglik', 'gamma-nloglik', 'gamma-deviance', 'tweedie-nloglik'.
xgb.set_hyperparameters(eval_metric='mae', 
                        max_depth=3,
                        eta=0.1,
                        gamma=4,
                        min_child_weight=5,
                        subsample=0.9,
                        colsample_bytree=0.8,
                        silent=0,
                        objective='reg:linear',
                        num_round=500,
                        early_stopping_rounds=100,
                        rate_drop=0.3,
                        tweedie_variance_power=1.4)

#max_depth=[3,5,7]
#subsample=[0.7,0.8,0.9],
#colsample_bytree=[0.7,0.8,0.9],
hyperparameter_ranges = {
    'eta': ContinuousParameter(0.0, 1.0),
    'min_child_weight': ContinuousParameter(1.0, 10.0),
    'alpha': ContinuousParameter(0, 2),
    'max_depth': IntegerParameter(3, 12),
    'subsample': ContinuousParameter(0.3, 0.9),
    'colsample_bytree': ContinuousParameter(0.3, 0.9),
    'num_round': IntegerParameter(50, 1000)
}

In [None]:
#https://sagemaker.readthedocs.io/en/latest/tuner.html
tuner = HyperparameterTuner(xgb,
                            objective_metric_name = 'validation:mae',
                            objective_type = 'Minimize',
                            strategy='Bayesian',
                            hyperparameter_ranges = hyperparameter_ranges,
                            max_jobs=20,
                            max_parallel_jobs=5)

In [None]:
# Fit with train & validation data sets
tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [None]:
#Let's just run a quick check of the hyperparameter tuning jobs status to make sure it started successfully.
boto3.client('sagemaker').describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)['HyperParameterTuningJobStatus']

In [None]:
#Deploy the best trained model
xgb_predictor = tuner.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

In [None]:
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

In [None]:
def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

predictions = predict(X_test.as_matrix()[:, 1:])
len(predictions)

In [None]:
print('Test sample X {} y {} Predictions {}'.format(X_test.shape, y_test.shape, len(predictions)))

In [None]:
mean_squared_error(y_test['week_4_to_6_cases'], predictions)

In [None]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

In [190]:
bo.run()

Train [2017-07-01 2017-08-15] Val [2017-08-16 2017-08-31] Test [2017-09-01 2017-09-16]
Window Train/Val/Test shape (966, 38) (336, 38) (336, 38)
Window Train/Val/Test shape (966, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-08-30] Val [2017-08-31 2017-09-15] Test [2017-09-16 2017-10-01]
Window Train/Val/Test shape (1281, 38) (336, 38) (336, 38)
Window Train/Val/Test shape (1281, 2) (336, 2) (336, 2)
Train [2017-07-01 2017-09-14] Val [2017-09-15 2017-09-30] Test [2017-10-01 2017-10-16]
Window Train/Val/Test shape (1596, 38) (336, 38) (336, 38)
Window Train/Val/Test shape (1596, 2) (336, 2) (336, 2)
Rolling window to end date
Train [2017-07-01 2017-09-29] Val [2017-09-30 2017-10-15] Test [2017-10-16 2017-11-10]
Window Train/Val/Test shape (1911, 38) (336, 38) (546, 38)
Window Train/Val/Test shape (1911, 2) (336, 2) (546, 2)
Rolling window to end date
Train [2017-07-01 2017-10-14] Val [2017-10-15 2017-10-30] Test [2017-10-31 2017-11-10]
Window Train/Val/Test shape (2226, 38) (336, 38) (231