# Notebook options

In [42]:
# Toggle for hyperparameter tuning: cells that check this flag run a hypopt
# GridSearch when it is True and fit with fixed, hard-coded hyperparameters
# when it is False.
grid_search = False

# Imports and display options

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from hypopt import GridSearch
from preprocessing import get_df
from preprocessing import scale

In [2]:
# Show every column, and up to 75 rows, when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 75)

# Load data

In [3]:
# Load the dataset through the project-local `preprocessing.get_df` helper.
# NOTE(review): the data source and column schema live in that module, not here.
df = get_df()

# Wall time models

## Feature selection

In [4]:
# Keep only rows where both the requested timespan and the requested area
# rectangle are present.
time_df = df[df['rqst_timespan'].notnull() & df['rqst_area_rect'].notnull()]

# Predictor columns for the wall-time models.
time_X_features = [
    'PP', 'SP', 'BR',
    'rqst_timespan', 'rqst_area_rect', 'converted',
    'params_num', 'grid_def_num', 'level_num',
    'ds084.1', 'ds631.1', 'ds083.3', 'ds094.0', 'ds083.2',
]
# Single target column.
time_y_features = ['wall_time']

time_X, time_y = time_df[time_X_features], time_df[time_y_features]

## Train/validation/test split

In [5]:
# Fractions of the filtered data assigned to the train / validation / test
# subsets. They must sum to 1.
time_train_amt = 0.5
time_val_amt = 0.25
time_test_amt = 0.25

# Fail fast on a mistyped fraction instead of silently producing a skewed split.
assert abs(time_train_amt + time_val_amt + time_test_amt - 1.0) < 1e-9, \
    "train/validation/test fractions must sum to 1"

In [6]:
# First split: carve off the training set; everything else is held out.
time_X_train, time_X_target, time_y_train, time_y_target = train_test_split(
    time_X, time_y,
    test_size=1 - time_train_amt,
    random_state=3)

# Second split: divide the held-out rows into validation and test sets.
# The test fraction has to be expressed relative to the held-out portion
# (1 - time_train_amt), not relative to the training portion; the two only
# coincide when time_train_amt is exactly 0.5.
time_X_val, time_X_test, time_y_val, time_y_test = train_test_split(
    time_X_target, time_y_target,
    test_size=time_test_amt / (1 - time_train_amt),
    random_state=3)


In [7]:
# Flatten each single-column target into a 1-D array.
time_y_train, time_y_val, time_y_test = (
    np.ravel(target) for target in (time_y_train, time_y_val, time_y_test)
)

## Scaling

In [9]:
# Scale the three feature sets with the project-local `preprocessing.scale` helper.
# NOTE(review): presumably the scaler is fit on the training split only — confirm
# in the preprocessing module.
(time_X_train_norm,
 time_X_val_norm,
 time_X_test_norm) = scale(time_X_train, time_X_val, time_X_test)

## Trees, Forest, Gradient Boosts

### Tree

In [43]:
if not grid_search:
    # Fixed hyperparameters: fit a single decision tree on the training split.
    time_tree_params = {'max_depth': 14, 'random_state': 3}
    time_tree = DecisionTreeRegressor(**time_tree_params)
    time_tree.fit(time_X_train_norm, time_y_train)
else:
    # Search max_depth over 2..14; the validation split is handed to
    # GridSearch.fit alongside the training split.
    time_tree_param_grid = {
        'random_state': [3],
        'max_depth': range(2, 15),
    }
    time_tree_gs = GridSearch(
        model=DecisionTreeRegressor(),
        param_grid=time_tree_param_grid,
        parallelize=False,
    )
    time_tree_gs.fit(time_X_train_norm, time_y_train,
                     time_X_val_norm, time_y_val)
    print(time_tree_gs.best_params)
    time_tree = time_tree_gs.best_estimator_

### Forest

In [44]:
if not grid_search:
    # Fixed hyperparameters: fit one random forest on the training split.
    time_forest_params = {'max_depth': 14, 'n_estimators': 500, 'random_state': 3}
    time_forest = RandomForestRegressor(**time_forest_params)
    time_forest.fit(time_X_train_norm, time_y_train)
else:
    # Search max_depth over 2..14 and four ensemble sizes; the validation
    # split is handed to GridSearch.fit alongside the training split.
    time_forest_param_grid = {
        'random_state': [3],
        'max_depth': range(2, 15),
        'n_estimators': [100, 200, 300, 500],
    }
    time_forest_gs = GridSearch(
        model=RandomForestRegressor(),
        param_grid=time_forest_param_grid,
        parallelize=False,
    )
    time_forest_gs.fit(time_X_train_norm, time_y_train,
                       time_X_val_norm, time_y_val)
    print(time_forest_gs.best_params)
    time_forest = time_forest_gs.best_estimator_

### Gradient boosted trees

In [45]:
if not grid_search:
    # Fixed hyperparameters: fit one gradient-boosted model on the training split.
    time_gboost_params = {'max_depth': 9, 'n_estimators': 100, 'random_state': 3}
    time_gboost = GradientBoostingRegressor(**time_gboost_params)
    time_gboost.fit(time_X_train_norm, time_y_train)
else:
    # Search max_depth over 2..14 and four ensemble sizes; the validation
    # split is handed to GridSearch.fit alongside the training split.
    time_gboost_param_grid = {
        'random_state': [3],
        'max_depth': range(2, 15),
        'n_estimators': [100, 200, 300, 500],
    }
    time_gboost_gs = GridSearch(
        model=GradientBoostingRegressor(),
        param_grid=time_gboost_param_grid,
        parallelize=False,
    )
    time_gboost_gs.fit(time_X_train_norm, time_y_train,
                       time_X_val_norm, time_y_val)
    print(time_gboost_gs.best_params)
    time_gboost = time_gboost_gs.best_estimator_

## Linear regressors

### Basic linear regression

In [23]:
# Plain linear regression fitted on the scaled wall-time training split.
linear = LinearRegression().fit(time_X_train_norm, time_y_train)

### Ridge regression

In [24]:
# Ridge regression fitted on the same training split at three values of
# alpha (10, 1 and 0.1); the variable suffix encodes the alpha used.
ridge10 = Ridge(alpha=10).fit(time_X_train_norm, time_y_train)
ridge1 = Ridge(alpha=1).fit(time_X_train_norm, time_y_train)
ridge01 = Ridge(alpha=0.1).fit(time_X_train_norm, time_y_train)

### Lasso regression

In [25]:
# Lasso regression with alpha=0.01 fitted on the scaled wall-time training split.
# NOTE(review): max_iter is set to 100000, presumably so the solver converges — confirm.
lasso = Lasso(alpha=0.01, max_iter=100000).fit(time_X_train_norm, time_y_train)

# Used memory models

In [26]:
# Keep only rows where both the requested timespan and the requested area
# rectangle are present.
mem_df = df[df['rqst_timespan'].notnull() & df['rqst_area_rect'].notnull()]

# Predictor columns for the used-memory models.
mem_X_features = [
    'PP', 'SP', 'BR',
    'rqst_timespan', 'rqst_area_rect', 'converted',
    'params_num', 'grid_def_num', 'level_num',
    'ds084.1', 'ds631.1', 'ds083.3', 'ds094.0', 'ds083.2',
]
# Single target column.
mem_y_features = ['used_mem']

mem_X, mem_y = mem_df[mem_X_features], mem_df[mem_y_features]

## Train/validation/test split

In [27]:
# Fractions of the filtered data assigned to the train / validation / test
# subsets. They must sum to 1.
mem_train_amt = 0.5
mem_val_amt = 0.25
mem_test_amt = 0.25

# Fail fast on a mistyped fraction instead of silently producing a skewed split.
assert abs(mem_train_amt + mem_val_amt + mem_test_amt - 1.0) < 1e-9, \
    "train/validation/test fractions must sum to 1"

In [28]:
# First split: carve off the training set; everything else is held out.
mem_X_train, mem_X_target, mem_y_train, mem_y_target = train_test_split(
    mem_X, mem_y,
    test_size=1 - mem_train_amt,
    random_state=3)

# Second split: divide the held-out rows into validation and test sets.
# The test fraction has to be expressed relative to the held-out portion
# (1 - mem_train_amt), not relative to the training portion; the two only
# coincide when mem_train_amt is exactly 0.5.
mem_X_val, mem_X_test, mem_y_val, mem_y_test = train_test_split(
    mem_X_target, mem_y_target,
    test_size=mem_test_amt / (1 - mem_train_amt),
    random_state=3)


In [29]:
# Flatten each single-column target into a 1-D array.
mem_y_train, mem_y_val, mem_y_test = (
    np.ravel(target) for target in (mem_y_train, mem_y_val, mem_y_test)
)

## Scaling

In [30]:
# Scale the three feature sets with the project-local `preprocessing.scale` helper.
# NOTE(review): presumably the scaler is fit on the training split only — confirm
# in the preprocessing module.
(mem_X_train_norm,
 mem_X_val_norm,
 mem_X_test_norm) = scale(mem_X_train, mem_X_val, mem_X_test)

## Trees, Forest, Gradient Boosts

### Tree

In [31]:
# Honour the notebook-level `grid_search` option, as the wall-time section does,
# so a full re-run does not repeat the search when it is switched off.
if grid_search:
    # Search max_depth over 2..14; the validation split is handed to
    # GridSearch.fit alongside the training split.
    mem_tree_param_grid = {'random_state':[3], 'max_depth':range(2,15)}
    mem_tree_gs = GridSearch(model=DecisionTreeRegressor(),
                             param_grid=mem_tree_param_grid,
                             parallelize=False)
    mem_tree_gs.fit(mem_X_train_norm, mem_y_train, mem_X_val_norm, mem_y_val)
    print(mem_tree_gs.best_params)
    mem_tree = mem_tree_gs.best_estimator_

else:
    # Best parameters reported by a previous run of the grid search above.
    mem_tree_params = {'max_depth': 13, 'random_state': 3}
    mem_tree = DecisionTreeRegressor(**mem_tree_params)
    mem_tree.fit(mem_X_train_norm, mem_y_train)

### Forest

In [34]:
# Honour the notebook-level `grid_search` option, as the wall-time section does,
# so a full re-run does not repeat the search when it is switched off.
if grid_search:
    # Search max_depth over 2..14 and four ensemble sizes; the validation
    # split is handed to GridSearch.fit alongside the training split.
    mem_forest_param_grid = {'random_state':[3], 'max_depth':range(2,15),
                             'n_estimators':[100,200,300,500]}
    mem_forest_gs = GridSearch(model=RandomForestRegressor(),
                               param_grid=mem_forest_param_grid,
                               parallelize=False)
    mem_forest_gs.fit(mem_X_train_norm, mem_y_train, mem_X_val_norm, mem_y_val)
    print(mem_forest_gs.best_params)
    mem_forest = mem_forest_gs.best_estimator_

else:
    # Best parameters reported by a previous run of the grid search above.
    mem_forest_params = {'max_depth': 9, 'n_estimators': 200, 'random_state': 3}
    mem_forest = RandomForestRegressor(**mem_forest_params)
    mem_forest.fit(mem_X_train_norm, mem_y_train)

### Gradient boosted trees

In [38]:
# Honour the notebook-level `grid_search` option, as the wall-time section does,
# so a full re-run does not repeat the search when it is switched off.
if grid_search:
    # Search max_depth over 2..14 and four ensemble sizes; the validation
    # split is handed to GridSearch.fit alongside the training split.
    mem_gboost_param_grid = {'random_state':[3],
                             'max_depth':range(2,15),
                             'n_estimators':[100,200,300,500]}
    mem_gboost_gs = GridSearch(model=GradientBoostingRegressor(),
                               param_grid=mem_gboost_param_grid,
                               parallelize=False)
    mem_gboost_gs.fit(mem_X_train_norm, mem_y_train, mem_X_val_norm, mem_y_val)
    print(mem_gboost_gs.best_params)
    mem_gboost = mem_gboost_gs.best_estimator_

else:
    # Best parameters reported by a previous run of the grid search above.
    mem_gboost_params = {'max_depth': 4, 'n_estimators': 500, 'random_state': 3}
    mem_gboost = GradientBoostingRegressor(**mem_gboost_params)
    mem_gboost.fit(mem_X_train_norm, mem_y_train)