In [1]:
import torch
import sklearn
import numpy as np

In [20]:
import math
# Keys whose values np_RMSE maps through exp(x) - 1 before measuring the error.
# exp(x) - 1 is the inverse of log(1 + x), so these columns are presumably stored
# log1p-transformed -- confirm against the preprocessing code.
features_to_log = [
    'lake_elevation_m',
    'lake_totalarea_ha',
    'lake_perimeter_m',
    'lake_shorelinedevfactor',
    'lake_mix_layer_temperature',
    'lake_mix_layer_temperature_min',
    'lake_mix_layer_temperature_max',
    'lake_mix_layer_depth',
    'lake_mix_layer_depth_min',
    'lake_mix_layer_depth_max',
    'total_precipitation_sum',
    'total_precipitation_min',
    'total_precipitation_max',
]


def np_RMSE(vec1,vec2, key):
    """Root-mean-square error between two arrays of values.

    Parameters
    ----------
    vec1, vec2 : array-like
        Values to compare (typically predictions and observations). Both are
        converted to float64 NumPy arrays; the inputs are not modified.
    key : str
        Name of the target. When it appears in ``features_to_log`` both inputs
        are mapped through ``exp(x) - 1`` before the error is computed.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((vec1 - vec2) ** 2))``.

    Notes
    -----
    If the two inputs differ in shape only through length-1 axes (for example
    ``(n, 1)`` against ``(n,)``), those axes are dropped first. Without this,
    NumPy broadcasting would build an ``(n, n)`` difference matrix and return a
    meaningless value. Any other shape mismatch is left to NumPy broadcasting.
    """
    first = np.asarray(vec1, dtype=np.float64)
    second = np.asarray(vec2, dtype=np.float64)

    if first.shape != second.shape:
        squeezed_first, squeezed_second = np.squeeze(first), np.squeeze(second)
        if squeezed_first.shape == squeezed_second.shape:
            first, second = squeezed_first, squeezed_second

    #Exponentiate outputs if logged
    if key in features_to_log:
        # np.expm1 is the vectorised form of exp(x) - 1 and is more accurate near zero.
        first = np.expm1(first)
        second = np.expm1(second)

    return np.sqrt(np.mean((first - second) ** 2))

In [3]:
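#Dataset block: data for 1 reduction method; it is read as dataset_block[<split>][par], i.e. 1 parameter (target) at a time
#Model block: a dict holding the fitted estimator under 'model'; the normalized RMSE values are written back into it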
def test_model(model_block, dataset_block, par):
    """Evaluate a fitted model on the "space_test" and "time_test" splits.

    Parameters
    ----------
    model_block : dict
        Must contain the fitted estimator under ``'model'``. It is updated in
        place with ``'space_norm_RMSE'`` and ``'time_norm_RMSE'``.
    dataset_block : mapping
        Read as ``dataset_block[split][par]["independent"]`` (model inputs) and
        ``dataset_block[split][par]["dependent"]`` (observed values).
    par : str
        Name of the target; also forwarded to ``np_RMSE`` as its ``key``.

    Returns
    -------
    dict
        The same ``model_block`` object that was passed in.

    NOTE(review): ``np_RMSE`` maps values through ``exp(x) - 1`` when ``par`` is
    in ``features_to_log``, while the mean used for normalisation is taken on
    the values as stored -- confirm both are meant to be on the same scale.
    """
    estimator = model_block['model']
    split_to_metric = (
        ("space_test", "space_norm_RMSE"),
        ("time_test", "time_norm_RMSE"),
    )

    # Compute every metric before touching model_block, so a failure on any
    # split leaves the block unmodified.
    metrics = {}
    for split_name, metric_key in split_to_metric:
        predicted = estimator.predict(dataset_block[split_name][par]["independent"])
        observed = dataset_block[split_name][par]["dependent"]
        metrics[metric_key] = np_RMSE(predicted, observed, par) / (observed.mean())

    for metric_key, value in metrics.items():
        model_block[metric_key] = value
    return model_block

In [2]:

def test_model_ffn(model_block, dataset_block, par):
    """Evaluate a fitted network on the "space_test" and "time_test" splits.

    Same contract as ``test_model``, except that the inputs are cast to float32
    and wrapped in torch tensors before being passed to ``predict``, and the
    predictions are squeezed before the error is computed.

    Parameters
    ----------
    model_block : dict
        Must contain the fitted model under ``'model'``. It is updated in place
        with ``'space_norm_RMSE'`` and ``'time_norm_RMSE'``.
    dataset_block : mapping
        Read as ``dataset_block[split][par]["independent"]`` and
        ``dataset_block[split][par]["dependent"]``.
    par : str
        Name of the target; also forwarded to ``np_RMSE`` as its ``key``.

    Returns
    -------
    dict
        The same ``model_block`` object that was passed in.
    """
    net = model_block['model']
    split_to_metric = (
        ("space_test", "space_norm_RMSE"),
        ("time_test", "time_norm_RMSE"),
    )

    # Build the input tensors for every split up front, before any prediction runs.
    inputs = {}
    for split_name, _ in split_to_metric:
        features = dataset_block[split_name][par]["independent"].astype('float32')
        inputs[split_name] = torch.from_numpy(features)

    metrics = {}
    for split_name, metric_key in split_to_metric:
        # The argument-less .view() is valid on NumPy arrays only, so predict()
        # is assumed to return an ndarray here -- TODO confirm.
        predicted = net.predict(inputs[split_name]).view().squeeze()
        observed = dataset_block[split_name][par]["dependent"]
        metrics[metric_key] = np_RMSE(predicted, observed, par) / (observed.mean())

    for metric_key, value in metrics.items():
        model_block[metric_key] = value
    return model_block

In [None]:
#Dummy

In [22]:
from sklearn.dummy import DummyRegressor

def train_dummy(formatted_dataset):
    """Fit a mean-predicting baseline for every target in ``formatted_dataset``.

    Parameters
    ----------
    formatted_dataset : dict
        Maps a target name to a dict with ``"X_train"``, ``"y_train"``,
        ``"X_test"`` and ``"y_test"`` entries.

    Returns
    -------
    dict
        Maps each target name to ``{"model", "score", "RMSE", "norm_RMSE"}``,
        where ``"score"`` is the estimator's ``score`` on the test split and
        ``"norm_RMSE"`` is the RMSE divided by the mean of ``y_test``.
    """
    dummy_models = {}
    for key, obs_set in formatted_dataset.items():
        baseline = DummyRegressor(strategy='mean')
        baseline.fit(obs_set["X_train"], obs_set["y_train"])

        X_test = obs_set["X_test"]
        y_test = obs_set["y_test"]

        entry = {"model": baseline}
        entry["score"] = baseline.score(X_test, y_test)
        entry["RMSE"] = np_RMSE(baseline.predict(X_test), y_test, key)
        entry['norm_RMSE'] = entry['RMSE'] / (y_test.mean())
        dummy_models[key] = entry
    return dummy_models

In [None]:
#RF

In [3]:
from sklearn import datasets, ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

def train_RF(formatted_dataset, params):
    """Grid-search a random forest regressor for every target.

    Parameters
    ----------
    formatted_dataset : dict
        Maps a target name to a dict with ``"X_train"``, ``"y_train"``,
        ``"X_test"`` and ``"y_test"`` entries.
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    dict
        Maps each target name to a dict with the fitted search
        (``"gridsearch"``), its best estimator (``"model"``), the search
        object's ``get_params()`` (``"params"``), the test-split ``"score"``,
        ``"RMSE"`` and ``"norm_RMSE"`` (RMSE divided by the mean of ``y_test``).

    NOTE(review): ``"params"`` holds the GridSearchCV object's own parameters,
    not the winning hyperparameters (``best_params_``) -- confirm the intent.
    """
    RF_models = {}
    for key, obs_set in formatted_dataset.items():
        #GRADIENT BOOSTING TREE AND RANDOM FOREST ARE DIFFERENT
        search = GridSearchCV(RandomForestRegressor(), params, verbose = 1)
        search.fit(obs_set["X_train"], obs_set["y_train"])

        X_test = obs_set["X_test"]
        y_test = obs_set["y_test"]

        result = {
            "gridsearch": search,
            "model": search.best_estimator_,
            "params": search.get_params(),
            "score": search.score(X_test, y_test),
            "RMSE": np_RMSE(search.predict(X_test), y_test, key),
        }
        result['norm_RMSE'] = result['RMSE'] / (y_test.mean())
        RF_models[key] = result
    return RF_models

In [3]:
from sklearn import datasets, ensemble
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

def train_SVR(formatted_dataset, params):
    """Grid-search a standard-scaled support vector regressor for every target.

    The estimator is ``make_pipeline(StandardScaler(), SVR())``, so the entries
    of ``params`` must address pipeline steps.

    Parameters
    ----------
    formatted_dataset : dict
        Maps a target name to a dict with ``"X_train"``, ``"y_train"``,
        ``"X_test"`` and ``"y_test"`` entries.
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    dict
        Maps each target name to a dict with ``"gridsearch"``, ``"model"``
        (best estimator), ``"params"`` (the search object's ``get_params()``),
        ``"score"``, ``"RMSE"`` and ``"norm_RMSE"``.
    """
    SVR_models = {}
    for key, obs_set in formatted_dataset.items():
        scaled_svr = make_pipeline(StandardScaler(), SVR())
        search = GridSearchCV(scaled_svr, params, verbose = 1)
        search.fit(obs_set["X_train"], obs_set["y_train"])

        X_test = obs_set["X_test"]
        y_test = obs_set["y_test"]

        result = {
            "gridsearch": search,
            "model": search.best_estimator_,
            "params": search.get_params(),
            "score": search.score(X_test, y_test),
            "RMSE": np_RMSE(search.predict(X_test), y_test, key),
        }
        result['norm_RMSE'] = result['RMSE'] / (y_test.mean())
        SVR_models[key] = result
    return SVR_models

In [4]:
#KNN
from sklearn import datasets, ensemble
from sklearn.neighbors import KNeighborsRegressor
def train_KNN(formatted_dataset, params):
    """Grid-search a k-nearest-neighbours regressor for every target.

    Parameters
    ----------
    formatted_dataset : dict
        Maps a target name to a dict with ``"X_train"``, ``"y_train"``,
        ``"X_test"`` and ``"y_test"`` entries.
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    dict
        Maps each target name to a dict with ``"gridsearch"``, ``"model"``
        (best estimator), ``"params"`` (the search object's ``get_params()``),
        ``"score"``, ``"RMSE"`` and ``"norm_RMSE"``.
    """
    KNN_models = {}
    for key, obs_set in formatted_dataset.items():
        search = GridSearchCV(KNeighborsRegressor(), params, verbose = 1)
        search.fit(obs_set["X_train"], obs_set["y_train"])

        X_test = obs_set["X_test"]
        y_test = obs_set["y_test"]

        result = {
            "gridsearch": search,
            "model": search.best_estimator_,
            "params": search.get_params(),
            "score": search.score(X_test, y_test),
            "RMSE": np_RMSE(search.predict(X_test), y_test, key),
        }
        result['norm_RMSE'] = result['RMSE'] / (y_test.mean())
        KNN_models[key] = result
    return KNN_models

In [None]:
#Linear
from sklearn import datasets, linear_model

def train_Linear(formatted_dataset):
    """Fit an ordinary least-squares regression for every target.

    Parameters
    ----------
    formatted_dataset : dict
        Maps a target name to a dict with ``"X_train"``, ``"y_train"``,
        ``"X_test"`` and ``"y_test"`` entries.

    Returns
    -------
    dict
        Maps each target name to ``{"model", "score", "RMSE", "norm_RMSE"}``,
        where ``"norm_RMSE"`` is the RMSE divided by the mean of ``y_test``.
    """
    linear_models = {}
    for key, obs_set in formatted_dataset.items():
        regressor = linear_model.LinearRegression()
        regressor.fit(obs_set["X_train"], obs_set["y_train"])

        X_test = obs_set["X_test"]
        y_test = obs_set["y_test"]

        entry = {"model": regressor}
        entry["score"] = regressor.score(X_test, y_test)
        entry["RMSE"] = np_RMSE(regressor.predict(X_test), y_test, key)
        entry['norm_RMSE'] = entry['RMSE'] / (y_test.mean())
        linear_models[key] = entry
    return linear_models

In [2]:
#NN
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import r2_score

class predictor_RELU(nn.Module):
    """Three-layer feed-forward regressor with ReLU activations and dropout.

    Layout: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear(1).

    Parameters
    ----------
    in_features : int, default 35
        Number of input features.
    hidden1 : int, default 78
        Width of the first hidden layer.
    hidden2 : int, default 30
        Width of the second hidden layer.
    dropout_p : float, default 0.3
        Dropout probability used after each hidden layer.

    The defaults reproduce the previously hard-coded architecture, and the
    sub-module attribute names (``fc1``, ``fc2``, ``fc3``, ...) are unchanged,
    so existing state dicts keep loading.
    """

    def __init__(self, in_features=35, hidden1=78, hidden2=30, dropout_p=0.3):
        super().__init__()

        # Layers are created in the same order as before so that seeded
        # weight initialisation is unchanged.
        self.fc1 = nn.Linear(in_features, hidden1)
        self.act = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.act2 = nn.ReLU()
        self.dropout2 = nn.Dropout(p=dropout_p)
        # Single linear output unit, no output activation.
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, input, **kwargs):
        """Run the network on ``input``.

        Extra keyword arguments are accepted and ignored. The result has a
        trailing axis of length 1 (one regression output per row).
        """
        hidden = self.dropout(self.act(self.fc1(input)))
        hidden = self.dropout2(self.act2(self.fc2(hidden)))
        return self.fc3(hidden)

    
        


# NOTE(review): neither constant is referenced by the code visible here
# (train_FFN_RELU passes lr = 1e-4 directly) -- confirm whether they are still needed.
learning_rate = 1e-5 #A learning rate of 1e-5 doesn't even work; training is very finicky about this value
batch_size = 64

In [61]:
from skorch import NeuralNet

class ScoredNeuralNet(NeuralNet): #Subclass of wrapper from skorch with score
    """skorch ``NeuralNet`` with an R^2 ``score`` method.

    The ``score`` method lets the net be used where an estimator-level score is
    expected, e.g. ``GridSearchCV`` without an explicit ``scoring`` argument.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def score(self, X, y):
        """Return the coefficient of determination of ``self.predict(X)`` against ``y``.

        ``r2_score`` expects ``(y_true, y_pred)`` and is not symmetric in its
        arguments, so the observed values must be passed first.
        """
        return r2_score(y, self.predict(X))

def train_FFN_RELU(dataset, params): #Add parameters later (Try higher learning rate)
    """Grid-search the ``predictor_RELU`` network for every target.

    Parameters
    ----------
    dataset : dict
        Maps a target name to a dict with ``"X_train"``, ``"y_train"``,
        ``"X_test"`` and ``"y_test"`` entries (objects supporting ``.astype``).
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    dict
        Maps each target name to a dict with ``"gridsearch"``, ``"model"``
        (best estimator), ``"loss"``, ``"RMSE"`` and ``"norm_RMSE"`` (RMSE
        divided by the mean of ``y_test``). Note that ``"loss"`` holds the value
        of the model's ``score`` method on the test split, not a loss.
    """
    ffn_models = {}
    for key, obs_set in dataset.items():
        X_train = obs_set["X_train"].astype('float32')
        # The target is given an explicit trailing axis, matching the single
        # output unit at the end of predictor_RELU.
        y_train = np.expand_dims(obs_set["y_train"], axis = 1).astype('float32')
        X_test = obs_set["X_test"].astype('float32')
        y_test = obs_set["y_test"].astype('float32')

        model = ScoredNeuralNet(module = predictor_RELU, criterion = nn.MSELoss(), lr = 1e-4,  max_epochs = 60, optimizer = torch.optim.SGD, iterator_train__shuffle=True)
        crf = GridSearchCV(model, params, refit = True, cv=5, verbose=3)
        crf.fit(X_train, y_train)

        best_model = crf.best_estimator_

        # Flatten both sides before computing the error: predictions presumably
        # come back as (n, 1) while y_test is (n,), and subtracting those shapes
        # would broadcast to an (n, n) matrix and give a wrong RMSE.
        predictions = np.ravel(best_model.predict(X_test))
        observed = np.ravel(y_test)

        ffn_models[key] = {
            "gridsearch": crf,
            "model": best_model,
            "loss": best_model.score(X_test, y_test),
            "RMSE": np_RMSE(predictions, observed, key),
        }
        ffn_models[key]['norm_RMSE'] = ffn_models[key]['RMSE'] / (obs_set["y_test"].mean())
    return ffn_models

In [None]:
#XGBoost

In [1]:
from xgboost import XGBRegressor
# read data
from sklearn.datasets import load_iris

def train_XG_boost(dataset, params):
    """Grid-search an ``XGBRegressor`` for every target.

    Parameters
    ----------
    dataset : dict
        Maps a target name to a dict with ``"X_train"``, ``"y_train"``,
        ``"X_test"`` and ``"y_test"`` entries (objects supporting ``.astype``).
    params : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    dict
        Maps each target name to a dict with ``"gridsearch"`` (the fitted
        search), ``"model"`` (its best estimator), ``"score"``, ``"RMSE"`` and
        ``"norm_RMSE"`` (RMSE divided by the mean of ``y_test``).
    """
    XG_boost_models = {}
    for key, obs_set in dataset.items():
        X_test = obs_set["X_test"].astype('float32')
        y_test = obs_set["y_test"].astype('float32')

        bst = XGBRegressor()
        crf = GridSearchCV(bst, params, refit = True, cv=5, verbose=3)
        # fit model
        crf.fit(obs_set['X_train'].astype('float32'), obs_set['y_train'].astype('float32'))

        # "model" now holds the refitted best estimator, as in the other
        # train_* helpers; the search object stays available under "gridsearch".
        best_model = crf.best_estimator_
        XG_boost_models[key] = {
            "gridsearch": crf,
            "model": best_model,
            "score": crf.score(X_test, y_test),
            "RMSE": np_RMSE(crf.predict(X_test), y_test.squeeze(), key),
        }
        XG_boost_models[key]['norm_RMSE'] = XG_boost_models[key]['RMSE'] / (obs_set["y_test"].mean())
    return XG_boost_models