In [82]:
import numpy as np
import torch
import os
from os import path
from sklearn.model_selection import KFold
import pandas as pd
import zipfile
import urllib.request


class UCIDatasets():
    def __init__(self,  name,  data_path="", n_splits = 10):
        self.datasets = {
            "housing": "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data",
            "concrete": "https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls",
            "energy": "http://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx",
            "power": "https://archive.ics.uci.edu/ml/machine-learning-databases/00294/CCPP.zip",
            "wine": "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
            "yacht": "http://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data"}
        self.data_path = data_path
        self.name = name
        self.n_splits = n_splits
        self._load_dataset()


    def _load_dataset(self):
        if self.name not in self.datasets:
            raise Exception("Not known dataset!")
        if not path.exists(self.data_path+"UCI"):
            os.mkdir(self.data_path+"UCI")

        url = self.datasets[self.name]
        file_name = url.split('/')[-1]
        if not path.exists(self.data_path+"UCI/" + file_name):
            urllib.request.urlretrieve(
                self.datasets[self.name], self.data_path+"UCI/" + file_name)
        data = None


        if self.name == "housing":
            data = pd.read_csv(self.data_path+'UCI/housing.data',
                               header=0, delimiter="\s+").values
            self.data = data[np.random.permutation(np.arange(len(data)))]

        elif self.name == "concrete":
            data = pd.read_excel(self.data_path+'UCI/Concrete_Data.xls',
                                 header=0).values
            self.data = data[np.random.permutation(np.arange(len(data)))]
        elif self.name == "energy":
            data = pd.read_excel(self.data_path+'UCI/ENB2012_data.xlsx',
                                 header=0).values
            self.data = data[np.random.permutation(np.arange(len(data)))]
        elif self.name == "power":
            zipfile.ZipFile(self.data_path +"UCI/CCPP.zip").extractall(self.data_path +"UCI/CCPP/")
            data = pd.read_excel(self.data_path+'UCI/CCPP/Folds5x2_pp.xlsx', header=0).values
            np.random.shuffle(data)
            self.data = data
        elif self.name == "wine":
            data = pd.read_csv(self.data_path + 'UCI/winequality-red.csv',
                               header=1, delimiter=';').values
            self.data = data[np.random.permutation(np.arange(len(data)))]

        elif self.name == "yacht":
            data = pd.read_csv(self.data_path + 'UCI/yacht_hydrodynamics.data',
                               header=1, delimiter='\s+').values
            self.data = data[np.random.permutation(np.arange(len(data)))]

        kf = KFold(n_splits=self.n_splits)
        self.in_dim = data.shape[1] - 1
        self.out_dim = 1
        self.data_splits = kf.split(data)
        self.data_splits = [(idx[0], idx[1]) for idx in self.data_splits]
    
    def get_split(self, split=None, as_tensor=False):
        if split is None: split = 0
            
        train_index, test_index = self.data_splits[split]
        x_train, y_train = self.data[train_index, :self.in_dim], self.data[train_index, self.in_dim:]
        x_test, y_test = self.data[test_index, :self.in_dim], self.data[test_index, self.in_dim:]
        x_means, x_stds = x_train.mean(axis=0), x_train.std(axis=0)
        y_means, y_stds = y_train.mean(axis=0), y_train.std(axis=0)
        x_train = (x_train - x_means)/x_stds
        y_train = (y_train - y_means)/y_stds
        x_test = (x_test - x_means)/x_stds
        y_test = (y_test - y_means)/y_stds
        
        if as_tensor:
            inps = torch.from_numpy(x_train).float()
            tgts = torch.from_numpy(y_train).float()
            return inps, tgts, _, _
            train_data = torch.utils.data.TensorDataset(inps, tgts)
            
            inps = torch.from_numpy(x_test).float()
            tgts = torch.from_numpy(y_test).float()
            test_data = torch.utils.data.TensorDataset(inps, tgts)
            return train_data, test_data
        else:
            return x_train, y_train, x_test, y_test

In [83]:
uciHousing = UCIDatasets("housing")
x_train, y_train, _, _ = uciHousing.get_split(as_tensor=True)

In [39]:
def rmse(score):
    rmse = np.sqrt(-score)
    print(f'rmse= {"{:.2f}".format(rmse)}')

In [40]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble

kf =KFold(n_splits=5, shuffle=True, random_state=42)
score = cross_val_score(linear_model.LinearRegression(), x_train, y_train, cv= kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold: {score}')

rmse(score.mean())

Scores for each fold: [-0.0752596  -0.07332082 -0.06452946 -0.07324666 -0.0734159 ]
rmse= 0.27


In [41]:
score = cross_val_score(tree.DecisionTreeRegressor(random_state= 42), x_train, y_train, cv=kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold: {score}')
rmse(score.mean())

Scores for each fold: [-0.07162991 -0.07821824 -0.06356005 -0.07771481 -0.07890834]
rmse= 0.27


In [42]:
score = cross_val_score(ensemble.RandomForestRegressor(random_state= 42), x_train, y_train.ravel(), cv= kf, scoring="neg_mean_squared_error")
print(f'Scores for each fold are: {score}')
rmse(score.mean())

Scores for each fold are: [-0.04093893 -0.04298801 -0.03442921 -0.03889087 -0.03991691]
rmse= 0.20


In [43]:
max_depth = [1,2,3,4,5,6,7,8,9,10]

for val in max_depth:
    score = cross_val_score(tree.DecisionTreeRegressor(max_depth= val, random_state= 42), x_train, y_train, cv= kf, scoring="neg_mean_squared_error")
    print(f'For max depth: {val}')
    rmse(score.mean())

For max depth: 1
rmse= 0.53
For max depth: 2
rmse= 0.38
For max depth: 3
rmse= 0.31
For max depth: 4
rmse= 0.28
For max depth: 5
rmse= 0.26
For max depth: 6
rmse= 0.25
For max depth: 7
rmse= 0.25
For max depth: 8
rmse= 0.25
For max depth: 9
rmse= 0.24
For max depth: 10
rmse= 0.25


In [46]:
estimators = [10, 50, 100, 150, 200, 250, 300]

for count in estimators:
    score = cross_val_score(ensemble.RandomForestRegressor(n_estimators= count, random_state= 42), x_train, y_train.ravel(), cv= kf, scoring="neg_mean_squared_error")
    print(f'For estimators: {count}')
    rmse(score.mean())

For estimators: 10
rmse= 0.21
For estimators: 50
rmse= 0.20
For estimators: 100
rmse= 0.20
For estimators: 150
rmse= 0.20


KeyboardInterrupt: 

In [70]:
from torch import nn

class NNRegressor(nn.Module):
    def __init__(self, layers=[20,30,20], n_features=13, activation=nn.ReLU):
        super().__init__()
        self.layers = []
        self.activation_functions = []
        
        self.layers.append(nn.Linear(n_features, layers[0]))
        self.activation_functions.append(activation())
        self.add_module(f"layer{0}", self.layers[-1])
        self.add_module(f"act{0}", self.activation_functions[-1])
        
        for i in range(1, len(layers)):
            self.layers.append(nn.Linear(self.layers[-1].out_features, layers[i]))
            self.activation_functions.append(activation())
            self.add_module(f"layer{i}", self.layers[-1])
            self.add_module(f"act{i}", self.activation_functions[-1])

        self.output = nn.Linear(self.layers[-1].out_features, 1)

    def forward(self, x):
        for layer, act in zip(self.layers, self.activation_functions):
            x=act(layer(x))

        x = self.output(x)
        return x

In [71]:
from skorch import NeuralNetRegressor

model = NeuralNetRegressor(
    module=NNRegressor,
    max_epochs=100,
    batch_size=10,
    module__layers=[10,20],
    criterion=nn.MSELoss,
)

In [76]:
from sklearn.model_selection import GridSearchCV
from torch import optim

param_grid = {
    'optimizer': [optim.Adam, optim.Adamax, optim.NAdam],
    'module__activation': [nn.Identity, nn.ReLU, nn.GELU, nn.Tanh, nn.Sigmoid],
    'batch_size': [10, 25, 50],
    'module__layers': [[10,20,10],[30,20,10],[30,30,30]],
    'lr': [0.0001, 0.001, 0.01],
}

grid = GridSearchCV(
    estimator = NeuralNetRegressor(module=NNRegressor),
    param_grid = param_grid,
    n_jobs = -1,
    cv = 3,
    error_score = 'raise',
    return_train_score = True,
    verbose = 0
)

In [77]:
grid.fit(x_train, y_train)

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m1.1213[0m        [32m1.4063[0m  0.0369
      2        [36m1.0731[0m        [32m1.3474[0m  0.0115
      3        [36m1.0266[0m        [32m1.2899[0m  0.0190
      4        [36m0.9808[0m        [32m1.2329[0m  0.0109
      5        [36m0.9352[0m        [32m1.1762[0m  0.0097
      6        [36m0.8899[0m        [32m1.1199[0m  0.0102
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m0.9138[0m        [32m1.1678[0m  0.0495
      7        [36m0.8449[0m        [32m1.0643[0m  0.0109
      2        [36m0.8875[0m        [32m1.1396[0m  0.0173
      8        [36m0.8008[0m        [32m1.0102[0m  0.0187
      3        [36m0.8625[0m        [32m1.1117[0m  0.0125
      9        [36m0.7580[0m        [32m0.9582[0m  0.0094
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------ 

In [78]:
grid.best_params_

{'batch_size': 25,
 'lr': 0.01,
 'module__activation': torch.nn.modules.activation.GELU,
 'module__layers': [30, 20, 10],
 'optimizer': torch.optim.adam.Adam}

In [79]:
grid.best_score_

0.8547388440169458

In [87]:
grid.best_estimator_.module_.eval()

NNRegressor(
  (layer0): Linear(in_features=13, out_features=30, bias=True)
  (act0): GELU(approximate='none')
  (layer1): Linear(in_features=30, out_features=20, bias=True)
  (act1): GELU(approximate='none')
  (layer2): Linear(in_features=20, out_features=10, bias=True)
  (act2): GELU(approximate='none')
  (output): Linear(in_features=10, out_features=1, bias=True)
)