# Hyperparameter optimization

https://github.com/SGpp/SGpp/blob/master/optimization/examples/optimization.py

Steps:
   1. Definition of the data
   2. Definition of the hyperparameter space
   3. Loop over all different combinations of the hyperparameter space
       1. Define the model with the hyperparameters
       2. Optimize model (learning phase)
       3. Evaluate model and store metric with the parameters
 

## 0. Imports & utils

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import torch
from sklearn import svm, metrics
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score


import timeit

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import itertools

from mpl_toolkits import mplot3d


from scipy.io import arff
import pandas as pd

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

import time, sys
from IPython.display import clear_output

import pysgpp

from bayes_opt import BayesianOptimization, UtilityFunction


def update_progress(progress, time, remaining_time):
    """Redraw a 20-character text progress bar in the cell output.

    Args:
        progress: Completed fraction. An ``int`` is converted to ``float``,
            any other non-float value is treated as 0, and the result is
            clamped to the range [0, 1].
        time: Value reported as the current time per iteration; it is
            rendered with ``str``.
        remaining_time: Value reported as the approximate remaining time;
            it is rendered with ``str``.
    """
    width = 20

    # Coerce the input to a number first, then clamp it into [0, 1].
    if isinstance(progress, int):
        progress = float(progress)
    elif not isinstance(progress, float):
        progress = 0

    if progress < 0:
        progress = 0
    elif progress >= 1:
        progress = 1

    filled = int(round(width * progress))

    # Remove the previously printed bar before emitting the new one.
    clear_output(wait=True)

    bar = "#" * filled + "-" * (width - filled)
    lines = [
        "Progress: [{0}] {1:.1f}%".format(bar, progress * 100),
        "Current time per iteration: " + str(time),
        "Approx. time remaining: " + str(remaining_time),
    ]
    print("\n".join(lines))

    
def to_standard(lower, upper, value):
    """Map ``value`` from the interval [lower, upper] onto the unit interval.

    ``lower`` maps to 0 and ``upper`` maps to 1. Values outside the interval
    are not clamped.
    """
    offset = value - lower
    span = upper - lower
    return offset / span


def from_standard(lower, upper, value):
    """Map ``value`` from the unit interval onto the interval [lower, upper].

    0 maps to ``lower`` and 1 maps to ``upper``. Values outside [0, 1] are
    not clamped.
    """
    span = upper - lower
    return value * span + lower

## 1. Definition of Data

In [2]:
SPLIT_RATIO = 0.8
# Number of shuffled samples that are kept (presumably chosen so the data
# divides evenly into the folds below -- TODO confirm).
N_SAMPLES_USED = 740
# Number of folds built for the manual cross-validation split.
N_FOLDS = 10

# loadarff returns a (records, metadata) pair; only the records are needed.
arff_records, _arff_meta = arff.loadarff('php0iVrYT.arff')
df = pd.DataFrame(arff_records)

# The class labels are stored as the byte strings b'1' / b'2'; turn them into
# ints. The result is assigned back to the column because an in-place replace
# on a column selection does not reliably modify `df` itself.
df['Class'] = df['Class'].replace([b'1', b'2'], [1, 2])

# Features are the four columns V1..V4, the target is the class label.
X = df[['V1', 'V2', 'V3', 'V4']].to_numpy().tolist()
Y = df['Class'].tolist()

print("Number of samples: ", len(X))

# Shuffle features and labels together, then keep the first N_SAMPLES_USED.
X, Y = shuffle(X, Y)
X = np.array(X[:N_SAMPLES_USED])
Y = np.array(Y[:N_SAMPLES_USED])

# 10 fold validation:
# The stop value is N_FOLDS * size_chunks so that the last fold is included;
# stopping at len(X) - size_chunks would silently drop it.
size_chunks = len(X) // N_FOLDS
fold_starts = range(0, N_FOLDS * size_chunks, size_chunks)
X_folds = [X[start:start + size_chunks] for start in fold_starts]
Y_folds = [Y[start:start + size_chunks] for start in fold_starts]

#X_train = torch.Tensor(X[:int(len(X) * SPLIT_RATIO)])
#X_test = torch.Tensor(X[int(len(X) * SPLIT_RATIO):])
#Y_train = torch.Tensor(Y[:int(len(Y) * SPLIT_RATIO)])
#Y_test = torch.Tensor(Y[int(len(Y) * SPLIT_RATIO):])

#print("Number of training samples: ", len(X_train))
#print("Number of testing samples: ", len(X_test))

Number of samples:  748


## 2. Definition of Hyperparameter space

In [3]:
# Search space: one [lower, upper] pair per hyperparameter.
# NOTE(review): "kernel" and "gamma" look like index ranges into the option
# lists given in their trailing comments -- confirm against the consumer.
hyperparameters = dict(
    C=[0, 10],
    kernel=[1, 5],  # ["set", "linear", "poly", "rbf", "sigmoid", "precomputed"]
    degree=[1, 5],
    gamma=[1, 3],  # ["set", "scale", "auto"]
    tol=[0.000001, 0.1],
)

print("Number of hyperparameters: ", len(hyperparameters))

Number of hyperparameters:  5


## 3. Loop over combinations

In [None]:
class ExampleFunction(pysgpp.ScalarFunction):
    """Objective function: negative 10-fold cross-validated SVC accuracy.

    The function is defined on the five-dimensional unit hypercube. The
    coordinates of a point ``x`` encode the hyperparameters as follows:

    * ``x[0]`` -> ``C``, rescaled to the range ``hyperparameters["C"]``
    * ``x[1]`` -> ``kernel``, chosen from ``KERNELS`` by equal-width bins
    * ``x[2]`` -> ``degree``, rescaled to ``hyperparameters["degree"]`` and
      rounded to the nearest integer
    * ``x[3]`` -> ``gamma``, chosen from ``GAMMAS`` by equal-width bins
    * ``x[4]`` -> ``tol``, rescaled to ``hyperparameters["tol"]``

    The module-level names ``X``, ``Y``, ``hyperparameters`` and
    ``from_standard`` are read when ``eval`` is called.
    """

    # eval() reads x[0] .. x[4], so the domain needs five dimensions.
    DIMENSION = 5

    # "precomputed" is left out on purpose: that kernel expects a square
    # (n_samples, n_samples) kernel matrix, not the raw feature matrix X.
    KERNELS = ("linear", "poly", "rbf", "sigmoid")
    GAMMAS = ("scale", "auto")

    # SVC requires a strictly positive C, but the declared range starts at 0.
    MIN_C = 1e-6

    def __init__(self):
        super(ExampleFunction, self).__init__(self.DIMENSION)

    @staticmethod
    def _pick(options, coordinate):
        """Return the entry of ``options`` whose equal-width bin of [0, 1]
        contains ``coordinate``; out-of-range values use the nearest bin."""
        last = len(options) - 1
        index = int(coordinate * len(options))
        return options[min(max(index, 0), last)]

    @staticmethod
    def _rescale(name, coordinate):
        """Map a unit-interval coordinate onto the declared range of ``name``."""
        lower, upper = hyperparameters[name]
        return from_standard(lower, upper, coordinate)

    def eval(self, x):
        """Evaluate the objective at the point ``x``.

        Args:
            x: Indexable point with five coordinates, each expected to lie
                in [0, 1].

        Returns:
            The negated mean accuracy of a 10-fold cross-validation, so
            that minimising this function maximises the accuracy.
        """
        C = max(self._rescale("C", x[0]), self.MIN_C)
        kernel = self._pick(self.KERNELS, x[1])
        degree = int(round(self._rescale("degree", x[2])))
        gamma = self._pick(self.GAMMAS, x[3])
        tol = self._rescale("tol", x[4])

        classifier = svm.SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, tol=tol)

        scores = cross_val_score(classifier, X, Y, cv=10)

        return float(-scores.mean())
################################## generate Grid ##################################

pysgpp.omp_set_num_threads(1)

f = ExampleFunction()
# dimension of domain
d = f.getNumberOfParameters()
# B-spline degree
p = 4
# maximal number of grid points
# NOTE(review): confirm that this budget is large enough for the initial grid
# of a d-dimensional domain; generation fails below if it is not.
N = 10
# adaptivity of grid generation
gamma = 0.9

# Names of the coordinates in the order in which ExampleFunction.eval reads them.
PARAM_NAMES = ("C", "kernel", "degree", "gamma", "tol")


def _print_point(point):
    """Print the first ``d`` coordinates of ``point`` with their hyperparameter names.

    The printed numbers are the coordinates in the unit hypercube, not the
    decoded hyperparameter values.
    """
    for name, index in zip(PARAM_NAMES, range(d)):
        print("{} (standard coordinate): {}".format(name, point[index]))


grid = pysgpp.Grid.createModBsplineGrid(d, p)
gridGen = pysgpp.OptIterativeGridGeneratorRitterNovak(f, grid, N, gamma)

if not gridGen.generate():
    print("Grid generation failed, exiting.")
    sys.exit(1)

gridStorage = grid.getStorage()

# Plot the grid points projected onto the first two coordinates.
grid_points = [gridStorage.getPoint(i) for i in range(gridStorage.getSize())]
x_values = [gp.getStandardCoordinate(0) for gp in grid_points]
y_values = [gp.getStandardCoordinate(1) for gp in grid_points]

plt.plot(x_values, y_values, 'bo')

######################################## grid functions ########################################
# Hierarchization
functionValues = gridGen.getFunctionValues()
coeffs = pysgpp.DataVector(len(functionValues))
hierSLE = pysgpp.HierarchisationSLE(grid)
sleSolver = pysgpp.AutoSLESolver()

if not sleSolver.solve(hierSLE, gridGen.getFunctionValues(), coeffs):
    print("Solving failed, exiting.")
    sys.exit(1)

# define interpolant and gradient
ft = pysgpp.InterpolantScalarFunction(grid, coeffs)
ftGradient = pysgpp.InterpolantScalarFunctionGradient(grid, coeffs)
gradientDescent = pysgpp.OptGradientDescent(ft, ftGradient)

##################### find point with minimal loss (which are already evaluated) #################

# find point with smallest value as start point for gradient descent
# (min() returns the first minimum, like a strict "<" scan would)
x0Index = min(range(len(functionValues)), key=lambda i: functionValues[i])
fX0 = functionValues[x0Index]

x0 = gridStorage.getCoordinates(gridStorage.getPoint(x0Index))
ftX0 = ft.eval(x0)

print("\nOptimal hyperparameters so far:")
_print_point(x0)

print("Resulting loss:")
print(ftX0)

################################## Optimize with gradient descent ##################################

## We apply the gradient method and print the results.
gradientDescent.setStartingPoint(x0)
gradientDescent.optimize()
xOpt = gradientDescent.getOptimalPoint()
ftXOpt = gradientDescent.getOptimalValue()

print(xOpt)
# Evaluate the real objective at the optimum found on the interpolant.
fXOpt = f.eval(xOpt)

print("\nOptimal hyperparameters after optimization:")
_print_point(xOpt)
print("Resulting loss (Optimal value from optimization):")
print(ftXOpt)
print("Resulting loss (Optimal point evaluated):")
print(fXOpt)

Adaptive grid generation (Ritter-Novak)...
