In [35]:
import pandas as pd
import numpy as np
from math import inf
from sklearn.model_selection import train_test_split
from sklearn.svm import NuSVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from joblib import dump #more efficient than pickle on objects that carry large numpy arrays internally


In [36]:
# Dataset the predictor is trained on: "mnist_4hp", "mnist_6hp" or "mlpf".
data = "mlpf"
# When True, the fitted scalers and the model are written to disk with joblib.
save = False

In [37]:
# Per-dataset settings: CSV path, number of epochs per learning curve, and the
# positional column indices of the hyperparameters (both bounds inclusive) and
# of the first learning-curve column.
dataset_settings = {
    "mnist_4hp": {
        "df_path": "../data/mnist/all_4hp_rusty.csv",
        "num_epochs": 12,
        "min_hp_idx": 0,
        "max_hp_idx": 3,
        "min_curve_idx": 16,
    },
    "mnist_6hp": {
        "df_path": "../data/mnist/all_6hp_rusty.csv",
        "num_epochs": 12,
        "min_hp_idx": 0,
        "max_hp_idx": 5,
        "min_curve_idx": 18,
    },
    "mlpf": {
        "df_path": "../data/mlpf/delphes_trainings_processed.csv",
        "num_epochs": 100,
        "min_hp_idx": 0,
        "max_hp_idx": 6,
        "min_curve_idx": 7,
    },
}

# Fail fast on an unknown name: otherwise the settings below would stay
# undefined (or keep stale values from an earlier run of this cell).
if data not in dataset_settings:
    raise ValueError(
        "unknown dataset " + repr(data) + "; expected one of " + str(sorted(dataset_settings))
    )

_selected = dataset_settings[data]
df_path = _selected["df_path"]
num_epochs = _selected["num_epochs"]
min_hp_idx = _selected["min_hp_idx"]
max_hp_idx = _selected["max_hp_idx"]
min_curve_idx = _selected["min_curve_idx"]

In [38]:
# Read the selected dataset and keep at most its first 300 rows.
df = pd.read_csv(df_path).iloc[:300]
df.head()

Unnamed: 0,bin_size,dropout,lr,n_glayers_id,n_glayers_reg,output_dim,weight_decay,loss_0,loss_1,loss_2,...,loss_90,loss_91,loss_92,loss_93,loss_94,loss_95,loss_96,loss_97,loss_98,loss_99
0,128.0,0.46359,0.022231,3.0,1.0,256.0,2e-06,594.407349,544.932495,531.519531,...,525.441223,525.39093,525.250122,525.574036,525.778381,525.402771,525.664795,526.271851,525.879456,525.977356
1,16.0,0.49869,0.000366,0.0,4.0,16.0,7.4e-05,520.121765,469.917206,465.910004,...,455.923645,455.911194,455.950531,455.868439,455.894348,455.826233,455.88681,455.853271,455.890015,455.868774
2,8.0,0.480831,0.001082,3.0,3.0,64.0,0.000531,485.182037,459.956238,458.082489,...,455.620636,455.599152,455.42511,455.615448,455.573212,455.722961,455.509552,455.509644,455.581238,455.570374
3,8.0,0.389282,0.000419,3.0,3.0,64.0,0.004692,489.15155,471.651367,471.531952,...,475.541931,475.54895,475.603241,475.558105,475.589294,475.645935,475.803772,475.841248,475.729279,475.742157
4,64.0,0.211433,0.000245,3.0,2.0,32.0,0.000549,491.797546,462.088318,459.432373,...,457.422913,457.452606,457.498627,457.480835,457.496887,457.536346,457.623077,457.746948,457.771637,457.707794


In [39]:
# Fraction of each learning curve that is used as model input.
known_curve = 0.25
n_known_epochs = int(num_epochs * known_curve)
target_col = min_curve_idx + num_epochs - 2

# Hyperparameter columns (both index bounds inclusive).
hps = df.iloc[:, min_hp_idx:max_hp_idx + 1].to_numpy()

# The first `n_known_epochs` curve columns are the observed part of the curve.
curve = df.iloc[:, min_curve_idx:min_curve_idx + n_known_epochs].to_numpy()
# NOTE(review): the `- 2` offset selects the second-to-last curve column, not
# the last one (`min_curve_idx + num_epochs - 1`) — confirm this is intended.
target = df.iloc[:, target_col].to_numpy()

In [40]:
# Calculate finite differences of 1st and 2nd order.
def finite_difs(curve):
    """Return first- and second-order finite differences along each row.

    Parameters
    ----------
    curve : array-like of shape (n_curves, n_points)
        One curve per row.

    Returns
    -------
    difs1 : ndarray of shape (n_curves, n_points - 1)
        ``curve[:, j] - curve[:, j - 1]`` for every row.
    difs2 : ndarray of shape (n_curves, max(n_points - 2, 0))
        Differences between consecutive entries of ``difs1``.
    """
    curve = np.asarray(curve)
    # np.diff is vectorised and keeps the result 2-D even when there are no rows.
    difs1 = np.diff(curve, n=1, axis=1)
    difs2 = np.diff(difs1, n=1, axis=1)
    return difs1, difs2

In [41]:
# Feature matrix: hyperparameters, observed curve points and their first and
# second differences, joined column-wise in that order.
difs1, difs2 = finite_difs(curve)
X = np.concatenate((hps, curve, difs1, difs2), axis=1)
y = target

In [42]:
# Min-max scale every feature column and the target.
# NOTE(review): both scalers are fitted on all rows before the train/test
# split, so test-set min/max values leak into training — consider fitting the
# scalers on the training rows only.
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X)

# The scaler expects a 2-D input, hence the reshape to a single column.
y_scaler = MinMaxScaler()
y = y_scaler.fit_transform(y.reshape(-1, 1))

In [43]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
print(f"{X_train.shape} {X_test.shape}")

(176, 79) (44, 79)


In [44]:
# Instantiate the predictor with its default settings and train it.
model = NuSVR()
model.fit(X_train, np.ravel(y_train))

NuSVR()

In [45]:
# Persist the fitted scalers and the model, one joblib file per object.
if save:
    for prefix, obj in (
        ("x_scaler_", x_scaler),
        ("y_scaler_", y_scaler),
        ("model_", model),
    ):
        dump(obj, prefix + data + ".joblib")

In [46]:
# Mean squared error on the held-out rows (in scaled target units).
y_pred = model.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred)

0.005696218439680634

In [47]:
# Coefficient of determination (R^2) on the held-out rows.
model.score(X_test, y_test)

0.9537738736184261

In [48]:
# 5-fold cross-validated R^2 on the training rows: per-fold scores, mean, std.
cvs = cross_val_score(model, X_train, y_train.ravel(), cv=5, scoring="r2")
cvs, np.mean(cvs), np.std(cvs)

(array([0.89496633, 0.96018429, 0.86640343, 0.96259027, 0.92500983]),
 0.9218308301868389,
 0.03724600379137587)

In [30]:
# Random search over the regressor's hyperparameters (C and nu), scored by the
# mean 5-fold cross-validated R^2 on the training rows.
n_trials = 1000
rng = np.random.default_rng(0)  # fixed seed so the search is repeatable

best, C_best, Nu_best, gamma_best = -inf, -inf, -inf, -inf
for _ in range(n_trials):
    # C is drawn log-uniformly between 1e-3 and 10.
    C = np.exp(rng.uniform(np.log(1e-3), np.log(10.0)))
    # uniform() samples [0, 1); flipping it gives (0, 1], which excludes the
    # invalid value nu == 0.
    Nu = 1.0 - rng.uniform(0.0, 1.0)
    gamma = "scale"
    # Use dedicated names so the already fitted `model` and the earlier `cvs`
    # are not clobbered by loop temporaries.
    candidate = NuSVR(C=C, nu=Nu, gamma=gamma)
    score = cross_val_score(candidate, X_train, y_train.ravel(), cv=5, scoring="r2").mean()
    if best < score:
        best = score
        C_best, Nu_best, gamma_best = C, Nu, gamma
best, C_best, Nu_best, gamma_best

(0.9303249289406447, 3.5939002700599953, 0.5323901462674117, 'scale')

In [31]:
# Refit with the best hyperparameters found by the search, then report the
# mean 5-fold cross-validated R^2 on the training rows.
model = NuSVR(C=C_best, nu=Nu_best, gamma=gamma_best)
model.fit(X_train, y_train.ravel())

fold_scores = cross_val_score(model, X_train, y_train.ravel(), cv=5, scoring="r2")
cvs = fold_scores.mean()
cvs

0.9303249289406447

In [32]:
# Held-out MSE and R^2 of the tuned model (it was fitted in the previous cell).
y_pred = model.predict(X_test)
mean_squared_error(y_test, y_pred), model.score(X_test, y_test)

(0.005496764551466672, 0.9553924879221978)