In [170]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [171]:
df = pd.read_pickle('../../data/ava_st1_ns4_56.pkl')
df.head


<bound method NDFrame.head of        apcp_sf1_1  apcp_sf2_1 apcp_sf3_1  apcp_sf4_1  apcp_sf5_1  dlwrf_s1_1  \
V1       0.000000         0.0          1    0.000000    0.000000  256.492673   
V2       0.000000         0.0          1    0.017273    0.139091  257.998596   
V3       0.000000         0.0          1    0.000000    0.000000  219.280002   
V4       0.004545         0.0          1    0.000000    0.000000  267.863045   
V5       0.000000         0.0          1    0.000000    0.000000  238.162747   
...           ...         ...        ...         ...         ...         ...   
V4376    0.010909         0.0          1    0.000000    0.000000  278.168651   
V4377    0.000000         0.0          1    0.000000    0.000000  251.551092   
V4378    0.000000         0.0          1    0.000000    0.000000  269.446164   
V4379    0.000000         0.0          1    0.000000    0.000000  268.862049   
V4380    0.000000         0.0          1    0.000000    0.000000  269.112621   

       dl

In [172]:
x = df.drop('energy', axis=1)
y = df['energy']


In [173]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split


First we do preprocessing on the data. For SVM, we need to do scaling.

In [191]:
preprocessor = ColumnTransformer(
    transformers = [
        ('STscaler', StandardScaler(), make_column_selector(dtype_include=np.float64)),
        ('MMscaler', MinMaxScaler(), make_column_selector(dtype_include=np.int64))
    ],
    remainder = 'passthrough'
)

preprocessor.fit(x)

X =  preprocessor.transform(x)
print(y)

[1313 1184 1330 ... 1176 1020  752]


**Cross Validation and Hyperparameter tuning for SVM using GridSearch, RandomSearch, and Bayesian**

Check for imbalance in the dataset and then stratify when splitting the data into train and test sets. In this case, we cant stratify because the least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2. Split the data into training and test sets. Use the training set to train the model and the test set to evaluate the model. 

In [175]:
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
y = le.fit_transform(y)
from collections import Counter

# Counting the number of instances per class (checking for imbalance)
counter = Counter(y)
print(f"Instances per class: {counter}")

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


Instances per class: Counter({1005: 13, 1585: 4, 1497: 2, 826: 2, 2394: 2, 3818: 2, 3844: 2, 119: 2, 797: 2, 3698: 2, 1612: 2, 2392: 2, 3160: 2, 3624: 2, 298: 2, 1191: 2, 1030: 2, 30: 2, 3223: 2, 3861: 2, 4059: 2, 4055: 2, 2489: 2, 3766: 2, 3700: 2, 3520: 2, 221: 2, 2317: 2, 1455: 2, 1026: 2, 1206: 2, 1165: 2, 1200: 2, 1684: 2, 1941: 2, 3188: 2, 3561: 2, 3392: 2, 3775: 2, 3268: 2, 2822: 2, 2131: 2, 2847: 2, 2111: 2, 230: 2, 1157: 2, 1604: 2, 1493: 2, 3502: 2, 3500: 2, 1437: 2, 4091: 2, 2187: 2, 3132: 2, 2763: 2, 2438: 2, 871: 2, 1138: 2, 936: 2, 436: 2, 3039: 2, 4252: 2, 3848: 2, 2506: 2, 1387: 2, 2759: 2, 2406: 2, 3475: 2, 2756: 2, 3993: 2, 1390: 2, 1985: 2, 3798: 2, 1617: 2, 446: 2, 2815: 2, 2186: 2, 1486: 2, 821: 2, 1464: 2, 2483: 2, 3849: 2, 217: 2, 1495: 2, 1320: 2, 1937: 2, 3468: 2, 4177: 2, 1264: 2, 1578: 2, 920: 2, 3089: 2, 3263: 2, 1507: 2, 1384: 2, 3307: 2, 3777: 2, 3318: 2, 3256: 2, 922: 2, 3752: 2, 1559: 2, 1313: 1, 1184: 1, 1330: 1, 1404: 1, 985: 1, 561: 1, 1528: 1, 1460: 

*GridSearchCV*

Define the inner evaluation

In [176]:
from sklearn.model_selection import KFold

inner = KFold(n_splits=3, shuffle=True, random_state=42)

SVM with default hyper parameters

In [197]:
import time
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
np.random.seed(42) # For reproducibility

svr = SVR()

# Inner evaluation
start_time = time.time()
inner_eval_default = cross_val_score(svr, X_train, y_train, cv=inner, scoring="neg_mean_squared_error").mean()
default_time = time.time() - start_time

svr.fit(X=X_train, y=y_train)
y_pred = svr.predict(X=X_test)
outer_eval_default = mean_squared_error(y_test, y_pred)

print(f"Inner Evaluation (Default): {inner_eval_default}")
print(f"Outer Evaluation (Default): {outer_eval_default}")
print(f"Time taken for inner evaluation: {default_time} seconds")

#compare to mse_dummy
from sklearn.dummy import DummyRegressor
dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)
mse_dummy = mean_squared_error(y_test, y_pred_dummy)
print(f"Outer Evaluation (Dummy): {mse_dummy}")





Inner Evaluation (Default): -1512770.6216634128
Outer Evaluation (Default): 1518603.5675881922
Time taken for inner evaluation: 6.541451692581177 seconds
Outer Evaluation (Dummy): 1518656.9525168398


SVM with hyper-parameter tuning



In [203]:
from sklearn.model_selection import GridSearchCV
import time

# Search space
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1]}

hpo_pipe_scale_svc = GridSearchCV(svr, 
                        param_grid,
                        scoring='accuracy',
                        cv=inner, 
                        n_jobs=4, verbose=1)


np.random.seed(42)
start_time=time.time()
hpo_pipe_scale_svc.fit(X=X_train, y=y_train)
gs_time = time.time() - start_time


Fitting 3 folds for each of 16 candidates, totalling 48 fits


Traceback (most recent call last):
  File "/Users/georgefang/Desktop/Machine Learning/UC3M-MLBD/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/georgefang/Desktop/Machine Learning/UC3M-MLBD/venv/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/georgefang/Desktop/Machine Learning/UC3M-MLBD/venv/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/georgefang/Desktop/Machine Learning/UC3M-MLBD/venv/lib/python3.11/site-packages/sklearn/ut

Fitting 3 folds for each of 28 candidates, totalling 84 fits

In [202]:
print(f"Best params: {hpo_pipe_scale_svc.best_params_}")

# Inner evaluation
inner_eval_hpo = hpo_pipe_scale_svc.best_score_

# Outer evaluation
y_pred = hpo_pipe_scale_svc.predict(X=X_test)
outer_eval_hpo = accuracy_score(y_test, y_pred)

Best params: {'C': 0.1, 'gamma': 0.001}


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [181]:
import pandas as pd
evaluations = pd.DataFrame({
    'Default': [inner_eval_default, outer_eval_default, default_time],
    'HPO': [inner_eval_hpo, outer_eval_hpo, gs_time]
}, index=['Inner', 'Outer', 'Time'])
evaluations.name="SVC"

# Displaying the table
evaluations

Unnamed: 0,Default,HPO
Inner,0.001712,0.002283
Outer,0.005708,0.005708
Time,83.164211,1164.360201


In addition to the inner score of the best hyper-parameters, we can display all hyper-parameter combinations evaluated, sorted by inner score. We can see that several hyper-parameter combinations obtain good results.


In [None]:
import pandas as pd

# Convert the cv_results_ dictionary to a pandas DataFrame
results_df = pd.DataFrame(hpo_pipe_scale_svc.cv_results_)

# Select only the relevant columns
results_df = results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

# Sort the DataFrame based on the mean test score in descending order
results_df = results_df.sort_values(by='mean_test_score', ascending=False)

# Print the sorted DataFrame
results_df

In [None]:
import seaborn as sns
results = pd.DataFrame(hpo_pipe_scale_svc.cv_results_)
scores = results.pivot(index='param_SVM__C', columns='param_SVM__gamma', values='mean_test_score')

# Plot heat map
plt.figure(figsize=(9, 7))
sns.heatmap(scores, annot=True, fmt=".3f", cmap="viridis")
plt.title('Grid Search Performance')
plt.xlabel('Gamma')
plt.ylabel('C')
plt.show()

*RandomizedSearchCV*

Only 15 hyper-parameter value combinations will be tried (budget=15), instead of the 7*4=28 tried by grid-search.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space
param_grid = {'SVM__C': [0.1, 1, 10, 100, 1000, 10000, 100000],
              'SVM__gamma': [0.001, 0.01, 0.1, 1]}

budget=15
hpo_pipe_scale_svc = RandomizedSearchCV(pipe_scale_svc, 
                        param_grid,
                        scoring='accuracy',
                        cv=inner, 
                        random_state=42,
                        n_iter=budget,
                        n_jobs=4, verbose=1)

start_time = time.time()
hpo_pipe_scale_svc.fit(X=X_train, y=y_train)
rs_time = time.time() - start_time

Fitting 3 folds for each of 15 candidates, totalling 45 fits


In [None]:
print(f"Best params: {hpo_pipe_scale_svc.best_params_}")

# Inner evaluation
inner_eval_hpo = hpo_pipe_scale_svc.best_score_

# Outer evaluation
y_pred = hpo_pipe_scale_svc.predict(X=X_test)
outer_eval_hpo = accuracy_score(y_test, y_pred)

Visualize

In [None]:
import pandas as pd
evaluations = pd.DataFrame({
    'Default': [inner_eval_default, outer_eval_default, default_time],
    'HPO': [inner_eval_hpo, outer_eval_hpo, rs_time]
}, index=['Inner', 'Outer', 'Time'])

# Displaying the table
evaluations

the rest of the code once i have finished HPO

In [138]:
from sklearn.svm import SVR

In [139]:
svr = SVR(kernel='rbf', C=10, gamma=0.2, epsilon=.01)
svr.fit(X_train, y_train)


In [140]:
y_pred = svr.predict(X_test)

In [141]:
from sklearn.dummy import DummyRegressor

In [142]:
dummy_regressor = DummyRegressor(strategy='mean')
dummy_regressor.fit(X_train, y_train)
y_pred_dummy = dummy_regressor.predict(X_test)

In [143]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error






In [144]:
mse = mean_squared_error(y_test, y_pred)
mse_dummy = mean_squared_error(y_test, y_pred_dummy)
print("The model's mse: ", mse)
print("The dummy's mse: ", mse_dummy)
print("Relative error: ", mse/mse_dummy)

The model's mse:  58772329423093.305
The dummy's mse:  58760329672891.875
Relative error:  1.000204215161287


In [145]:
r2 = r2_score(y_test, y_pred)
r2_dummy = r2_score(y_test, y_pred_dummy)
print("The model's r2: ", r2)   
print("The dummy's r2: ", r2_dummy) 

The model's r2:  -0.0002048834800116861
The dummy's r2:  -6.681822717347075e-07


In [146]:
mae = mean_absolute_error(y_test, y_pred)
mae_dummy = mean_absolute_error(y_test, y_pred_dummy)
print("The model's mae: ", mae)
print("The dummy's mae: ", mae_dummy)
print("Relative error: ", mae/mae_dummy)

The model's mae:  6445284.201039434
The dummy's mae:  6444115.528929755
Relative error:  1.0001813549282959
