In [132]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [133]:
df = pd.read_pickle('../../data/ava_st1_ns4_56.pkl')
df.head


<bound method NDFrame.head of        apcp_sf1_1  apcp_sf2_1 apcp_sf3_1  apcp_sf4_1  apcp_sf5_1  dlwrf_s1_1  \
V1       0.000000         0.0          1    0.000000    0.000000  256.492673   
V2       0.000000         0.0          1    0.017273    0.139091  257.998596   
V3       0.000000         0.0          1    0.000000    0.000000  219.280002   
V4       0.004545         0.0          1    0.000000    0.000000  267.863045   
V5       0.000000         0.0          1    0.000000    0.000000  238.162747   
...           ...         ...        ...         ...         ...         ...   
V4376    0.010909         0.0          1    0.000000    0.000000  278.168651   
V4377    0.000000         0.0          1    0.000000    0.000000  251.551092   
V4378    0.000000         0.0          1    0.000000    0.000000  269.446164   
V4379    0.000000         0.0          1    0.000000    0.000000  268.862049   
V4380    0.000000         0.0          1    0.000000    0.000000  269.112621   

       dl

In [134]:
x = df.drop('energy', axis=1)
y = df['energy']


In [135]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split


In [136]:
preprocessor = ColumnTransformer(
    transformers = [
        ('STscaler', StandardScaler(), make_column_selector(dtype_include=np.float64)),
        ('MMscaler', MinMaxScaler(), make_column_selector(dtype_include=np.int64))
    ],
    remainder = 'passthrough'
)

preprocessor.fit(x)

X =  preprocessor.transform(x)



In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [138]:
from sklearn.svm import SVR

In [139]:
svr = SVR(kernel='rbf', C=10, gamma=0.2, epsilon=.01)
svr.fit(X_train, y_train)


In [140]:
y_pred = svr.predict(X_test)

In [141]:
from sklearn.dummy import DummyRegressor

In [142]:
dummy_regressor = DummyRegressor(strategy='mean')
dummy_regressor.fit(X_train, y_train)
y_pred_dummy = dummy_regressor.predict(X_test)

In [143]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error






In [144]:
mse = mean_squared_error(y_test, y_pred)
mse_dummy = mean_squared_error(y_test, y_pred_dummy)
print("The model's mse: ", mse)
print("The dummy's mse: ", mse_dummy)
print("Relative error: ", mse/mse_dummy)

The model's mse:  58772329423093.305
The dummy's mse:  58760329672891.875
Relative error:  1.000204215161287


In [145]:
r2 = r2_score(y_test, y_pred)
r2_dummy = r2_score(y_test, y_pred_dummy)
print("The model's r2: ", r2)   
print("The dummy's r2: ", r2_dummy) 

The model's r2:  -0.0002048834800116861
The dummy's r2:  -6.681822717347075e-07


In [146]:
mae = mean_absolute_error(y_test, y_pred)
mae_dummy = mean_absolute_error(y_test, y_pred_dummy)
print("The model's mae: ", mae)
print("The dummy's mae: ", mae_dummy)
print("Relative error: ", mae/mae_dummy)

The model's mae:  6445284.201039434
The dummy's mae:  6444115.528929755
Relative error:  1.0001813549282959


**Hyperparameter tuning for SVM using GridSearchCV**

In [131]:
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np

# Load data
df = pd.read_pickle('../../data/ava_st1_ns4_56.pkl')

# Establish inputs and output columns.
x = df.drop('energy', axis=1)
y = df['energy']

# Preprocess data
preprocessor = ColumnTransformer(
    transformers=[
        ('STscaler', StandardScaler(), make_column_selector(dtype_include=np.float64)),
        ('MMscaler', MinMaxScaler(), make_column_selector(dtype_include=np.int64))
    ],
    remainder='passthrough'
)

preprocessor.fit(x)
X = preprocessor.transform(x)

# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter grid for SVR
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'epsilon': [0.1, 0.2, 0.5]
}

# Create SVR model
svr = SVR(kernel='rbf')

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model for predictions
best_svr = grid_search.best_estimator_
y_pred = best_svr.predict(X_test)

# Evaluate the performance
mae = mean_absolute_error(y_test, y_pred)
print("The model's MAE: ", mae)


Best Hyperparameters: {'C': 10, 'epsilon': 0.2, 'gamma': 0.01}
The model's MAE:  6762394.166157158


**Hyperparameter tuning for SVM using RandomSearchCV**

In [148]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Split the data into training and testing sets using the X from above
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the hyperparameter search space
param_distributions = {
    'C': (0.1,1, 10, 100, 1000),  
    'gamma': (1,0.1,0.01,0.001, 0.0001),  
}
inner = KFold(n_splits=3, shuffle=True, random_state=42)

# Create SVR model
svr = SVR(kernel='rbf')

# Perform randomized search for hyperparameter tuning
randomized_search = RandomizedSearchCV(
    svr, param_distributions, n_iter=50, cv=inner, scoring='neg_mean_absolute_error', n_jobs=4, random_state=42
)

randomized_search.fit(X_train, y_train)

# Get the best hyperparameters from the randomized search
best_params = randomized_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model for predictions
best_svr = randomized_search.best_estimator_
y_pred = best_svr.predict(X_test)

# Evaluate the performance
mae = mean_absolute_error(y_test, y_pred)
print("The model's MAE: ", mae)


Best Hyperparameters: {'gamma': 0.001, 'epsilon': 0.5, 'C': 1000}
The model's MAE:  6537505.264877887


**Hyperparameter tunning with RandomizedSearchCV**

In [149]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


# This is the preprocessing pipeline: SVMs need scaling
scaler = StandardScaler()
# We use gausian / radial kernel
svc = SVC(random_state=42, kernel="rbf")

# This is the preprocessing pipeline: SVMs need scaling
pipe_scale_svc = Pipeline([
    ('scale', scaler),
    ('SVM', svc)])

15 hyper-parameter value combinations will be tried

In [150]:
from sklearn.model_selection import RandomizedSearchCV
import time

# Search space
param_grid = {'SVM__C': [0.1, 1, 10, 100, 1000, 10000, 100000],
              'SVM__gamma': [0.001, 0.01, 0.1, 1]}

budget=5 # next do 15
hpo_pipe_scale_svc = RandomizedSearchCV(pipe_scale_svc, 
                        param_grid,
                        scoring='accuracy',
                        cv=inner, 
                        random_state=42,
                        n_iter=budget,
                        n_jobs=4, verbose=1)

start_time = time.time()
hpo_pipe_scale_svc.fit(X=X_train, y=y_train)
rs_time = time.time() - start_time

NameError: name 'inner' is not defined

Fitting 3 folds for each of 15 candidates, totalling 45 fits

In [None]:
print(f"Best params: {hpo_pipe_scale_svc.best_params_}")

# Inner evaluation
inner_eval_hpo = hpo_pipe_scale_svc.best_score_

# Outer evaluation
y_pred = hpo_pipe_scale_svc.predict(X=X_test)
outer_eval_hpo = accuracy_score(y_test, y_pred)


In [None]:
import pandas as pd
evaluations = pd.DataFrame({
    'Default': [inner_eval_default, outer_eval_default, default_time],
    'HPO': [inner_eval_hpo, outer_eval_hpo, rs_time]
}, index=['Inner', 'Outer', 'Time'])

# Displaying the table
evaluations