### Import libraries

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, cross_validate
from imblearn.over_sampling import SMOTE

from sklearn.metrics import accuracy_score
                                                                                                                                                                                          
from bayes_opt.event import Events
from Utils import newBayesianOptimization, newJSONLogger

In [2]:
# modelos
from sklearn.svm import SVC as SVC

### Import data

In [3]:
data_path = Path().resolve().parent / "magic04.data"
data = pd.read_csv(data_path, header=None)
data.columns=['fLength','fWidth','fSize','fConc','fConc1','fAsym','fM3Long','fM3Trans','fAlpha','fDist','class']

### Preparação dos dados

In [4]:
data = (
    data
    .assign(
        **{"class": lambda x: x['class'].map({'g': 1, 'h': 0}).astype('int8')}
    )
    [['fWidth','fConc1','fAsym','fM3Long','fM3Trans','fAlpha','fDist', 'class']]
)

In [5]:
# train test split
X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop(columns=['class']),
    data[['class']],
    test_size=0.3,
    random_state=0,
    stratify=data['class']
)

In [6]:
# transform power
transformer = PowerTransformer()
transformer.set_output(transform="pandas")
col_to_tran = ['fWidth', 'fConc1', 'fM3Long']
X_train[col_to_tran] = transformer.fit_transform(X_train[col_to_tran])
X_test[col_to_tran] = transformer.transform(X_test[col_to_tran])

In [7]:
# scaler
scaler = StandardScaler()
scaler.set_output(transform="pandas")

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# oversampling
sm = SMOTE(random_state=42)
X_train, Y_train = sm.fit_resample(X_train, Y_train)

In [9]:
Y_train_ravel = Y_train.values.ravel()
Y_test_ravel = Y_test.values.ravel()

### Bayesian Optimization

In [15]:
# parameters
hyperparameters = {
    'kernel' : "rbf",
    'C' : 35.497055,
    'gamma' : 0.148493,
}

model = SVC(**hyperparameters)

cv = (
    train_test_split(np.arange(Y_train.shape[0]) , test_size=0.3, random_state=i, stratify=Y_train)
    for i in [0, 50503, 8254, 12345, 316, 8902, 7822, 8228, 9574, 336]  
)

# cross validation
cv = cross_validate(
    model,
    X_train,
    Y_train_ravel,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)
ac_cv = cv['test_score'].mean()

# fit and predict test
model.fit(X_train, Y_train_ravel)
Y_pred = model.predict(X_test)
ac_test = accuracy_score(Y_test_ravel, Y_pred)

In [17]:
print(f"cross_val accuracy: {ac_cv:.3f}")
print(f"test acuracy: {ac_test:.3f}")

cross_val accuracy: 0.812
test acuracy: 0.818


In [10]:
def opt_bas(C, gamma):
    
    # parameters
    hyperparameters = {
        'kernel' : "rbf",
        'C' : C,
        'gamma' : gamma,
    }

    model = SVC(**hyperparameters)

    cv = (
        train_test_split(np.arange(Y_train.shape[0]) , test_size=0.3, random_state=i, stratify=Y_train)
        for i in [0, 50503, 8254, 12345, 316, 8902, 7822, 8228, 9574, 336]  
    )

    # cross validation
    cv = cross_validate(
        model,
        X_train,
        Y_train_ravel,
        cv=cv,
        scoring='accuracy',
        n_jobs=-1
    )
    ac_cv = cv['test_score'].mean()
    
    # fit and predict test
    model.fit(X_train, Y_train_ravel)
    Y_pred = model.predict(X_test)
    ac_test = accuracy_score(Y_test_ravel, Y_pred)

    return 2*ac_test + ac_cv

### Run optimization

In [11]:
# Bounded region of parameter space
pbounds = {'C': (0.01, 500), 'gamma': (0.001, 5)}

# Bayes optimizer instantiation
optimizer = newBayesianOptimization(
    f=opt_bas, 
    pbounds=pbounds, 
    random_state=1, 
    verbose=2, 
)

# keep data
log_path = Path().resolve() / "Logs" / "svc_resampling_2.jsonl"
logger = newJSONLogger(path = str(log_path))
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

In [12]:
# optimizer.res

In [13]:
# get all the current parameters of the model
optimizer.load_previous(log_path)

In [14]:
optimizer.maximize(init_points=6, n_iter=500)

NotUniqueError: Data point [208.51683213   3.60190214] is not unique. You can set "allow_duplicate_points=True" to avoid this error