In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

# custom classes for cross validation and scoring
import cross_validation
import performance_metrics

from pickle import load
from sklearn import svm, metrics
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.decomposition import PCA
from skopt import BayesSearchCV
import keras_tuner as kt

In [3]:
data = pd.read_csv(r"../data/processed_data/complete_data.csv", index_col=0)


In [4]:
x = data.drop(["TARGET"], axis=1)
y = data.filter(["TARGET"], axis=1)

scaler = load(open(r"scaler/scaler.pkl", "rb"))
x_scaled = scaler.transform(x)
y = y.values.reshape(y.shape[0])

x_scaled.shape, y.shape


((565372, 202), (565372,))

In [5]:
# for the SVM there is too much data
pca = PCA()
x_reduced = pca.fit(x_scaled)

explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(explained_variance_ratio >= 0.8) + 1

# reduced component that retains 80% of the data
x_reduced = PCA(n_components=n_components).fit_transform(x_scaled)

In [6]:
split = 0.5

hp_x, hp_y = x_reduced[:int(len(x_reduced) * split)], y[:int(len(y) * split)] # hyperparameter set
kf_x, kf_y = x_reduced[int(len(x_reduced) * split):], y[int(len(y) * split):] # kfold set

hp_x.shape, hp_y.shape, kf_x.shape, kf_y.shape

((282686, 133), (282686,), (282686, 133), (282686,))

In [7]:
def model_builder(hp):
    model = svm.SVC(
        C=hp.Choice("c", [0.1, 1.0, 10.0]),
        kernel=hp.Choice("kernel", ['linear', 'rbf']),
        gamma=hp.Choice("gamma", [0.1, 1.0, 10.0]),
        #degree=hp.Int("degree", 0, 3, step=1)
    )
    
    return model

In [12]:
project_name = "svm_v2"

tuner = kt.tuners.SklearnTuner(
    oracle=kt.oracles.BayesianOptimizationOracle(
        objective=kt.Objective("score", "max"),
        max_trials=10
    ),
    hypermodel=model_builder,
    scoring=metrics.make_scorer(metrics.accuracy_score),
    cv=KFold(5, shuffle=True, random_state=7430),
    directory=f"../logs/svm/{project_name}",
    project_name=project_name
)

In [None]:
tuner.search(hp_x[:10000], hp_y[:10000])


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
10                |10                |c
rbf               |rbf               |kernel
1                 |1                 |gamma



In [None]:
# implement KT for tuning

param_space = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.1, 1, 10],
    'degree': [2, 3, 4]
}

model = svm.SVC()

search = GridSearchCV(model, param_space, cv=2, n_jobs=-1, verbose=2)
search.fit(hp_x, hp_y)

Fitting 2 folds for each of 54 candidates, totalling 108 fits


In [None]:
# save the model architecture 
best_params = search.best_params_
with open(f"svm/hyperparameters-{project_name}.json", "w") as json_file:
    json.dump(best_params, json_file)
    
best_params

In [None]:
# load the best hyperparameters
with open(f"svm/hyperparameters-{project_name}.json", "r") as json_file:
    load_hyperparameters = json.load(json_file)
    
loaded_model = svm.SVC(**load_hyperparameters)

kfold = KFold(n_splits=5, shuffle=True, random_state=663)

# run cross validation for the model
cv = cross_validation.cross_val(kf_x, kf_y)
results, confusion_matrix = cv.run_validation(kfold=kfold, model=loaded_model)

In [None]:
# save to csv for easy viewing
results.to_csv(f"../results/svm/scores-{project_name)}.csv")

results.round(4) * 100

In [None]:
# save confusion matrix 
cm_df = pd.DataFrame(confusion_matrix)
cm_df.to_csv(f"../results/svm/cmdata-{project_name}.csv")

In [None]:
fig = sns.heatmap(confusion_matrix, annot=True, cmap='Blues', fmt='d')

# Set labels, title, and axis ticks
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Support Vector Machine Confusion Matrix')

tick_labels = ['Default', 'Non-default']
tick_positions = [0, 1]
plt.xticks(ticks=[pos + 0.5 for pos in tick_positions], labels=tick_labels)
plt.yticks(ticks=[pos + 0.5 for pos in tick_positions], labels=tick_labels)

# Show the plot
plt.show()

plt.tight_layout()

graph = fig.get_figure()
graph.savefig(f"../results/svm/cm-{project_name}.png")