# Support Vector Machines 
### with Hyperparameter Tuning

### Hinweis:
Anders als in den Praktika und Vorlesungen verwenden wir hier nicht `Scikit-learn`, sondern die Bibliothek `cuML` (CUDA Machine Learning). Diese ermöglicht es SVMs auf der GPU zu trainieren, was den Trainingsprozess extrem beschleunigt. An der Herangehensweise und der Art, wie wir die Hyperparameter tunen, ändert sich dadurch nichts.

In [1]:
import cudf
import numpy as np
import time
import joblib

from IPython.display import HTML
from Model_save import save_model_as_sklearn
from cuml.svm import SVC
from cuml.metrics import accuracy_score
from cuml.model_selection import train_test_split
from Transformer_cudf import ReplaceZeroWithMean
from cuml.preprocessing import StandardScaler
from sklearn.model_selection import ParameterSampler, ParameterGrid
from sklearn.model_selection import KFold, LeaveOneOut
from cuml.model_selection import StratifiedKFold


DATAPATH = '../Data/'
MODELPATH = '../Data/Models/SVM/'


data = cudf.read_csv(f"{DATAPATH}/diabetes.csv")

X = data.drop("Outcome", axis=1)
y = data["Outcome"]

train_set, temp_set, train_labels, temp_labels = train_test_split(X, y, train_size=0.60, random_state=42)
test_set, valid_set, test_labels, valid_labels = train_test_split(temp_set, temp_labels, train_size=0.50, random_state=42)

imputer = ReplaceZeroWithMean(["Glucose", "BloodPressure", "SkinThickness", "BMI"])
train_set = imputer.fit_transform(train_set)
valid_set = imputer.transform(valid_set)
test_set = imputer.transform(test_set)

scaler = StandardScaler()
train_set = scaler.fit_transform(train_set)
valid_set = scaler.transform(valid_set)
test_set = scaler.transform(test_set)

def model_data_print(parameters, score, accuracy, timeStart, timeEnd):

    html_output = """
    <h3>Best Parameters Found</h3>
    <ul>
    """

    for param, value in parameters.items():
        html_output += f"<li>{param}: {value}</li>"

    html_output += f"""
    </ul>
    <h3>Model Performance:</h3>
    <ul>
        <li>Validation Accuracy: <b>{score*100:.2f}%</b></li>
        <li>Test Accuracy: <b>{accuracy*100:.2f}%</b></li>
    </ul>
    <p>Time taken: <b>{timeStart - timeEnd:.2f} .Sec</b></p>
    """

    return HTML(html_output)


ModuleNotFoundError: No module named 'cudf'

# WITHOUT Hyperparameter Tuning

In [3]:
clf = SVC()

clf.fit(train_set, train_labels)

test_pred = clf.predict(test_set)
accuracy = accuracy_score(test_labels, test_pred)

joblib.dump(clf, MODELPATH + '01_SVM_no_hyper.pkl')
save_model_as_sklearn(filename='01_SVM_no_hyper_sklearn.pkl', parameters={})

html_output =f"""
<h3>SVM without hyperparameter tuning</h3>
Accuracy: <b>{accuracy*100:.2f}%</b>
"""
HTML(html_output)

# With (Random) Hyperparameter Tuning


In [2]:
param_distributions = [
    {
        'kernel': ['linear'],
        'C': np.logspace(np.log10(0.001), np.log10(1000), 7, endpoint=True), 
        'tol': np.logspace(np.log10(0.0000001), np.log10(0.1), 7, endpoint=True),
        'max_iter': [1000, 2000, 3000, 4000, 5000],
        'class_weight': ['balanced']
    },
    {
        'kernel': ['poly'],
        'C': np.logspace(np.log10(0.001), np.log10(1000), 7, endpoint=True), 
        'tol': np.logspace(np.log10(0.0000001), np.log10(0.1), 7, endpoint=True),
        'gamma': np.logspace(np.log10(0.001), np.log10(1000), 7, endpoint=True),
        'degree': [2, 3, 4, 5, 6], # Macht natrülich nur Sinn, wenn degree > 1 da sonst linear
        'max_iter': [1000, 2000, 3000, 4000, 5000],
        'class_weight': ['balanced']
    },
    {
        'kernel': ['rbf'],
        'C': np.logspace(np.log10(0.001), np.log10(1000), 7, endpoint=True),
        'tol': np.logspace(np.log10(0.0000001), np.log10(0.1), 7, endpoint=True),
        'gamma': np.logspace(np.log10(0.001), np.log10(1000), 7, endpoint=True),
        'max_iter': [1000, 2000, 3000, 4000, 5000],
        'class_weight': ['balanced']
    },
    {
        'kernel': ['sigmoid'],
        'C': np.logspace(np.log10(0.001), np.log10(1000), 7, endpoint=True),
        'tol': np.logspace(np.log10(0.0000001), np.log10(0.1), 7, endpoint=True),
        'gamma': np.logspace(np.log10(0.001), np.log10(1000), 7, endpoint=True),
        'coef0': np.linspace(-1, 1, 10),
        'max_iter': [1000, 2000, 3000, 4000, 5000],
        'class_weight': ['balanced']
    }
]

In [5]:
n_iter = 100

best_accuracy = 0
best_params = None

start_time = time.time()

all_sampled_params = []
for params in param_distributions:
    sampler = ParameterSampler(params, n_iter=n_iter, random_state=42)
    sampled_params = list(sampler)
    all_sampled_params.extend(sampled_params)

clf = SVC(random_state=42)

for params in all_sampled_params:
    try:
        clf.set_params(**params)
        clf.fit(train_set, train_labels)
        predictions = clf.predict(valid_set)
        accuracy = accuracy_score(valid_labels, predictions)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
    except ValueError as e:
        print(f"Skipping invalid parameter combination: {params}")
        print(f"Error: {e}")


end_time = time.time()

best_model = SVC(**best_params, random_state=42)
best_model.fit(train_set, train_labels) 
test_predictions = best_model.predict(test_set)
test_accuracy = accuracy_score(test_labels, test_predictions)

joblib.dump(best_model, MODELPATH + '02_SVM_para_sampl.pkl')
save_model_as_sklearn(filename='02_SVM_para_sampl_sklearn.pkl', parameters=best_params)

In [6]:
model_data_print(best_params, best_accuracy, test_accuracy, end_time, start_time)

# Further Hyperparameter Tuning with Parameter Grid

In [4]:
param_grid = {
        'kernel': ['poly'],
        'C': [0.8, 0.9, 1, 1.1, 1.2, 2], 
        'tol': [0.001, 0.01, 0.1],
        'gamma': [0.08, 0.09, 0.1, 0.2, 0.3],
        'degree': [4, 5, 6],
        'max_iter': np.linspace(2000, 3000, 10),
        'class_weight': ['balanced']
    }

clf = SVC(random_state=42)

best_accuracy = 0
best_params = None

start_time = time.time()

for params in ParameterGrid(param_grid):
    clf.set_params(**params)
    clf.fit(train_set, train_labels)
    predictions = clf.predict(valid_set)
    accuracy = accuracy_score(valid_labels, predictions)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params    

end_time = time.time()

best_model = SVC(**best_params, random_state=42)
best_model.fit(train_set, train_labels)
test_predictions = best_model.predict(test_set)

test_accuracy = accuracy_score(test_labels, test_predictions)

joblib.dump(best_model, MODELPATH + '03_SVM_para_grid.pkl')
save_model_as_sklearn(filename='03_SVM_para_grid_sklearn.pkl', parameters=best_params)

[W] [19:13:39.641503] SVC with the linear kernel can be much faster using the specialized solver provided by LinearSVC. Consider switching to LinearSVC if tranining takes too long.


In [5]:
model_data_print(best_params, best_accuracy, test_accuracy, end_time, start_time)

# Cross Validation methods:

1. **Holdout:** Split data into training and testing sets. Fast but high variance.

2. **k-Fold: Divide data** into k folds, train on k-1, test on 1, repeat k times. Reliable but slower.
3. **Stratified k-Fold:** Like k-fold, but keeps class ratios in each fold. Good for imbalanced data.
4. **Leave-One-Out (LOOCV):** Each sample is a test set once. Unbiased but very slow for large datasets.
5. **Leave-p-Out (LpOCV):** Leave p samples out for testing each time. Very thorough but extremely slow.
6. **Repeated Random Sub-sampling:** Randomly split data multiple times, train and test. Flexible but can have high variance.
7. **Nested Cross-Validation:** Outer loop for evaluation, inner loop for hyperparameter tuning. Reliable but very computationally expensive.

# (Random) Hyperparameter Tuning with Cross-Validation

In [3]:
data = cudf.read_csv(f"{DATAPATH}/diabetes.csv")

X = data.drop("Outcome", axis=1)
y = data["Outcome"]

train_set, test_set, train_labels, test_labels = train_test_split(X, y, train_size=0.80, random_state=42)

imputer = ReplaceZeroWithMean(["Glucose", "BloodPressure", "SkinThickness", "BMI"])
train_set = imputer.fit_transform(train_set)
test_set = imputer.transform(test_set)

scaler = StandardScaler()
train_set = scaler.fit_transform(train_set)
test_set = scaler.transform(test_set)

In [4]:
class CuMLRandomizedSearchCV:

    def __init__(self, model, param_distributions, n_iter=10, cv=None, random_state=42, verbose=False):
        """
        Initialize the RandomizedSearchCV.

        Args:
            model (object): The machine learning model to tune. Must have `fit` and `score` methods.
            param_distributions (dict): Dictionary of parameter distributions to sample from.
            n_iter (int): Number of parameter settings that are sampled.
            cv (object): Cross-validation splitting strategy.
                         Should be one of KFold, StratifiedKFold, or LeaveOneOut.
                         If None, defaults to 5-fold KFold.
            random_state (int): Random state for reproducibility.
        """
        self.model = model
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.cv = cv if cv is not None else KFold(n_splits=5, shuffle=True, random_state=random_state)
        self.random_state = random_state
        self.verbose = verbose
        self.results = []
        self.best_params = None
        self.best_score = None

    def _sample_parameters(self):
        """Sample parameter combinations from the given distributions."""
        for param_distribution in self.param_distributions:
            sampler = ParameterSampler(param_distribution, n_iter=self.n_iter, random_state=self.random_state)
            for sampled_params in sampler:
                yield sampled_params

    def _train_and_evaluate(self, X, y, sampled_params):
        """Train and evaluate a model with given parameters using the chosen CV method."""
        scores = []
        if isinstance(self.cv, LeaveOneOut):
            for i, (train_index, val_index) in enumerate(self.cv.split(X, y)):
                if self.verbose:
                    print(f" Training and evaluating sample {i+1}/{len(X)}...")
                X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y.iloc[train_index], y.iloc[val_index]

                self.model.set_params(**sampled_params)
                self.model.fit(X_train, y_train)
                score = self.model.score(X_val, y_val)
                scores.append(score)
        else:
            for fold, (train_index, val_index) in enumerate(self.cv.split(X, y)):
                if self.verbose:
                    print(f" Training and evaluating fold {fold+1}/{self.cv.get_n_splits()}...")
                X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y.iloc[train_index], y.iloc[val_index]

                self.model.set_params(**sampled_params)
                self.model.fit(X_train, y_train)
                score = self.model.score(X_val, y_val)
                scores.append(score)

        return np.mean(scores)

    def fit(self, X, y):
        """Fit the randomized search CV to the data."""
        print("Starting Randomized Search CV...")
        for i, sampled_params in enumerate(self._sample_parameters()):
            if self.verbose:
                print(f"Evaluating parameter set {i+1}/{self.n_iter * len(self.param_distributions)}: {sampled_params}")
            else:
                print(f"Evaluating parameter set {i+1}/{self.n_iter * len(self.param_distributions)}")
            avg_score = self._train_and_evaluate(X, y, sampled_params)
            print(f"  Average CV score: {avg_score:.4f}")
            self.results.append({'params': sampled_params, 'score': avg_score})

        best_result = max(self.results, key=lambda x: x['score'])
        self.best_params = best_result['params']
        self.best_score = best_result['score']
        self.X_train = X
        self.y_train = y
        print("Randomized Search CV complete.")
        print(f"Best parameters found: {self.best_params}")
        print(f"Best CV score: {self.best_score:.4f}")
        return self

    def score(self, X, y):
        """Score the model's performance."""
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

    def get_best_params(self):
        """Return the best parameters found by the randomized search."""
        return self.best_params

    def get_best_score(self):
        """Return the best score found by the randomized search."""
        return self.best_score

    def get_results(self):
        """Return the results of the randomized search."""
        return self.results

    def get_best_model(self):
        """Return the best model found by the randomized search."""
        return self.model.set_params(**self.best_params)
    

In [5]:
N_ITER = 20
N_SPLITS = 5
RND_STATE = 42

svc_rnd_cv = SVC()

fold_strategy_kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RND_STATE)
fold_strategy_strat = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RND_STATE)
fold_strategy_loocv = LeaveOneOut()

searcher_kfold = CuMLRandomizedSearchCV(
    model=svc_rnd_cv, 
    param_distributions=param_distributions, 
    n_iter=N_ITER, 
    cv=fold_strategy_kfold, 
    random_state=RND_STATE,
    )

searcher_strat = CuMLRandomizedSearchCV(
    model=svc_rnd_cv,
    param_distributions=param_distributions,
    n_iter=20,
    cv=fold_strategy_strat,
    random_state=RND_STATE,
    )

searcher_loocv = CuMLRandomizedSearchCV(
    model=svc_rnd_cv,
    param_distributions=param_distributions,
    cv=fold_strategy_loocv,
    random_state=RND_STATE,
    verbose=True
    )

# Hyperparameter Tuning with KFold

In [6]:
time_start_kfold = time.time()
searcher_kfold.fit(train_set, train_labels)
time_end_kfold = time.time()

kfold_best_params_rnd_cv = searcher_kfold.get_best_params()
kfold_best_score_rnd_cv = searcher_kfold.get_best_score()
kfold_best_model_rnd_cv = searcher_kfold.get_best_model()

kfold_best_model_rnd_cv.fit(train_set, train_labels)
kfold_test_predictions_rnd_cv = kfold_best_model_rnd_cv.predict(test_set)
kfold_test_accuracy_rnd_cv = accuracy_score(test_labels, kfold_test_predictions_rnd_cv)

joblib.dump(kfold_best_model_rnd_cv, MODELPATH + '04_SVM_kfold_rnd_cv.pkl')
save_model_as_sklearn(filename='04_SVM_kfold_rnd_cv_sklearn.pkl', parameters=kfold_best_params_rnd_cv)

Starting Randomized Search CV...
Evaluating parameter set 1/80
  Average CV score: 0.4936
Evaluating parameter set 2/80
  Average CV score: 0.4936
Evaluating parameter set 3/80
  Average CV score: 0.7329
Evaluating parameter set 4/80
  Average CV score: 0.6727
Evaluating parameter set 5/80
  Average CV score: 0.6662
Evaluating parameter set 6/80
  Average CV score: 0.6662
Evaluating parameter set 7/80
  Average CV score: 0.6662
Evaluating parameter set 8/80
  Average CV score: 0.7606
Evaluating parameter set 9/80
  Average CV score: 0.4936
Evaluating parameter set 10/80
  Average CV score: 0.7134
Evaluating parameter set 11/80
  Average CV score: 0.5538
Evaluating parameter set 12/80
  Average CV score: 0.6727
Evaluating parameter set 13/80
  Average CV score: 0.7606
Evaluating parameter set 14/80
  Average CV score: 0.7280
Evaluating parameter set 15/80
  Average CV score: 0.7134
Evaluating parameter set 16/80
  Average CV score: 0.7329
Evaluating parameter set 17/80
  Average CV scor

In [8]:
model_data_print(kfold_best_params_rnd_cv, kfold_best_score_rnd_cv, kfold_test_accuracy_rnd_cv, time_end_kfold, time_start_kfold)

# Hyperparameter Tuning with  Stratified K Fold

In [9]:
time_start_strat = time.time()
searcher_strat.fit(train_set, train_labels)
time_end_strat = time.time()

strat_best_params_rnd_cv = searcher_strat.get_best_params()
strat_best_score_rnd_cv = searcher_strat.get_best_score()
strat_best_model_rnd_cv = searcher_strat.get_best_model()

strat_best_model_rnd_cv.fit(train_set, train_labels)
strat_test_predictions_rnd_cv = strat_best_model_rnd_cv.predict(test_set)
strat_test_accuracy_rnd_cv = accuracy_score(test_labels, strat_test_predictions_rnd_cv)

joblib.dump(strat_best_model_rnd_cv, MODELPATH + '05_SVM_strat_rnd_cv.pkl')
save_model_as_sklearn(filename='05_SVM_strat_rnd_cv_sklearn.pkl', parameters=strat_best_params_rnd_cv)

Starting Randomized Search CV...
Evaluating parameter set 1/80
  Average CV score: 0.6401
Evaluating parameter set 2/80
  Average CV score: 0.6401
Evaluating parameter set 3/80
  Average CV score: 0.7393
Evaluating parameter set 4/80
  Average CV score: 0.6775
Evaluating parameter set 5/80
  Average CV score: 0.6937
Evaluating parameter set 6/80
  Average CV score: 0.6937
Evaluating parameter set 7/80
  Average CV score: 0.6937
Evaluating parameter set 8/80
  Average CV score: 0.7492
Evaluating parameter set 9/80
  Average CV score: 0.6401
Evaluating parameter set 10/80
  Average CV score: 0.7231
Evaluating parameter set 11/80
  Average CV score: 0.3665
Evaluating parameter set 12/80
  Average CV score: 0.6775
Evaluating parameter set 13/80
  Average CV score: 0.7492
Evaluating parameter set 14/80
  Average CV score: 0.7393
Evaluating parameter set 15/80
  Average CV score: 0.7231
Evaluating parameter set 16/80
  Average CV score: 0.7393
Evaluating parameter set 17/80
  Average CV scor



In [10]:
model_data_print(strat_best_params_rnd_cv, strat_best_score_rnd_cv, strat_test_accuracy_rnd_cv, time_end_strat, time_start_strat)

# Hyperparameter Tuning with Leave one out

In [11]:
time_start_loo = time.time()
searcher_loocv.fit(train_set, train_labels)
time_end_loo = time.time()

loocv_best_params_rnd_cv = searcher_loocv.get_best_params()
loocv_best_score_rnd_cv = searcher_loocv.get_best_score()
loocv_best_model_rnd_cv = searcher_loocv.get_best_model()

loocv_best_model_rnd_cv.fit(train_set, train_labels)
loocv_test_predictions_rnd_cv = loocv_best_model_rnd_cv.predict(test_set)
loocv_test_accuracy_rnd_cv = accuracy_score(test_labels, loocv_test_predictions_rnd_cv)

joblib.dump(loocv_best_model_rnd_cv, MODELPATH + '06_SVM_loocv_rnd_cv.pkl')
save_model_as_sklearn(filename='06_SVM_loocv_rnd_cv_sklearn.pkl', parameters=loocv_best_params_rnd_cv)

Starting Randomized Search CV...
Evaluating parameter set 1/40: {'tol': np.float64(0.0001), 'max_iter': 4000, 'kernel': 'linear', 'class_weight': 'balanced', 'C': np.float64(0.001)}
 Training and evaluating sample 1/614...
 Training and evaluating sample 2/614...
 Training and evaluating sample 3/614...
 Training and evaluating sample 4/614...
 Training and evaluating sample 5/614...
 Training and evaluating sample 6/614...
 Training and evaluating sample 7/614...
 Training and evaluating sample 8/614...
 Training and evaluating sample 9/614...
 Training and evaluating sample 10/614...
 Training and evaluating sample 11/614...
 Training and evaluating sample 12/614...
 Training and evaluating sample 13/614...
 Training and evaluating sample 14/614...
 Training and evaluating sample 15/614...
 Training and evaluating sample 16/614...
 Training and evaluating sample 17/614...
 Training and evaluating sample 18/614...
 Training and evaluating sample 19/614...
 Training and evaluating samp

In [12]:
model_data_print(loocv_best_params_rnd_cv, loocv_best_score_rnd_cv, loocv_test_accuracy_rnd_cv, time_end_loo, time_start_loo)

# Further Tuning with GridSearchCV

In [13]:
param_gridCV = {
        'kernel': ['linear'],
        'C': [0.8, 0.9, 1, 1.1, 1.2, 2], 
        'tol': [0.01, 0.01, 0.1],
        'max_iter': np.linspace(2500, 3500, 10),
        'class_weight': ['balanced'],
    }

In [14]:
class CuMLGridSearchCV:

    def __init__(self, model, param_grid, cv=None, random_state=42, verbose=False):
        """
        Initialize the GridSearchCV.

        Args:
        model (object): The machine learning model to tune. Must have `fit` and `score` methods.
        param_grid (dict): Dictionary with parameters names (str) as keys and lists of parameter settings to try as values.
        cv (object): Cross-validation splitting strategy.
                     Should be one of KFold, StratifiedKFold, or LeaveOneOut.
                     If None, defaults to 5-fold KFold.
        random_state (int): Random state for reproducibility.
        verbose (bool): Whether to print progress messages. Defaults to False.
        
        """
        self.model = model
        self.param_grid = param_grid
        self.cv = cv if cv is not None else KFold(n_splits=5, shuffle=True, random_state=random_state)
        self.random_state = random_state
        self.verbose = verbose
        self.results = []
        self.best_params = None
        self.best_score = None

    def _parameter_comb(self):
        """Sample parameter combinations from the given distributions."""
        for params in ParameterGrid(self.param_grid):
            yield params

    def _train_and_evaluate(self, X, y, param_comb):
        """Train and evaluate a model with given parameters using the chosen CV method."""
        scores = []
        if isinstance(self.cv, LeaveOneOut):
            for i, (train_index, val_index) in enumerate(self.cv.split(X, y)):
                if self.verbose:
                    print(f" Training and evaluating sample {i+1}/{len(X)}...")
                X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y.iloc[train_index], y.iloc[val_index]

                self.model.set_params(**param_comb)
                self.model.fit(X_train, y_train)
                score = self.model.score(X_val, y_val)
                scores.append(score)
        else:
            for fold, (train_index, val_index) in enumerate(self.cv.split(X, y)):
                if self.verbose:
                    print(f" Training and evaluating fold {fold+1}/{self.cv.get_n_splits(X, y)}...")
                X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y.iloc[train_index], y.iloc[val_index]

                self.model.set_params(**param_comb)
                self.model.fit(X_train, y_train)
                score = self.model.score(X_val, y_val)
                scores.append(score)

        return np.mean(scores)

    def fit(self, X, y):
        """Fit the parameter grid to the data."""
        print("Starting Grid Search CV...")
        for i, param_comb in enumerate(self._parameter_comb()):
            if self.verbose:
                print(f"Processing parameter combination {i+1}/{len(list(ParameterGrid(self.param_grid)))}: {param_comb}")
            else:
                print(f"Processing parameter combination {i+1}/{len(list(ParameterGrid(self.param_grid)))}...")
            avg_score = self._train_and_evaluate(X, y, param_comb)
            print(f"Average CV score: {avg_score:.4f}")
            self.results.append({'params': param_comb, 'score': avg_score})

        best_result = max(self.results, key=lambda x: x['score'])
        self.best_params = best_result['params']
        self.best_score = best_result['score']
        self.X_train = X
        self.y_train = y
        print("Randomized Search CV complete.")
        print(f"Best parameters found: {self.best_params}")
        print(f"Best CV score: {self.best_score:.4f}")
        return self

    def predict(self, train_set, train_labels, test_set):
        """Predict using the best found parameters."""
        print("Training final model with best parameters...")
        final_model = self.model.set_params(**self.best_params)
        final_model.fit(train_set, train_labels)
        print("Final model training complete.")
        return final_model.predict(test_set)

    def score(self, X, y):
        """Score the model's performance."""
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

    def get_best_params(self):
        """Return the best parameters found by the grid search."""
        return self.best_params

    def get_best_score(self):
        """Return the best score found by the grid search."""
        return self.best_score

    def get_results(self):
        """Return the results of the grid search."""
        return self.results

    def get_best_model(self):
        """Return the best model found by the grid search."""
        return self.model.set_params(**self.best_params)

In [15]:
N_ITER = 20
N_SPLITS = 5
RND_STATE = 42

svc_rnd_cv = SVC()

fold_strategy_kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RND_STATE)
fold_strategy_strat = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RND_STATE)
fold_strategy_loocv = LeaveOneOut()

searcher_kfold_grid = CuMLGridSearchCV(
    model=svc_rnd_cv, 
    param_grid=param_gridCV, 
    cv=fold_strategy_kfold, 
    random_state=RND_STATE,
    )

searcher_strat_grid = CuMLGridSearchCV(
    model=svc_rnd_cv,
    param_grid=param_gridCV,
    cv=fold_strategy_strat,
    random_state=RND_STATE,
    )

searcher_loocv_grid = CuMLGridSearchCV(
    model=svc_rnd_cv,
    param_grid=param_gridCV,
    cv=fold_strategy_loocv,
    random_state=RND_STATE,
    verbose=True
    )

In [16]:
time_start_kfold_grid = time.time()
searcher_kfold_grid.fit(train_set, train_labels)
time_end_kfold_grid = time.time()

kfold_best_params_grid_cv = searcher_kfold_grid.get_best_params()
kfold_best_score_grid_cv = searcher_kfold_grid.get_best_score()
kfold_best_model_grid_cv = searcher_kfold_grid.get_best_model()

kfold_best_model_grid_cv.fit(train_set, train_labels)
kfold_test_predictions_grid_cv = kfold_best_model_grid_cv.predict(test_set)
kfold_test_accuracy_grid_cv = accuracy_score(test_labels, kfold_test_predictions_grid_cv)

joblib.dump(kfold_best_model_grid_cv, MODELPATH + '07_SVM_kfold_grid_cv.pkl')
save_model_as_sklearn(filename='07_SVM_kfold_grid_cv_sklearn.pkl', parameters=kfold_best_params_grid_cv)

Starting Grid Search CV...
Processing parameter combination 1/180...
Average CV score: 0.7541
Processing parameter combination 2/180...
Average CV score: 0.7541
Processing parameter combination 3/180...
Average CV score: 0.7590
Processing parameter combination 4/180...
Average CV score: 0.7541
Processing parameter combination 5/180...
Average CV score: 0.7541
Processing parameter combination 6/180...
Average CV score: 0.7590
Processing parameter combination 7/180...
Average CV score: 0.7541
Processing parameter combination 8/180...
Average CV score: 0.7541
Processing parameter combination 9/180...
Average CV score: 0.7590
Processing parameter combination 10/180...
Average CV score: 0.7541
Processing parameter combination 11/180...
Average CV score: 0.7541
Processing parameter combination 12/180...
Average CV score: 0.7590
Processing parameter combination 13/180...
Average CV score: 0.7541
Processing parameter combination 14/180...
Average CV score: 0.7541
Processing parameter combinati

In [17]:
model_data_print(kfold_best_params_grid_cv, kfold_best_score_grid_cv, kfold_test_accuracy_grid_cv, time_end_kfold_grid, time_start_kfold_grid)

In [18]:
time_start_strat_grid = time.time()
searcher_strat_grid.fit(train_set, train_labels)
time_end_strat_grid = time.time()

strat_best_params_grid_cv = searcher_strat_grid.get_best_params()
strat_best_score_grid_cv = searcher_strat_grid.get_best_score()
strat_best_model_grid_cv = searcher_strat_grid.get_best_model()

strat_best_model_grid_cv.fit(train_set, train_labels)
strat_test_predictions_grid_cv = strat_best_model_grid_cv.predict(test_set)
strat_test_accuracy_grid_cv = accuracy_score(test_labels, strat_test_predictions_grid_cv)

joblib.dump(strat_best_model_grid_cv, MODELPATH + '08_SVM_strat_grid_cv.pkl')
save_model_as_sklearn(filename='08_SVM_strat_grid_cv_sklearn.pkl', parameters=strat_best_params_grid_cv)

Starting Grid Search CV...
Processing parameter combination 1/180...
Average CV score: 0.7508
Processing parameter combination 2/180...
Average CV score: 0.7508
Processing parameter combination 3/180...
Average CV score: 0.7508
Processing parameter combination 4/180...
Average CV score: 0.7508
Processing parameter combination 5/180...
Average CV score: 0.7508
Processing parameter combination 6/180...
Average CV score: 0.7508
Processing parameter combination 7/180...
Average CV score: 0.7508
Processing parameter combination 8/180...
Average CV score: 0.7508
Processing parameter combination 9/180...
Average CV score: 0.7508
Processing parameter combination 10/180...
Average CV score: 0.7508
Processing parameter combination 11/180...
Average CV score: 0.7508
Processing parameter combination 12/180...
Average CV score: 0.7508
Processing parameter combination 13/180...
Average CV score: 0.7508
Processing parameter combination 14/180...
Average CV score: 0.7508
Processing parameter combinati

In [19]:
model_data_print(strat_best_params_grid_cv, strat_best_score_grid_cv, strat_test_accuracy_grid_cv, time_end_strat_grid, time_start_strat_grid)

In [20]:
start_time_loo_grid = time.time()
searcher_loocv_grid.fit(train_set, train_labels)
end_time_loo_grid = time.time()

loocv_best_params_grid_cv = searcher_loocv_grid.get_best_params()
loocv_best_score_grid_cv = searcher_loocv_grid.get_best_score()
loocv_best_model_grid_cv = searcher_loocv_grid.get_best_model()

loocv_best_model_grid_cv.fit(train_set, train_labels)
loocv_test_predictions_grid_cv = loocv_best_model_grid_cv.predict(test_set)
loocv_test_accuracy_grid_cv = accuracy_score(test_labels, loocv_test_predictions_grid_cv)

joblib.dump(loocv_best_model_grid_cv, MODELPATH + '09_SVM_loocv_grid_cv.pkl')
save_model_as_sklearn(filename='09_SVM_loocv_grid_cv_sklearn.pkl', parameters=loocv_best_params_grid_cv)

Starting Grid Search CV...
Processing parameter combination 1/180: {'C': 0.8, 'class_weight': 'balanced', 'kernel': 'linear', 'max_iter': np.float64(2500.0), 'tol': 0.01}
 Training and evaluating sample 1/614...
 Training and evaluating sample 2/614...
 Training and evaluating sample 3/614...
 Training and evaluating sample 4/614...
 Training and evaluating sample 5/614...
 Training and evaluating sample 6/614...
 Training and evaluating sample 7/614...
 Training and evaluating sample 8/614...
 Training and evaluating sample 9/614...
 Training and evaluating sample 10/614...
 Training and evaluating sample 11/614...
 Training and evaluating sample 12/614...
 Training and evaluating sample 13/614...
 Training and evaluating sample 14/614...
 Training and evaluating sample 15/614...
 Training and evaluating sample 16/614...
 Training and evaluating sample 17/614...
 Training and evaluating sample 18/614...
 Training and evaluating sample 19/614...
 Training and evaluating sample 20/614..

In [21]:
model_data_print(loocv_best_params_grid_cv, loocv_best_score_grid_cv, loocv_test_accuracy_grid_cv, end_time_loo_grid, start_time_loo_grid)