## Imports

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.model_selection import train_test_split
import itertools

## Utils

In [15]:
def generate_hyperparameter_combinations(param_ranges):
    """
    Generate all combinations of hyperparameters based on specified ranges and steps.

    Parameters
    ---------
    - param_ranges: Dictionary with hyperparameter names as keys.
                         Each value is a tuple (start, stop, step).

    Returns
    ---------
    - List of dictionaries with all possible combinations.
    """
    # Create a dictionary where each key is a hyperparameter name and each value is an array of possible values
    param_values = {
        key: np.arange(start, stop + step, step)  # Generate values from start to stop with the given step
        for key, (start, stop, step) in param_ranges.items()
    }
    
    # Generate all possible combinations of hyperparameter values
    param_combinations = list(itertools.product(*param_values.values()))
    
    # Convert each combination from a tuple to a dictionary
    return [
        dict(zip(param_values.keys(), combination))  # Create a dictionary for each combination
        for combination in param_combinations
    ]

## Data load

In [16]:
# One-hot encoding
encoder = OneHotEncoder(categories='auto', sparse_output=False)

# Load the training and test files for each dataset from the specified path
monk1_train = pd.read_csv('../Datasets/Monks/monks-1.train', sep='\s+', header=None)
monk1_test = pd.read_csv('../Datasets/Monks/monks-1.test', sep='\s+', header=None)

monk2_train = pd.read_csv('../Datasets/Monks/monks-2.train', sep='\s+', header=None)
monk2_test = pd.read_csv('../Datasets/Monks/monks-2.test', sep='\s+', header=None)

monk3_train = pd.read_csv('../Datasets/Monks/monks-3.train', sep='\s+', header=None)
monk3_test = pd.read_csv('../Datasets/Monks/monks-3.test', sep='\s+', header=None)

# List to store the transformed datasets
monks_train = []
monks_test = []

# Dataset monk1
X1_train = monk1_train.iloc[:, 1:7].values  # Features
y1_train = monk1_train.iloc[:, 0].values    # Labels

X1_test = monk1_test.iloc[:, 1:7].values
y1_test = monk1_test.iloc[:, 0].values

# Apply the encoder to monk1
X1_train_encoded = encoder.fit_transform(X1_train)  # Fit and transform on training data
X1_test_encoded = encoder.transform(X1_test)        # Only transform on test data

# Append the encoded data and labels to the list
monks_train.append((X1_train_encoded, y1_train))
monks_test.append((X1_test_encoded, y1_test))

# Dataset monk2
X2_train = monk2_train.iloc[:, 1:7].values  # Features
y2_train = monk2_train.iloc[:, 0].values    # Labels

X2_test = monk2_test.iloc[:, 1:7].values
y2_test = monk2_test.iloc[:, 0].values

# Apply the encoder to monk2
X2_train_encoded = encoder.fit_transform(X2_train)  # Fit and transform on training data
X2_test_encoded = encoder.transform(X2_test)        # Only transform on test data

# Append the encoded data and labels to the list
monks_train.append((X2_train_encoded, y2_train))
monks_test.append((X2_test_encoded, y2_test))

# Dataset monk3
X3_train = monk3_train.iloc[:, 1:7].values  # Features
y3_train = monk3_train.iloc[:, 0].values    # Labels

X3_test = monk3_test.iloc[:, 1:7].values
y3_test = monk3_test.iloc[:, 0].values

# Apply the encoder to monk3
X3_train_encoded = encoder.fit_transform(X3_train)  # Fit and transform on training data
X3_test_encoded = encoder.transform(X3_test)        # Only transform on test data

# Append the encoded data and labels to the list
monks_train.append((X3_train_encoded, y3_train))
monks_test.append((X3_test_encoded, y3_test))

## Model creation

In [17]:
def create_SVM(C=100, type='rbf'):
    '''
    Create an SVM model with the specified kernel type and regularization parameter C.
    
    Parameters:
    -----------
    C : float, default=100
        Regularization parameter. The strength of the regularization is inversely proportional to C.
        Must be strictly positive.
        
    type : str, default='rbf'
        Specifies the kernel type to be used in the algorithm. It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable.
    
    Returns:
    --------
    model : SVC object
        The SVM model with the specified kernel type and regularization parameter.
    '''
    
    # Create and return the SVM model with the specified kernel type and regularization parameter
    return SVC(kernel=type, C=C, random_state=42)

## K-fold cross validation

In [18]:
def k_fold_cross_validation(data, labels, type='rbf', params=None):
    '''
    Perform k-fold cross-validation for SVM.
    
    Parameters:
    -----------
    data : array-like
        Feature data.
    labels : array-like
        Target labels.
    type : str, default='rbf'
        Kernel type for SVM.
    params : dict, optional
        Dictionary of hyperparameters.
    
    Returns:
    --------
    avg_score : float
        Average accuracy score across all folds.
    model : SVC object
        Trained SVM model on the entire dataset.
    '''
    
    # 3. Configure k-fold cross-validation
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # 4. Cross-validation loop
    fold_no = 1
    accuracy_per_fold = []  # List to store accuracy for each fold
    for train_index, val_index in kfold.split(data, labels):
        
        # Split the dataset into training and validation sets
        X_train, X_val = data[train_index], data[val_index]
        y_train, y_val = labels[train_index], labels[val_index]

        # Create the SVM model with specified hyperparameters
        model = create_SVM(C=params['C'], type=type)

        # Train the model on the training set
        model.fit(X_train, y_train)

        # Predict on the validation set
        pred = model.predict(X_val)
        
        # Get the accuracy score for the current fold
        score = accuracy_score(pred, y_val)
        accuracy_per_fold.append(score)  # Append the score to the list
        fold_no += 1

    # Calculate the average accuracy score across all folds
    avg_score = np.mean(accuracy_per_fold)

    # Split the dataset for final training (80% training, 20% validation)
    _, X_val, _, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)

    # Create the SVM model again with the same hyperparameters
    model = create_SVM(C=params['C'], type=type)

    # Train the model on the entire dataset
    model.fit(data, labels)

    return avg_score, model

## Greed search

In [19]:
def greed_search(data, labels, type='rbf', param_grid=None):
    '''
    Perform greedy search for hyperparameter tuning.
    
    Parameters:
    -----------
    data : array-like
        Feature data.
    labels : array-like
        Target labels.
    type : str, default='rbf'
        Kernel type for SVM.
    param_grid : list of dict, optional
        List of hyperparameter combinations.
    
    Returns:
    --------
    best_scores : list of float
        Best scores obtained during the search.
    best_params_list : list of dict
        Best parameter configurations.
    best_models : list of SVC object
        Best models trained with the best parameter configurations.
    '''
    
    best_scores = []  # List to store the scores
    best_params_list = []  # List to store the parameter configurations
    best_models = []  # List to store the models
    
    # Iterate over each combination of hyperparameters in the parameter grid
    for params in param_grid:
        # Perform k-fold cross-validation with the current set of hyperparameters
        score, model = k_fold_cross_validation(data, labels, type, params=params)

        # Add the results to the respective lists
        best_scores.append(score)
        best_params_list.append(params)
        best_models.append(model)

        # Sort the list of scores in descending order and keep only the top 5
        sorted_indices = np.argsort(best_scores)[::-1]  # Get the indices that would sort the scores in descending order
        best_scores = [best_scores[i] for i in sorted_indices][:5]  # Keep the top 5 scores
        best_params_list = [best_params_list[i] for i in sorted_indices][:5]  # Keep the top 5 parameter configurations
        best_models = [best_models[i] for i in sorted_indices][:5]  # Keep the top 5 models

    return best_scores, best_params_list, best_models

## Model selection

In [20]:
def selection(data, labels):
    '''
    Perform hyperparameter selection for SVM models.
    
    Parameters:
    -----------
    data : array-like
        Feature data.
    labels : array-like
        Target labels.
    
    Returns:
    --------
    best_scores : list of float
        Best scores obtained during the search.
    best_params_list : list of dict
        Best parameter configurations.
    best_models : list of SVC object
        Best models trained with the best parameter configurations.
    '''

    # Define the range of hyperparameters
    param_ranges = {
        "C": (1, 1000, 10),  # From 1 to 1000 with step of 10
    }

    print("Generating hyperparameter combinations...")
    # Generate all combinations of hyperparameters based on the specified ranges
    param_grid = generate_hyperparameter_combinations(param_ranges)

    best_scores = []  # List to store the scores
    best_params_list = []  # List to store the parameter configurations
    best_models = []  # List to store the models
    best_types = []  # List to store the kernel types

    # Define the kernel types to be tested
    types = ['linear', 'poly', 'rbf', 'sigmoid']
    for type in types:
        # Perform greedy search for each kernel type
        actual_scores, actual_params_list, actual_models = greed_search(data, labels, type, param_grid)

        # Extend the lists with the results from the current kernel type
        best_scores.extend(actual_scores)
        best_params_list.extend(actual_params_list)
        best_models.extend(actual_models)
        best_types.extend([type] * len(actual_scores))

        # Sort the scores in descending order and keep only the top 5
        sorted_indices = np.argsort(best_scores)[::-1]
        best_scores = [best_scores[i] for i in sorted_indices][:5]
        best_params_list = [best_params_list[i] for i in sorted_indices][:5]
        best_models = [best_models[i] for i in sorted_indices][:5]
        best_types = [best_types[i] for i in sorted_indices][:5]

    # Print the best scores, kernel types, and parameter configurations
    for i in range(len(best_scores)):
        print(f"Score: {best_scores[i]}, type: {best_types[i]}, parameters: {best_params_list[i]}")

    return best_scores, best_params_list, best_models


In [21]:
print("-----------------MONK1-----------------")
# Perform hyperparameter selection for the MONK1 dataset and get the best models
_, _, best_models_monk_1 = selection(monks_train[0][0], monks_train[0][1])

print("-----------------MONK2-----------------")
# Perform hyperparameter selection for the MONK2 dataset and get the best models
_, _, best_models_monk_2 = selection(monks_train[1][0], monks_train[1][1])

print("-----------------MONK3-----------------")
# Perform hyperparameter selection for the MONK3 dataset and get the best models
_, _, best_models_monk_3 = selection(monks_train[2][0], monks_train[2][1])

-----------------MONK1-----------------
Generating hyperparameter combinations...
Score: 0.9839743589743589, type: poly, parameters: {'C': 31}
Score: 0.9839743589743589, type: poly, parameters: {'C': 11}
Score: 0.9839743589743589, type: poly, parameters: {'C': 1}
Score: 0.9839743589743589, type: poly, parameters: {'C': 21}
Score: 0.9839743589743589, type: poly, parameters: {'C': 1001}
-----------------MONK2-----------------
Generating hyperparameter combinations...
Score: 0.6753676470588237, type: poly, parameters: {'C': 31}
Score: 0.6753676470588237, type: poly, parameters: {'C': 11}
Score: 0.6753676470588237, type: poly, parameters: {'C': 21}
Score: 0.6753676470588237, type: poly, parameters: {'C': 41}
Score: 0.6753676470588237, type: poly, parameters: {'C': 1001}
-----------------MONK3-----------------
Generating hyperparameter combinations...
Score: 0.9333333333333332, type: linear, parameters: {'C': 21}
Score: 0.9333333333333332, type: linear, parameters: {'C': 1}
Score: 0.9333333

## Model assessment

In [22]:
# Model evaluation
# Use the best model for MONK1 to predict the labels of the test set
y1_pred = best_models_monk_1[0].predict(X1_test_encoded)

# Report the results
# Print the accuracy of the model on the test set
print("Accuracy:", accuracy_score(y1_test, y1_pred))

# Print the detailed classification report, which includes precision, recall, f1-score, and support for each class
print("\nClassification Report:\n", classification_report(y1_test, y1_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       216
           1       1.00      1.00      1.00       216

    accuracy                           1.00       432
   macro avg       1.00      1.00      1.00       432
weighted avg       1.00      1.00      1.00       432



In [23]:
# Model evaluation
# Use the best model for MONK2 to predict the labels of the test set
y2_pred = best_models_monk_2[0].predict(X2_test_encoded)

# Report the results
# Print the accuracy of the model on the test set
print("Accuracy:", accuracy_score(y2_test, y2_pred))

# Print the detailed classification report, which includes precision, recall, f1-score, and support for each class
print("\nClassification Report:\n", classification_report(y2_test, y2_pred))

Accuracy: 0.7731481481481481

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.77      0.82       290
           1       0.62      0.77      0.69       142

    accuracy                           0.77       432
   macro avg       0.75      0.77      0.76       432
weighted avg       0.79      0.77      0.78       432



In [24]:
# Predict on the test set for MONK3 using the best model obtained from hyperparameter selection
y3_pred = best_models_monk_3[0].predict(X3_test_encoded)

# Report the results
# Print the accuracy of the model on the test set
print("Accuracy:", accuracy_score(y3_test, y3_pred))

# Print the detailed classification report, which includes precision, recall, f1-score, and support for each class
print("\nClassification Report:\n", classification_report(y3_test, y3_pred))

Accuracy: 0.9722222222222222

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       204
           1       1.00      0.95      0.97       228

    accuracy                           0.97       432
   macro avg       0.97      0.97      0.97       432
weighted avg       0.97      0.97      0.97       432

