## Imports

In [9]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, confusion_matrix
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import itertools



## Utils

In [10]:
def generate_hyperparameter_combinations(param_ranges):
    '''
    Parameters:
    param_ranges (dict): Dictionary with hyperparameter names as keys.
                         Each value is a tuple (start, stop, step) indicating the range and step size for the hyperparameter.
    Returns:
    list: List of dictionaries with all possible combinations of hyperparameters.
    '''
    param_values = {
        key: np.arange(start, stop + step, step)
        for key, (start, stop, step) in param_ranges.items()
    }
    
    param_combinations = list(itertools.product(*param_values.values()))
    return [
        dict(zip(param_values.keys(), combination))
        for combination in param_combinations
    ]

## Data load

In [11]:
# One-hot encoding
encoder = OneHotEncoder(categories='auto', sparse_output=False)

# Load training and test files for each dataset from the specified path
monk1_train = pd.read_csv('../Datasets/Monks/monks-1.train', sep='\s+', header=None)
monk1_test = pd.read_csv('../Datasets/Monks/monks-1.test', sep='\s+', header=None)

monk2_train = pd.read_csv('../Datasets/Monks/monks-2.train', sep='\s+', header=None)
monk2_test = pd.read_csv('../Datasets/Monks/monks-2.test', sep='\s+', header=None)

monk3_train = pd.read_csv('../Datasets/Monks/monks-3.train', sep='\s+', header=None)
monk3_test = pd.read_csv('../Datasets/Monks/monks-3.test', sep='\s+', header=None)

# List to store the transformed datasets
monks_train = []
monks_test = []

# Dataset monk1
X1_train = monk1_train.iloc[:, 1:7].values  # Features
y1_train = monk1_train.iloc[:, 0].values    # Labels

X1_test = monk1_test.iloc[:, 1:7].values
y1_test = monk1_test.iloc[:, 0].values

# Apply encoder to monk1
X1_train_encoded = encoder.fit_transform(X1_train)  # Fit and transform on training data
X1_test_encoded = encoder.transform(X1_test)        # Only transform on test data

monks_train.append((X1_train_encoded, y1_train))
monks_test.append((X1_test_encoded, y1_test))

# Dataset monk2
X2_train = monk2_train.iloc[:, 1:7].values
y2_train = monk2_train.iloc[:, 0].values

X2_test = monk2_test.iloc[:, 1:7].values
y2_test = monk2_test.iloc[:, 0].values

# Apply encoder to monk2
X2_train_encoded = encoder.fit_transform(X2_train)
X2_test_encoded = encoder.transform(X2_test)

monks_train.append((X2_train_encoded, y2_train))
monks_test.append((X2_test_encoded, y2_test))

# Dataset monk3
X3_train = monk3_train.iloc[:, 1:7].values
y3_train = monk3_train.iloc[:, 0].values

X3_test = monk3_test.iloc[:, 1:7].values
y3_test = monk3_test.iloc[:, 0].values

# Apply encoder to monk3
X3_train_encoded = encoder.fit_transform(X3_train)
X3_test_encoded = encoder.transform(X3_test)

monks_train.append((X3_train_encoded, y3_train))
monks_test.append((X3_test_encoded, y3_test))


## Model creation

In [12]:
def create_KNN(K=100):
    '''
    Create an K-NN model with the parameter K.
    param K: Regularization parameter.
    return: K-NN model.
    '''
    return KNeighborsClassifier(n_neighbors=K)

## K-fold cross validation

In [13]:
def k_fold_cross_validation(data, labels, params=None):
    '''
    Perform k-fold cross-validation for SVM.
    param data: Feature data.
    param labels: Target labels.
    param type: Kernel type for SVM.
    param params: Dictionary of hyperparameters.
    return: Average score and trained model.
    '''
    # 3. Configure k-fold cross-validation
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    print(kfold)
    print(data)
    print(labels)
    # 4. Cross-validation loop
    fold_no = 1
    accuracy_per_fold = []
    for train_index, val_index in kfold.split(data, labels):
        
        # Split the dataset
        X_train, X_val = data[train_index], data[val_index]
        y_train, y_val = labels[train_index], labels[val_index]

        # Create the SVM model
        model = create_KNN(K=params['K'])

        # Train the model
        model.fit(X_train, y_train)

        # Predict on validation set
        pred = model.predict(X_val)
        
        # Get the accuracy score
        score = accuracy_score(pred, y_val)
        accuracy_per_fold.append(score)    
        fold_no += 1

    # Calculate the average score
    avg_score = np.mean(accuracy_per_fold)

    # Split the dataset for final training
    _, X_val, _, y_val = train_test_split(data, labels, test_size=0.2, random_state=42)

    # Create the SVM model
    model = create_KNN(K=params['K'])

    # Train the model on the entire dataset
    model.fit(data, labels)

    return avg_score, model

## Greed search

In [14]:
def greed_search(data, labels, param_grid=None):
    '''
    Perform greedy search for hyperparameter tuning.
    param data: Feature data.
    param labels: Target labels.
    param type: Kernel type for SVM.
    param param_grid: List of hyperparameter combinations.
    return: Best scores, best parameter configurations, and best models.
    '''
    best_scores = []  # List to store the scores
    best_params_list = []  # List to store the parameter configurations
    best_models = []  # List to store the models
    
    for params in param_grid:
        # Perform k-fold cross-validation
        score, model = k_fold_cross_validation(data, labels, params=params)

        # Add the results to the list
        best_scores.append(score)
        best_params_list.append(params)
        best_models.append(model)

        # Sort the list of scores and keep only the top 5
        sorted_indices = np.argsort(best_scores)[::-1]  # Sort the scores in descending order
        best_scores = [best_scores[i] for i in sorted_indices][:5]  # Keep the top 5 scores
        best_params_list = [best_params_list[i] for i in sorted_indices][:5]
        best_models = [best_models[i] for i in sorted_indices][:5]

    return best_scores, best_params_list, best_models

## Model selection

In [15]:
def selection(data, labels):
    '''
    Perform hyperparameter selection for SVM models.
    param data: Feature data.
    param labels: Target labels.
    return: Best scores, best parameter configurations, and best models.
    '''

    # Define the range of hyperparameters
    param_ranges = {
        "K": (1, 1000, 10),  # From 1 to 1000 with step of 10
    }

    print("Generating hyperparameter combinations...")
    param_grid = generate_hyperparameter_combinations(param_ranges)

    best_scores = []  # List to store the scores
    best_params_list = []  # List to store the parameter configurations
    best_models = []  # List to store the models

    actual_scores, actual_params_list, actual_models = greed_search(data, labels, param_grid)

    # Extend the lists with the results from the current kernel type
    best_scores.extend(actual_scores)
    best_params_list.extend(actual_params_list)
    best_models.extend(actual_models)

    # Sort the scores in descending order and keep only the top 5
    sorted_indices = np.argsort(best_scores)[::-1]
    best_scores = [best_scores[i] for i in sorted_indices][:5]
    best_params_list = [best_params_list[i] for i in sorted_indices][:5]
    best_models = [best_models[i] for i in sorted_indices][:5]

    # Print the best scores, kernel types, and parameter configurations
    for i in range(len(best_scores)):
        print(f"Score: {best_scores[i]}, parameters: {best_params_list[i]}")

    return best_scores, best_params_list, best_models


In [16]:
"""
This script performs model selection for three different MONK datasets using the `selection` function.

The script prints headers for each MONK dataset and then calls the `selection` function with the training data for each dataset.
The results of the selection process are stored in variables `best_models_monk_1`, `best_models_monk_2`, and `best_models_monk_3`.

Variables:
    monks_train (list): A list containing training data for the three MONK datasets.
    best_models_monk_1 (object): The best models selected for the MONK1 dataset.
    best_models_monk_2 (object): The best models selected for the MONK2 dataset.
    best_models_monk_3 (object): The best models selected for the MONK3 dataset.

Functions:
    selection(X, y): A function that performs model selection given features `X` and labels `y`.

Usage:
    Run this script to perform model selection on the MONK datasets and obtain the best models for each dataset.
"""

print("-----------------MONK1-----------------")
_, _, best_models_monk_1 = selection(monks_train[0][0], monks_train[0][1])
print("-----------------MONK2-----------------")
_, _, best_models_monk_2 = selection(monks_train[1][0], monks_train[1][1])
print("-----------------MONK3-----------------")
_, _, best_models_monk_3 = selection(monks_train[2][0], monks_train[2][1])

-----------------MONK1-----------------
Generating hyperparameter combinations...
StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
[[1. 0. 0. ... 0. 1. 0.]
 [1. 0. 0. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 1. ... 0. 0. 1.]
 [0. 0. 1. ... 0. 0. 1.]
 [0. 0. 1. ... 1. 0. 1.]]
[1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1
 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 0 0 0 1 1 0 0 1 1 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 1 0 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]


AttributeError: 'NoneType' object has no attribute 'split'

## Model assessment

In [None]:
# 5. Model evaluation
y1_pred = best_models_monk_1[0].predict(X1_test_encoded)

# 6. Report the results
print("Accuracy:", accuracy_score(y1_test, y1_pred))
print("\nClassification Report:\n", classification_report(y1_test, y1_pred))

NameError: name 'best_models_monk_1' is not defined

In [330]:
# 5. Model evaluation
y2_pred = best_models_monk_2[0].predict(X2_test_encoded)

# 6. Report the results
print("Accuracy:", accuracy_score(y2_test, y2_pred))
print("\nClassification Report:\n", classification_report(y2_test, y2_pred))

Accuracy: 0.7731481481481481

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.77      0.82       290
           1       0.62      0.77      0.69       142

    accuracy                           0.77       432
   macro avg       0.75      0.77      0.76       432
weighted avg       0.79      0.77      0.78       432



In [331]:
# Predict on the test set for MONK3
y3_pred = best_models_monk_3[0].predict(X3_test_encoded)

# Report the results
print("Accuracy:", accuracy_score(y3_test, y3_pred))
print("\nClassification Report:\n", classification_report(y3_test, y3_pred))

Accuracy: 0.9722222222222222

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       204
           1       1.00      0.95      0.97       228

    accuracy                           0.97       432
   macro avg       0.97      0.97      0.97       432
weighted avg       0.97      0.97      0.97       432

