In [4]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.over_sampling import SMOTE

from collections import Counter

# Load the data
# PUT THE CORRECT FILE PATH. WE PUT THIS ONE FOR KAGGLE
# The dataset directory is defined once, so pointing the notebook at another
# location means editing this single line instead of four separate paths.
DATA_DIR = '/kaggle/input/ftml-classification2'
X_train = np.load(f'{DATA_DIR}/X_train.npy')
X_test = np.load(f'{DATA_DIR}/X_test.npy')
# Labels are flattened to 1-D with ravel() before being passed to scikit-learn.
y_train = np.load(f'{DATA_DIR}/y_train.npy').ravel()
y_test = np.load(f'{DATA_DIR}/y_test.npy').ravel()

# Fail fast on inconsistent inputs rather than deep inside a later fit/predict call.
assert len(X_train) == len(y_train), "X_train and y_train have different numbers of samples"
assert len(X_test) == len(y_test), "X_test and y_test have different numbers of samples"
assert X_train.shape[1:] == X_test.shape[1:], "X_train and X_test have different feature dimensions"

# Inspect the data
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Preprocess the data
# The scaler is fitted on the training features only; the test features are
# transformed with the training statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Check class distribution
print(f'Class distribution in y_train: {Counter(y_train)}')

X_train shape: (500, 30)
X_test shape: (500, 30)
y_train shape: (500,)
y_test shape: (500,)
Class distribution in y_train: Counter({1: 259, 0: 241})


### Hyperparameter Optimization for SVM with Polynomial Kernel using Optuna

Optuna searches over the SVM hyperparameters (`C`, `degree`, `gamma`), scoring each candidate by its mean accuracy under 5-fold stratified cross-validation on the training set. The best-scoring combination found during the search is then refit on the full training set and evaluated on the test set.

In [6]:
import optuna
from sklearn.model_selection import StratifiedKFold

# Objective function for Optuna
def objective(trial):
    """Score one polynomial-kernel SVC configuration for the Optuna study.

    Parameters
    ----------
    trial
        Optuna trial object used to sample `C`, `degree` and `gamma`.

    Returns
    -------
    Mean accuracy across 5 stratified folds of `X_train_scaled` / `y_train`.
    """
    # Dict entries are evaluated top to bottom, so the sampling order
    # (C, then degree, then gamma) is the same as listing them one by one.
    sampled = {
        'C': trial.suggest_float('C', 5, 15, step=0.1),
        'degree': trial.suggest_int('degree', 2, 4),
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
    }

    classifier = SVC(kernel='poly', **sampled)

    # Unshuffled stratified folds: the split is identical for every trial.
    folds = StratifiedKFold(n_splits=5)
    fold_accuracies = cross_val_score(
        classifier, X_train_scaled, y_train, cv=folds, scoring='accuracy'
    )
    return fold_accuracies.mean()

# Create a study object and optimize the objective function.
# The sampler is seeded so that a fresh kernel reproduces the same sequence of
# trials and therefore the same best parameters; an unseeded study can settle
# on a different configuration on every run. TPE is the sampler Optuna uses by
# default for single-objective studies, so only the seed is new here.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# NOTE(review): the search space is discrete (101 values of C x 3 degrees x
# 2 gamma modes = 606 combinations), so 1000 trials necessarily revisits some
# configurations.
study.optimize(objective, n_trials=1000)  # Increase the number of trials for a wider search

# Print the best parameters
best_params = study.best_params
print(f'Best parameters: {best_params}')

# Train the best model on the entire training set and evaluate on the test set
best_svc_model = SVC(
    kernel='poly',
    degree=best_params['degree'],
    gamma=best_params['gamma'],
    C=best_params['C'],
)
best_svc_model.fit(X_train_scaled, y_train)
y_pred = best_svc_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {test_accuracy:.4f}')


[I 2024-06-30 20:03:16,265] A new study created in memory with name: no-name-c36bcb37-3d93-4440-9e10-6e2bb5ad3d78
[I 2024-06-30 20:03:16,352] Trial 0 finished with value: 0.598 and parameters: {'C': 13.700000000000001, 'degree': 2, 'gamma': 'scale'}. Best is trial 0 with value: 0.598.
[I 2024-06-30 20:03:16,411] Trial 1 finished with value: 0.5880000000000001 and parameters: {'C': 14.9, 'degree': 4, 'gamma': 'scale'}. Best is trial 0 with value: 0.598.
[I 2024-06-30 20:03:16,471] Trial 2 finished with value: 0.58 and parameters: {'C': 10.3, 'degree': 4, 'gamma': 'scale'}. Best is trial 0 with value: 0.598.
[I 2024-06-30 20:03:16,551] Trial 3 finished with value: 0.598 and parameters: {'C': 14.3, 'degree': 2, 'gamma': 'auto'}. Best is trial 0 with value: 0.598.
[I 2024-06-30 20:03:16,629] Trial 4 finished with value: 0.598 and parameters: {'C': 12.9, 'degree': 2, 'gamma': 'scale'}. Best is trial 0 with value: 0.598.
[I 2024-06-30 20:03:16,687] Trial 5 finished with value: 0.728 and para

Best parameters: {'C': 10.5, 'degree': 3, 'gamma': 'auto'}
Test set accuracy: 0.8560


The SVC model, with `C`, `degree`, and `gamma` tuned by Optuna using stratified 5-fold cross-validation, achieved a test set accuracy of 0.8560.

### Hyperparameter Optimization for Random Forest using Optuna

Optuna searches over the Random Forest hyperparameters (`n_estimators`, `max_depth`, `min_samples_split`, `min_samples_leaf`), scoring each candidate by its mean accuracy under 5-fold stratified cross-validation on the training set. The best-scoring combination found during the search is then refit on the full training set and validated on the test set.

In [11]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Objective function for Random Forest
def objective_rf(trial):
    """Score one Random Forest configuration for the Optuna study.

    Parameters
    ----------
    trial
        Optuna trial object used to sample `n_estimators`, `max_depth`,
        `min_samples_split` and `min_samples_leaf`.

    Returns
    -------
    Mean accuracy across 5 stratified folds of `X_train_scaled` / `y_train`.
    """
    # Dict entries are evaluated top to bottom, so the four integers are
    # sampled in the same order as when they are requested one by one.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    # The forest's own randomness is pinned so a given configuration always
    # receives the same score.
    forest = RandomForestClassifier(random_state=42, **sampled)

    folds = StratifiedKFold(n_splits=5)
    fold_accuracies = cross_val_score(
        forest, X_train_scaled, y_train, cv=folds, scoring='accuracy'
    )
    return fold_accuracies.mean()

# Create a study object and optimize the objective function.
# The sampler is seeded so that a fresh kernel reproduces the same sequence of
# trials and therefore the same best parameters; an unseeded study can settle
# on a different configuration on every run. TPE is the sampler Optuna uses by
# default for single-objective studies, so only the seed is new here.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective_rf, n_trials=200)  # Increase the number of trials for a wider search

# Print the best parameters
best_params_rf = study_rf.best_params
print(f'Best parameters for Random Forest: {best_params_rf}')

# Train the best model on the entire training set and evaluate on the test set
# random_state matches the value used inside objective_rf, so the refit forest
# is built with the same seed as the cross-validated candidates.
best_rf_model = RandomForestClassifier(
    n_estimators=best_params_rf['n_estimators'],
    max_depth=best_params_rf['max_depth'],
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf'],
    random_state=42
)
best_rf_model.fit(X_train_scaled, y_train)
y_pred_rf = best_rf_model.predict(X_test_scaled)
test_accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Test set accuracy for Random Forest: {test_accuracy_rf:.4f}')


[I 2024-06-30 20:08:25,425] A new study created in memory with name: no-name-4e63525d-be45-4c8b-97d6-522b4aa05ec6
[I 2024-06-30 20:08:28,735] Trial 0 finished with value: 0.756 and parameters: {'n_estimators': 248, 'max_depth': 17, 'min_samples_split': 4, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.756.
[I 2024-06-30 20:08:32,459] Trial 1 finished with value: 0.762 and parameters: {'n_estimators': 271, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.762.
[I 2024-06-30 20:08:35,159] Trial 2 finished with value: 0.756 and parameters: {'n_estimators': 213, 'max_depth': 15, 'min_samples_split': 8, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.762.
[I 2024-06-30 20:08:39,005] Trial 3 finished with value: 0.768 and parameters: {'n_estimators': 290, 'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 7}. Best is trial 3 with value: 0.768.
[I 2024-06-30 20:08:39,977] Trial 4 finished with value: 0.76 and parameters: {'n_e

Best parameters for Random Forest: {'n_estimators': 173, 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 8}
Test set accuracy for Random Forest: 0.8020


The Random Forest model, with `n_estimators`, `max_depth`, `min_samples_split`, and `min_samples_leaf` tuned by Optuna using stratified 5-fold cross-validation, achieved a test set accuracy of 0.8020.

The optimized SVC model outperformed the optimized Random Forest in test accuracy (0.8560 vs. 0.8020).