In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import r2_score, make_scorer, mean_absolute_percentage_error
from scipy.stats import loguniform
from joblib import dump

# Expand display options
pd.set_option("display.max_rows", None)  # Show all rows
pd.set_option("display.max_columns", None)  # Show all columns
pd.set_option("display.max_colwidth", None)  # Do not truncate column text
pd.set_option("display.expand_frame_repr", False)  # Avoid line wrapping

In [2]:
df_OHE = pd.read_csv('../DataSet/RegressionData/healthinsurance_OHE.csv')

In [3]:
from joblib import parallel_backend

# Define features & target
X = df_OHE.drop('claim', axis=1)
y = df_OHE['claim']

# Define MAPE as a scoring metric
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define SVR pipeline with scaling (SVR requires feature scaling)
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features for SVR
    ('svr', SVR())  # Support Vector Regression
])

# Define hyperparameter grid for SVR
param_dist = {
    'svr__C': loguniform(1e-3, 1e3),           # Regularization parameter
    'svr__epsilon': loguniform(1e-4, 1e1),       # Tolerance margin for loss
    'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel types
    'svr__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]     # Kernel coefficient
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter combinations to try
    cv=5,        # 5-fold cross-validation
    n_jobs=-1,   # Use all available cores
    random_state=42,  # Ensure reproducibility
    verbose=3,   # Show intermediate progress
    scoring=mape_scorer,  # Optimize for MAPE
)

# Use the threading backend to potentially allow verbose output to print
with parallel_backend('threading'):
    random_search.fit(X_train, y_train)

# Predictions on training set
y_train_pred = random_search.best_estimator_.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred) * 100  # Convert to %

# Predictions on test set
y_test_pred = random_search.best_estimator_.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100  # Convert to %

# Print results
print("Best parameters found:", random_search.best_params_)
print("Best cross-validation score (MAPE):", abs(random_search.best_score_))  # Convert to positive value
print(f"Train set R² score: {r2_train:.4f}")
print(f"Train set MAPE: {mape_train:.2f}%")
print(f"Test set R² score: {r2_test:.4f}")
print(f"Test set MAPE: {mape_test:.2f}%")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 3/5] END svr__C=17.71884735480682, svr__epsilon=0.00012674255898937226, svr__gamma=auto, svr__kernel=sigmoid;, score=-0.858 total time= 2.7min
[CV 1/5] END svr__C=0.002231010801867922, svr__epsilon=2.1423021757741068, svr__gamma=0.1, svr__kernel=sigmoid;, score=-1.012 total time= 2.9min
[CV 2/5] END svr__C=17.71884735480682, svr__epsilon=0.00012674255898937226, svr__gamma=auto, svr__kernel=sigmoid;, score=-0.851 total time= 3.1min
[CV 2/5] END svr__C=0.002231010801867922, svr__epsilon=2.1423021757741068, svr__gamma=0.1, svr__kernel=sigmoid;, score=-1.000 total time= 3.2min
[CV 1/5] END svr__C=17.71884735480682, svr__epsilon=0.00012674255898937226, svr__gamma=auto, svr__kernel=sigmoid;, score=-0.866 total time= 3.3min
[CV 5/5] END svr__C=3.907967156822881, svr__epsilon=0.0006026889128682511, svr__gamma=0.01, svr__kernel=rbf;, score=-1.044 total time= 3.7min
[CV 3/5] END svr__C=0.002231010801867922, svr__epsilon=2.1423021

KeyboardInterrupt: 

In [None]:
# Get Top 10 parameters and scores
results = pd.DataFrame(random_search.cv_results_)
results = results.sort_values(by='rank_test_score')
results = results[['params', 'mean_test_score', 'std_test_score']]
print(results.head(1000))

                                                                                                                       params  mean_test_score  std_test_score
12           {'svr__C': 12.746711578215052, 'svr__epsilon': 0.015876781526923997, 'svr__gamma': 0.1, 'svr__kernel': 'linear'}        -0.177237        0.001981
68            {'svr__C': 30.12648273008458, 'svr__epsilon': 0.17707795393478512, 'svr__gamma': 0.01, 'svr__kernel': 'linear'}        -0.177892        0.002133
19         {'svr__C': 30.473382351691427, 'svr__epsilon': 0.04994139416350944, 'svr__gamma': 'auto', 'svr__kernel': 'linear'}        -0.177902        0.002116
54             {'svr__C': 71.94593762420305, 'svr__epsilon': 0.00553770652765281, 'svr__gamma': 0.1, 'svr__kernel': 'linear'}        -0.178377        0.002135
92             {'svr__C': 86.21907594035272, 'svr__epsilon': 0.00632343715999823, 'svr__gamma': 0.1, 'svr__kernel': 'linear'}        -0.178491        0.002171
4            {'svr__C': 98.77700294007911, 'sv