In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import r2_score, make_scorer, mean_absolute_percentage_error
from scipy.stats import loguniform
from joblib import dump

# Widen pandas' console rendering so frames print in full:
# no row/column elision, no cell-text truncation, no wrapping of wide frames.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.expand_frame_repr = False

In [2]:
# Load the health-insurance regression dataset from its CSV file into a DataFrame.
df_OHE = pd.read_csv(
    filepath_or_buffer='../DataSet/RegressionData/healthinsurance_OHE.csv',
)

In [3]:
from joblib import parallel_backend

# Features are every column except the regression target 'claim'.
X = df_OHE.drop('claim', axis=1)
y = df_OHE['claim']

# MAPE as a scoring metric. greater_is_better=False makes the scorer return the
# NEGATED error, so the search (which maximizes the score) minimizes MAPE.
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Split data (80% train, 20% test); fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVR is sensitive to feature scale, so standardization lives inside the
# pipeline and is re-fitted on the training portion of every CV fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR())
])

# Hyperparameter distributions sampled by the randomized search.
param_dist = {
    'svr__C': loguniform(1e-3, 1e3),        # Regularization strength (log-uniform)
    'svr__epsilon': loguniform(1e-4, 1e1),  # Width of the epsilon-insensitive tube (log-uniform)
    'svr__kernel': ['linear'],              # Only the linear kernel is searched
    # NOTE: gamma is a coefficient for the 'rbf', 'poly' and 'sigmoid' kernels
    # and has no effect on a linear kernel, so this entry does not influence
    # the fitted models. It is kept so the seeded sampler keeps drawing the
    # same C/epsilon candidates; drop it only if regenerating results is fine.
    'svr__gamma': ['scale', 'auto', 0.01, 0.1, 1, 10]
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,            # Number of parameter combinations to try
    cv=5,                 # 5-fold cross-validation
    n_jobs=-1,            # Use all available cores (was hard-coded to 18)
    random_state=42,      # Ensure reproducibility of the sampled candidates
    verbose=3,            # Show per-fold progress
    scoring=mape_scorer,  # Optimize for (negated) MAPE
)

# Use the threading backend to potentially allow verbose output to print
with parallel_backend('threading'):
    random_search.fit(X_train, y_train)

# Predictions on training set
y_train_pred = random_search.best_estimator_.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
mape_train = mean_absolute_percentage_error(y_train, y_train_pred) * 100  # Convert to %

# Predictions on test set
y_test_pred = random_search.best_estimator_.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)
mape_test = mean_absolute_percentage_error(y_test, y_test_pred) * 100  # Convert to %

# best_score_ is the negated MAPE as a fraction; flip the sign and convert to %
# so it is directly comparable with the train/test MAPE figures below.
mape_cv = abs(random_search.best_score_) * 100

# Print results
print("Best parameters found:", random_search.best_params_)
print(f"Best cross-validation score (MAPE): {mape_cv:.2f}%")
print(f"Train set R² score: {r2_train:.4f}")
print(f"Train set MAPE: {mape_train:.2f}%")
print(f"Test set R² score: {r2_test:.4f}")
print(f"Test set MAPE: {mape_test:.2f}%")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END svr__C=0.1767016940294795, svr__epsilon=5.669849511478847, svr__gamma=0.01, svr__kernel=linear;, score=-0.795 total time= 1.0min
[CV 3/5] END svr__C=0.1767016940294795, svr__epsilon=5.669849511478847, svr__gamma=0.01, svr__kernel=linear;, score=-0.790 total time= 1.4min
[CV 4/5] END svr__C=0.1767016940294795, svr__epsilon=5.669849511478847, svr__gamma=0.01, svr__kernel=linear;, score=-0.853 total time= 1.5min
[CV 2/5] END svr__C=0.1767016940294795, svr__epsilon=5.669849511478847, svr__gamma=0.01, svr__kernel=linear;, score=-0.786 total time= 1.8min
[CV 5/5] END svr__C=0.1767016940294795, svr__epsilon=5.669849511478847, svr__gamma=0.01, svr__kernel=linear;, score=-0.833 total time= 1.9min
[CV 1/5] END svr__C=0.008629132190071854, svr__epsilon=0.00019517224641449495, svr__gamma=1, svr__kernel=linear;, score=-1.003 total time= 2.2min
[CV 3/5] END svr__C=0.008629132190071854, svr__epsilon=0.00019517224641449495, svr

In [6]:
# Rank every sampled candidate by its CV rank (best first) and keep only the
# parameter dict plus the mean/std of the fold scores, then show the top 10.
# Scores come from the search's scorer, so they carry its sign convention.
results = (
    pd.DataFrame(random_search.cv_results_)
    .sort_values(by='rank_test_score')
    .loc[:, ['params', 'mean_test_score', 'std_test_score']]
)
print(results.head(10))

                                                                                                                  params  mean_test_score  std_test_score
6             {'svr__C': 5.068612120448909, 'svr__epsilon': 0.114357802784334, 'svr__gamma': 1, 'svr__kernel': 'linear'}        -0.177067        0.002022
3          {'svr__C': 4.0428727350273315, 'svr__epsilon': 0.3470266988650412, 'svr__gamma': 10, 'svr__kernel': 'linear'}        -0.178074        0.002244
1       {'svr__C': 47.6591180868084, 'svr__epsilon': 0.09643857615941427, 'svr__gamma': 'auto', 'svr__kernel': 'linear'}        -0.178232        0.002149
16   {'svr__C': 99.80894623678049, 'svr__epsilon': 0.000735907565201939, 'svr__gamma': 'scale', 'svr__kernel': 'linear'}        -0.178562        0.002211
13     {'svr__C': 622.0025976819159, 'svr__epsilon': 1.1015056790269626, 'svr__gamma': 'scale', 'svr__kernel': 'linear'}        -0.178814        0.002258
12  {'svr__C': 0.50465608220732, 'svr__epsilon': 0.00011649969967638916, 'sv