In [9]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [10]:
# Load data
housing = pd.read_csv("datasets/housing/housing.csv")
# housing.head(5)
housing_labels = housing["median_house_value"]
housing = housing.drop("median_house_value", axis=1)

In [11]:
# Split and train data
X_train, X_test, y_train, y_test = train_test_split(housing, housing_labels, test_size=0.2, random_state=42)

# Use only first 5000 training instances
X_train_small = X_train[:5000]
y_train_small = y_train[:5000]

In [12]:
# Preprocessing
num_features = list(X_train_small.drop("ocean_proximity", axis=1).columns)
cat_features = ["ocean_proximity"]

preprocess = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), num_features),

    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder())
    ]), cat_features)
])

# Build pipeline
svr_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("svr", SVR())
])

In [13]:
# Parameter distribution
param_dist = {
    "svr__kernel": ["linear", "rbf"],
    "svr__C": np.logspace(-1, 2, 10),          # 0.1 â†’ 100
    "svr__gamma": ["scale", 0.01, 0.05, 0.1]  # only used for RBF
}

In [15]:
# Random search cv
random_search = RandomizedSearchCV(
    estimator=svr_pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [16]:
# Fit on training subset
print("Running RandomizedSearchCV...")
random_search.fit(X_train_small, y_train_small)

print("\nBest hyperparameters:")
print(random_search.best_params_)

best_mse = -random_search.best_score_
best_rmse = np.sqrt(best_mse)
print("\nBest CV RMSE: {:.2f}".format(best_rmse))

Running RandomizedSearchCV...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

Best hyperparameters:
{'svr__kernel': 'linear', 'svr__gamma': 0.1, 'svr__C': np.float64(46.41588833612777)}

Best CV RMSE: 82039.22
