In [None]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [64]:
# Load data
# Keep the raw table under its own name, then derive the target series and the
# feature frame (everything except the target column) from it.
housing_raw = pd.read_csv("datasets/housing/housing.csv")
housing_labels = housing_raw["median_house_value"]
housing = housing_raw.drop(columns=["median_house_value"])

In [65]:
# Split the data: 20% of the rows are held out for testing, and the fixed
# random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    housing,
    housing_labels,
    test_size=0.2,
    random_state=42,
)

# Use only the first 5000 training instances (positional slice of the split).
n_small = 5000
X_train_small = X_train.iloc[:n_small]
y_train_small = y_train.iloc[:n_small]

In [66]:
# Preprocessing
# The single categorical column is named explicitly; every other column of the
# training frame is treated as numeric (original column order is preserved).
cat_features = ["ocean_proximity"]
num_features = [col for col in X_train_small.columns if col not in cat_features]

# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown="ignore" makes the encoder emit an all-zero row for a category
# it did not see while fitting. With the default ("error"), a cross-validation
# fold whose training part happens to lack a rare category raises as soon as
# that category shows up in the held-out part, and the whole fit is lost.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch.
preprocess = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
])

# Build pipeline
# Step names ("preprocess", "svr") are part of the hyper-parameter keys used by
# the grid search (e.g. "svr__C"), so they must not be renamed.
svr_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("svr", SVR()),
])

In [67]:
# Hyper-parameter candidates, one sub-grid per kernel. Keys are prefixed with
# "svr__" so they address the "svr" step of the pipeline.
linear_grid = {
    "svr__kernel": ["linear"],
    "svr__C": [0.1, 1, 5, 10, 20],
}

# gamma only applies to the RBF kernel, hence the separate sub-grid.
rbf_grid = {
    "svr__kernel": ["rbf"],
    "svr__C": [1, 5, 10, 20],
    "svr__gamma": ["scale", 0.01, 0.05, 0.1],
}

# NOTE(review): the run recorded in this notebook selected svr__C=20, the
# largest linear candidate; a wider C range may be worth trying.
param_grid = [linear_grid, rbf_grid]


In [70]:
# Grid search
# 3-fold cross-validated search over param_grid. Scores are negated mean
# squared errors, so larger (closer to zero) is better. n_jobs=-1 asks for all
# available processors and verbose=2 turns on per-fit progress messages.
search_options = dict(
    cv=3,
    scoring="neg_mean_squared_error",
    verbose=2,
    n_jobs=-1,
)
grid_search = GridSearchCV(
    estimator=svr_pipeline,
    param_grid=param_grid,
    **search_options,
)

In [69]:
# Run the search on the reduced training set.
print("training GridSearchCV...")
grid_search.fit(X_train_small, y_train_small)

# Report the winning parameter combination and its cross-validated score
# (negated MSE, per the scoring chosen when the search was built).
report = (
    ("\nBest params: ", grid_search.best_params_),
    ("\nBest score: ", grid_search.best_score_),
)
for heading, value in report:
    print(heading)
    print(value)

training GridSearchCV...
Fitting 3 folds for each of 21 candidates, totalling 63 fits

Best params: 
{'svr__C': 20, 'svr__kernel': 'linear'}

Best score: 
-9090487423.621984
