Exercise: Try a Support Vector Machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Note that SVMs don't scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don't worry about what the hyperparameters mean for now (see the SVM notebook if you're interested). How does the best SVR predictor perform?

### LIBRARIES

In [1]:
import os
import sys
sys.path.append(os.path.abspath('.'))  # Dodanie bieżącego katalogu
sys.path.append(os.path.abspath('..'))
from Functions import load_housing_data, shuffle_and_split, split_data_with_id_hash
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
housing = load_housing_data()
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from Classes import StandardScalerClone, ClusterSimilarity, FeatureFromRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from scipy.stats import expon, loguniform
from sklearn.svm import SVR



### DATA DOWNLOAD AND SPLIT
Use transformation from chapter_2_notebook, in this exercies i only want to check how SVM perfrom, without going into details.

In [2]:
housing = load_housing_data()
housing['income_cat'] = pd.cut(housing['median_income'], 
                               bins = [0., 1.5, 3.0, 4.5, 6., np.inf ], 
                               labels = [1, 2, 3, 4, 5])

strat_train_set, strat_test_set = train_test_split(housing, test_size = 0.2, stratify = housing['income_cat'], random_state = 42)

for set_ in (strat_test_set, strat_train_set):
    set_.drop('income_cat', axis = 1, inplace = True)

housing = strat_train_set.drop('median_house_value', axis = 1)
housing_labels = strat_train_set['median_house_value'].copy()


In [3]:
def column_ratio(X):
    return X[ :, [0]] / X[ :, [1]]

def ratio_name(function_transformer, feature_names_in):
    return ['ratio']

def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy = 'median'),
        FunctionTransformer(column_ratio, feature_names_out = ratio_name),
        StandardScaler()
    )

log_pipeline = make_pipeline(
    SimpleImputer(strategy = 'median'),
    FunctionTransformer(np.log, feature_names_out = 'one-to-one'),
    StandardScaler()
)

cat_attribs = ['ocean_proximity']
cat_pipeline = make_pipeline(SimpleImputer(strategy = 'most_frequent'),
                             OneHotEncoder(handle_unknown = 'ignore'))

cluster_simil = ClusterSimilarity(n_clusters = 10, gamma = 1, random_state = 42)
default_num_pipeline = make_pipeline(SimpleImputer(strategy = 'median'))
preprocessing = ColumnTransformer([
    ('bedroom_ratio', ratio_pipeline(), ['total_bedrooms', 'total_rooms']), 
    ('rooms_per_family', ratio_pipeline(), ['total_rooms', 'households']), 
    ('people_per_house', ratio_pipeline(), ['population', 'households']),
    ('log', log_pipeline, ['total_bedrooms', 'total_rooms', 'population', 'households', 'median_income']), 
    ('geo', cluster_simil, ['latitude', 'longitude']), 
    ('cat', cat_pipeline, make_column_selector(dtype_include=object))
],
remainder=default_num_pipeline)




In [4]:

full_pipeline = Pipeline([
    ('preprocessing', preprocessing), 
    ('svr', SVR())
])

grid_params = [{'svr__kernel' : ['linear', 'rbf']},
               {'svr__C' : [-1, 0, 1]}, 
               {'svr__gamma' : [0,1,10]}]

grid_search = GridSearchCV(estimator = full_pipeline,
                           param_grid = grid_params, 
                          cv = 3, 
                          scoring = 'neg_root_mean_squared_error' )
grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

6 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\macie\OneDrive\Desktop\Nauka2\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\macie\OneDrive\Desktop\Nauka2\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\macie\OneDrive\Desktop\Nauka2\.venv\Lib\site-packages\sklearn\pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~

In [5]:
svr_grid_search_rmse = -grid_search.best_score_
svr_grid_search_rmse

np.float64(117107.5149145932)

2. replacing the GridSearchCV with a RandomizedSearchCV.

In [6]:
params_dist = [{'svr__kernel' : ['linear', 'rbf']}, 
               {'svr__C' : loguniform(20, 2000)}, 
               {'svr__gamma' : expon(scale = 1)}]

rnd_search = RandomizedSearchCV(estimator = full_pipeline, 
                                param_distributions = params_dist, 
                                scoring = 'neg_root_mean_squared_error', 
                                cv = 3, 
                                n_iter = 10)
rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])
svr_rnd_search_rmse = -rnd_search.best_score_
svr_rnd_search_rmse

np.float64(115587.68451066622)

3. Try adding a SelectFromModel transformer in the preparation pipeline to select only the most important attributes.

In [7]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

new_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('feature_selections', SelectFromModel(RandomForestRegressor(random_state = 1), threshold = .05)),
    ('svr', SVR())])

new_pipeline.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [None]:
from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor(n_neighbors=3, weights="distance")
knn_transformer = FeatureFromRegressor(knn_reg)
geo_features = housing[["latitude", "longitude"]]
knn_transformer.fit_transform(geo_features, housing_labels)

