Exercise

1. Try a Support Vector Machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Don’t worry about what these hyperparameters mean for now. How does the best SVR predictor perform?

In [1]:
# get data
import pandas as pd
import numpy as np
# Read the artefacts saved under data/transit/: the label vector, the
# comma-separated feature matrix, and the stratified test split.
housing_labels = np.loadtxt("data/transit/housing_labels.csv")
housing_prepared = np.loadtxt(
    "data/transit/housing_prepared.csv",
    delimiter=",",
)
strat_test_set = pd.read_csv("data/transit/strat_test_set.csv")

In [5]:
# Names for the 16 feature columns.
# NOTE(review): assumed to follow the column order of housing_prepared —
# confirm against the preparation step that produced the CSV.
attributes = [
    "longitude", "latitude", "housing_median_age",
    "total_rooms", "total_bedrooms", "population", "households",
    "median_income",
    "rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room",
    "<1H OCEAN", "INLAND", "ISLAND", "NEAR BAY", "NEAR OCEAN",
]

In [7]:
from sklearn.svm import SVR
# Baseline: a support vector regressor with every hyperparameter left at its
# default, fitted on housing_prepared / housing_labels.
sv_reg = SVR()
sv_reg.fit(X=housing_prepared, y=housing_labels)

In [11]:
# Predict on the same matrix the model was fitted on, so any error computed
# from these predictions is a training error, not a generalisation estimate.
predictions = sv_reg.predict(housing_prepared)

In [12]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions against the labels, then its square
# root so the error is expressed in the same units as the labels.
mse = mean_squared_error(y_true=housing_labels, y_pred=predictions)
rmse = np.sqrt(mse)

In [13]:
# Display the RMSE of the default SVR on the data it was fitted on.
rmse

118578.69234925653

In [16]:
from sklearn.model_selection import cross_val_score

# Start again from an unfitted, default-configured SVR.
sv_reg = SVR()

# 5-fold cross-validation. The scorer returns negated MSE, one value per fold.
cv = cross_val_score(
    estimator=sv_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=5,
)

In [17]:
# `cv` holds negated MSE per fold (scoring="neg_mean_squared_error"), so flip
# the sign before taking the square root to obtain per-fold RMSE.
cv_rmse = np.sqrt(-cv)
print('mean rmse: ',cv_rmse.mean())
print('st deviation: ', cv_rmse.std())

# Not overfitting: the mean CV RMSE is essentially the same as the RMSE
# measured on the training data itself.
# NOTE(review): both errors are large, which suggests the default SVR is
# underfitting rather than performing well — confirm.

mean rmse:  118635.31486436282
st deviation:  855.860662189781


2. Try replacing GridSearchCV with RandomizedSearchCV

In [20]:
from sklearn.model_selection import RandomizedSearchCV

# Unfitted SVR used as the template estimator for the search.
sv_reg = SVR()

# Randomized search over two candidate grids (linear and RBF kernels):
# 10 sampled settings, each scored by 10-fold CV on negated MSE.
search_cv = RandomizedSearchCV(
    estimator=sv_reg,
    param_distributions=[
        {'kernel': ["linear"], 'C': [1e-2, 1e-1, 1, 1e1, 1e2]},
        {
            'kernel': ["rbf"],
            'C': [1e-2, 1e-1, 1, 1e1, 1e2],
            'gamma': ['scale', 'auto'],
        },
    ],
    n_iter=10,
    cv=10,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    random_state=42,
)

In [21]:
# Run the search configured above on the prepared features and labels.
search_cv.fit(housing_prepared, housing_labels)

In [24]:
# Per-candidate results of the fitted search; read below via its
# 'mean_test_score' and 'params' entries.
cv_res = search_cv.cv_results_

In [27]:
# The search was scored with 'neg_mean_squared_error', so each mean test score
# is a negated MSE: flip the sign and take the root to print one RMSE per
# sampled hyperparameter setting.
for mse_, params_ in zip(cv_res['mean_test_score'],cv_res['params']):
    print(np.sqrt(-mse_), params_)

118613.26647262045 {'kernel': 'rbf', 'gamma': 'scale', 'C': 1}
115791.48757272134 {'kernel': 'rbf', 'gamma': 'scale', 'C': 10.0}
118856.44726346298 {'kernel': 'linear', 'C': 0.01}
96509.54851755845 {'kernel': 'rbf', 'gamma': 'scale', 'C': 100.0}
118930.7694439634 {'kernel': 'rbf', 'gamma': 'scale', 'C': 0.01}
118903.78432962317 {'kernel': 'rbf', 'gamma': 'auto', 'C': 0.1}
111844.35112140469 {'kernel': 'linear', 'C': 1}
118175.49509148978 {'kernel': 'linear', 'C': 0.1}
96295.98617685701 {'kernel': 'rbf', 'gamma': 'auto', 'C': 100.0}
71481.43866536245 {'kernel': 'linear', 'C': 100.0}


In [29]:
# We try further: the linear kernel with the largest C scored best so far, so
# search again over the linear kernel only, with larger C values.

sv_reg = SVR()

search_cv = RandomizedSearchCV(
    estimator=sv_reg,
    param_distributions=[
        {'kernel': ["linear"], 'C': [1.5e1, 1e2, 1e3]},
    ],
    n_iter=3,
    cv=10,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    random_state=42,
)
search_cv.fit(housing_prepared, housing_labels)

# Mean test scores are negated MSE; convert each to RMSE for display.
cv_res = search_cv.cv_results_
for mse_, params_ in zip(cv_res['mean_test_score'], cv_res['params']):
    print(np.sqrt(-mse_), params_)

79544.55567911135 {'kernel': 'linear', 'C': 15.0}
71481.43866536245 {'kernel': 'linear', 'C': 100.0}
70433.91265538594 {'kernel': 'linear', 'C': 1000.0}


In [31]:
# Keep the best model found by the latest search.
sv_reg = search_cv.best_estimator_
# Persisting the model is currently disabled.
# NOTE(review): the recorded cell output still shows a dumped file path, so it
# appears to come from an earlier run with these lines enabled — confirm.
# import joblib
# joblib.dump(sv_reg, "data/models/sv_reg.gz", compress=7)

['data/models/sv_reg.gz']

3. Try adding a transformer in the preparation pipeline to select only the most important attributes.

In [None]:
from typing import List

# from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

class SelectImportantAttribs(BaseEstimator, TransformerMixin):
    """Transformer that drops a chosen set of columns from a 2-D feature array.

    Columns are identified against the fixed list of 16 names stored in
    ``self.attributes``, either by name or by positional index.

    Parameters
    ----------
    unimportant_attribs : list of int or list of str, optional
        Columns to drop, given as indices or as names from ``attributes``.
        ``None`` (the default) or an empty list keeps every column.

    Notes
    -----
    The parameter is stored exactly as passed and only resolved to indices
    when ``transform`` runs. This keeps ``get_params`` / ``set_params`` /
    ``clone`` consistent, so the transformer can be tuned by a search object.
    """

    def __init__(self, unimportant_attribs: List[int] | List[str] | None = None):
        # Names used to resolve string entries of `unimportant_attribs`.
        self.attributes = ['longitude',
            'latitude',
            'housing_median_age',
            'total_rooms',
            'total_bedrooms',
            'population',
            'households',
            'median_income',
            'rooms_per_hhold',
            'pop_per_hhold',
            'bedrooms_per_room',
            '<1H OCEAN',
            'INLAND',
            'ISLAND',
            'NEAR BAY',
            'NEAR OCEAN']
        self.unimportant_attribs = unimportant_attribs

    def _dropped_indices(self) -> List[int]:
        """Return the column indices to drop, resolving names via ``self.attributes``.

        Raises
        ------
        ValueError
            If a name is not present in ``self.attributes``.
        """
        if self.unimportant_attribs is None:
            return []
        return [
            self.attributes.index(attr_) if isinstance(attr_, str) else attr_
            for attr_ in self.unimportant_attribs
        ]

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``X`` and ``y`` are accepted (and ignored) so the transformer can be
        driven by ``fit_transform`` and used as a pipeline step.
        """
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Return ``X`` without the unimportant columns, in original column order.

        The kept indices are recomputed on every call and no instance state is
        modified, so repeated calls give the same result.
        """
        dropped = set(self._dropped_indices())
        kept = [idx for idx in range(len(self.attributes)) if idx not in dropped]
        return X[:, kept]

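# prepare_pipeline = Pipeline([
#    ...,
#    # The columns to drop must be passed explicitly (names or indices); 'ISLAND' is only an example.
#    ('select_important_attribs', SelectImportantAttribs(unimportant_attribs=['ISLAND']))
# ])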

# The SVR can then be retrained on the reduced data that keeps only the more important attributes.

4. Try creating a single pipeline that does the full data preparation plus the final prediction.

In [None]:
class predict_pipeline(BaseEstimator, TransformerMixin):
    """Pipeline step that delegates prediction to an already-fitted model.

    Parameters
    ----------
    estimator : object with a ``predict`` method, optional
        Fitted model to delegate to. When ``None`` (the default), the model is
        looked up at call time as ``search_cv.best_estimator_`` from the
        enclosing module, which is the original behaviour.
    """

    def __init__(self, estimator=None):
        self.estimator = estimator

    def _model(self):
        """Return the wrapped model, falling back to the module-level search result."""
        if self.estimator is not None:
            return self.estimator
        return search_cv.best_estimator_

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``; the wrapped model is already fitted.

        ``X`` and ``y`` are accepted (and ignored) so the step can be driven by
        ``Pipeline.fit`` and ``fit_transform``.
        """
        return self

    def transform(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self._model().predict(X)

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``.

        Provided so that a pipeline ending in this step exposes ``predict``.
        """
        return self.transform(X)

# full_prep_pred_pipeline = Pipeline([
#     ('prepare', prepare_pipeline),  # prepare_pipeline is already a Pipeline instance, so it is not called
#     ('predict', predict_pipeline()),
# ])

5. Automatically explore some preparation options using GridSearchCV.

- Omitted.