# Exercises of chapter 2

The purpose of this notebook is to solve the exercises proposed in chapter 2

Exercicies is:

1. Try a Support Vector Machine regressor.
2. Try replacing the `GridSearchCV` with a `RandomizedSearchCV`.
3. Try adding a `SelectFromModel` transformer in the preparation pipeline to select only the most important attributes.
4. Try creating a custom transformer that trains a k-Nearest Neighbors regressor (`sklearn.neighbors.KNeighborsRegressor`) in its `fit()`.
5. Automatically explore some preparation options using `RandomSearchCV`.
6. Try to implement the `StandardScalerClone` class again from scratch, then add support for the `inverse_transform()` method.

## Common data

In [1]:
from utils.data import load_housing_data

housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)
strat_train_set, strat_test_set = train_test_split(
    housing, test_size=0.2, stratify=housing["income_cat"], random_state=42
)



In [3]:
housing = strat_train_set.drop('median_house_value', axis=1)
housing_labels = strat_train_set['median_house_value'].copy()

In [4]:
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel


class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state
        self.kmeans_: KMeans = None

    def fit(self, x, _=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(x, sample_weight=sample_weight)
        return self

    def transform(self, x):
        return rbf_kernel(x, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, _=None):
        return [
            f'Feature {i} similarity' for i in range(self.n_clusters)
        ]

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

In [6]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import FunctionTransformer


def column_ratio(x: pd.DataFrame) -> pd.Series:
    return x[:, [0]] / x[:, [1]]


def ratio_name() -> list:
    return ['ratio']


def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy='median'),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler()
    )


log_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    FunctionTransformer(np.log, feature_names_out=ratio_name),
    StandardScaler()
)

cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1, random_state=42)

default_num_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

preprocessing = ColumnTransformer(
    [
        ('bedrooms', ratio_pipeline(), ['total_bedrooms', 'total_rooms']),
        ('rooms_per_house', ratio_pipeline(), ['total_rooms', 'households']),
        ('people_per_house', ratio_pipeline(), ['population', 'households']),
        ('log', log_pipeline, [
            'total_bedrooms', 'total_rooms', 'population',
            'households', 'median_income'
        ]),
        ('geo', cluster_simil, ['latitude', 'longitude']),
        ('cat', cat_pipeline, make_column_selector(dtype_include=object))
    ],
    remainder=default_num_pipeline
)

## 1. Try SVR

Try a Support Vector Machine regressor (`sklearn.svm.SVR`) with various hyperparameters, such as `kernel="linear"` (with various values for the `C` hyperparameter) or `kernel="rbf"` (with various values for the `C` and `gamma` hyperparameters). Note that SVMs don't scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don't worry about what the hyperparameters mean for now (see the SVM notebook if you're interested). How does the best `SVR` predictor perform?

In [7]:
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'svr__kernel': ['linear'],
        'svr__C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.]
    },
    {
        'svr__kernel': ['rbf'],
        'svr__C': [1., 3., 10., 30., 100., 300., 1000.],
        'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1., 3.]
    }
]
svr_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('svr', SVR())
])
grid_search = GridSearchCV(
    svr_pipeline,
    param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error'
)
grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [8]:
svr_grid_search_rmse = -grid_search.best_score_
svr_grid_search_rmse

68489.15096941793

The result of SVR is worse than `RandomForestRegressor` (The random forest best score is 47560.17, but for SVR we trained on much fewer data).

In [9]:
grid_search.best_params_

{'svr__C': 30000.0, 'svr__kernel': 'linear'}

## 2. RandomizedSearchCV

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, loguniform

param_random = {
    'svr__kernel': ['linear', 'rbf'],
    'svr__C': loguniform(20, 200_000),
    'svr__gamma': expon(scale=1.0)
}

rnd_search = RandomizedSearchCV(
    svr_pipeline,
    param_distributions=param_random,
    n_iter=50,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42,
)
rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [11]:
svr_rnd_search_rmse = -rnd_search.best_score_
svr_rnd_search_rmse

56760.0443956341

Better than last with grid search, but still far from the `RandomForestRegressor`.

## Try SelectFromModel

In [12]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

selector_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('selector', SelectFromModel(
        RandomForestRegressor(random_state=42),
        threshold=0.015 # min feature importance
    )),
    ('svr', SVR(
        C=rnd_search.best_params_['svr__C'],
        gamma=rnd_search.best_params_['svr__gamma'],
        kernel=rnd_search.best_params_['svr__kernel']
    ))
])

In [13]:
from sklearn.model_selection import cross_val_score

selector_rmses = -cross_val_score(
    selector_pipeline,
    housing.iloc[:5000],
    housing_labels.iloc[:5000],
    scoring='neg_root_mean_squared_error',
    cv=3
)
pd.Series(selector_rmses).describe()

count        3.000000
mean     56435.865714
std       2607.834153
min      54109.908919
25%      55026.239644
50%      55942.570370
75%      57598.844112
max      59255.117854
dtype: float64

## 4. Complete Pipeline

In [17]:
from sklearn.base import MetaEstimatorMixin, BaseEstimator, TransformerMixin, clone
from sklearn.utils.validation import check_is_fitted

class FeatureFromRegressor(MetaEstimatorMixin, BaseEstimator, TransformerMixin):
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None):
        estimator_ = clone(self.estimator)
        estimator_.fit(X, y)
        self.estimator_ = estimator_
        self.n_features_in_ = self.estimator_.n_features_in_
        if hasattr(self.estimator, 'features_names_in_'):
            self.feature_names_in_ = self.estimator.features_names_in_
        return self

    def transform(self, X):
        check_is_fitted(self)
        predictions = self.estimator_.predict(X)
        if predictions.ndim == 1:
            predictions = predictions.reshape(-1, 1)
        return predictions

    def get_feature_names_out(self, _=None):
        check_is_fitted(self)
        n_outputs = getattr(self.estimator_, 'n_outputs_', 1)
        estimator_class_name = self.estimator_.__class__.__name__
        estimator_short_name = estimator_class_name.lower().replace('_', '')
        return [
            f'{estimator_short_name}_predictions_{index}'
            for index in range(n_outputs)
        ]


In [18]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.estimator_checks import check_estimator

check_estimator(FeatureFromRegressor(KNeighborsRegressor()))

In [19]:
knn_reg = KNeighborsRegressor(n_neighbors=3, weights='distance')
knn_transformer = FeatureFromRegressor(knn_reg)
geo_features = housing[['latitude', 'longitude']]
knn_transformer.fit_transform(geo_features, housing_labels)

array([[ 68850.],
       [279600.],
       [ 79000.],
       ...,
       [135700.],
       [258100.],
       [ 62700.]])

In [20]:
knn_transformer.get_feature_names_out()

['kneighborsregressor_predictions_0']

In [21]:
transformers = [
    (name, clone(transformer), columns)
    for name, transformer, columns in preprocessing.transformers
]
geo_index = [name for name, _, _ in transformers].index('geo')
transformers[geo_index] = ('geo', knn_transformer, ['latitude', 'longitude'])

new_geo_preprocessing = ColumnTransformer(transformers)

In [22]:
new_geo_pipeline = Pipeline([
    ('preprocessing', new_geo_preprocessing),
    ('svr', SVR(
        C=rnd_search.best_params_['svr__C'],
        gamma=rnd_search.best_params_['svr__gamma'],
        kernel=rnd_search.best_params_['svr__kernel']
    ))
])

In [23]:
new_pipe_rmses = -cross_val_score(
    new_geo_pipeline,
    housing.iloc[:5000],
    housing_labels.iloc[:5000],
    scoring='neg_root_mean_squared_error',
    cv=3
)
pd.Series(new_pipe_rmses).describe()

count         3.000000
mean     104487.504409
std        2940.590141
min      101582.912110
25%      102999.848280
50%      104416.784451
75%      105939.800558
max      107462.816665
dtype: float64

## 5 Automatically explore some preparation options using RandomSearchCV.

In [24]:
param_distribs = {
    'preprocessing__geo__estimator__n_neighbors': range(1, 30),
    'preprocessing__geo__estimator__weights': ['distance', 'uniform'],
    'svr__C': loguniform(20, 200_000),
    'svr__gamma': expon(scale=1.0),
}

new_geo_rnd_search = RandomizedSearchCV(
    new_geo_pipeline,
    param_distributions=param_distribs,
    n_iter=50,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42
)
new_geo_rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])

In [25]:
new_geo_rnd_search_rmse = -new_geo_rnd_search.best_score_
new_geo_rnd_search_rmse

106367.27398279442

## 6. Try to implement the StandardScalerClone

In [27]:
from sklearn.utils.validation import check_array

class StandardScalerClone(BaseEstimator, TransformerMixin):
    def __init__(self, with_mean=True):
        self.with_mean = with_mean

    def fit(self, X, y=None):
        X_orig = X
        X = check_array(X)
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1]
        if hasattr(X_orig, 'columns'):
            self.feature_names_in_ = np.array(X_orig.columns, dtype=object)
        return self

    def transform(self, X):
        check_is_fitted(self)
        X = check_array(X)
        if self.n_features_in_ != X.shape[1]:
            raise ValueError('Unexpected number of features')
        if self.with_mean:
            X = X - self.mean_
        return X / self.scale_

    def inverse_transform(self, X):
        check_is_fitted(self)
        X = check_array(X)
        if self.n_features_in_ != X.shape[1]:
            raise ValueError('Unexpected number of features')
        X = X * self.scale_
        return X + self.mean_ if self.with_mean else X

    def get_feature_names_out(self, input_features=None):
        if input_features is None:
            return getattr(
                self,
                'feature_names_in_',
                [f'x{index}' for index in range(self.n_features_in_)]
            )

        if len(input_features) != self.n_features_in_:
            raise ValueError('Invalid number of features')

        if hasattr(self, 'features_names_in_') and not np.all(self.feature_names_in_ == input_features):
            raise ValueError('input_features != feature_names_in_')
        return input_features


In [28]:
check_estimator(StandardScalerClone())

In [30]:
np.random.seed(42)
X = np.random.rand(1000, 3)

scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(X)

assert np.allclose(X_scaled, (X - X.mean(axis=0)) / X.std(axis=0))

In [31]:
# with_mean=False

scaler = StandardScalerClone(with_mean=False)
X_scaled_uncentered = scaler.fit_transform(X)

assert np.allclose(X_scaled_uncentered, X / X.std(axis=0))

In [32]:
# inverse

scaler = StandardScalerClone()
X_back = scaler.inverse_transform(scaler.fit_transform(X))

assert np.allclose(X, X_back)

In [33]:
assert np.all(scaler.get_feature_names_out() == ['x0', 'x1', 'x2'])
assert np.all(scaler.get_feature_names_out(['a', 'b', 'c']) == ['a', 'b', 'c'])

In [34]:
df = pd.DataFrame({ 'a': np.random.rand(100), 'b': np.random.rand(100) })
scaler = StandardScalerClone()
X_scaled = scaler.fit_transform(df)

assert np.all(scaler.feature_names_in_ == ['a', 'b'])
assert np.all(scaler.get_feature_names_out() == ['a', 'b'])