In [None]:
import tarfile
import urllib
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Download the housing archive into the working directory.
housing_url = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz'
urllib.request.urlretrieve(housing_url, 'housing.tgz')

# The context manager closes the archive even if extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a source you trust.
with tarfile.open('housing.tgz') as housing_tgz:
    housing_tgz.extractall()

# Load the extracted CSV and preview the first rows.
housing = pd.read_csv('housing.csv')
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Split the data into a training set and a test set.
from sklearn.model_selection import train_test_split
# A fixed seed keeps the split -- and every metric computed from it --
# identical across kernel restarts.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
len(train_set), len(test_set)

(16512, 4128)

In [None]:
# Split each subset into predictors (inputs) and the target (output).
# The target is selected with a list so it stays a one-column DataFrame.
target_column = 'median_house_value'

train_set_input = train_set.drop(columns=target_column)
train_set_output = train_set[[target_column]].copy()

test_set_input = test_set.drop(columns=target_column)
test_set_output = test_set[[target_column]].copy()

print(train_set_output.shape, train_set_input.shape)

(16512, 1) (16512, 9)


In [None]:
# Perform data transformations
# Positional column indices used to slice the numeric feature array
# (X[:, ix]) when building the ratio features.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

from sklearn.base import BaseEstimator, TransformerMixin
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from the raw count columns.

    The input is indexed positionally (``X[:, ix]``) using the module-level
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``
    constants, so it must be a 2-D array whose columns are in that order.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When True, a ``bedrooms_per_room`` column is appended after the two
        per-household ratios. The default reproduces the three-column output.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # Stored unmodified under the parameter's own name so that
        # BaseEstimator.get_params()/set_params() can discover and tune it.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        Columns are appended in this order: rooms_per_household,
        population_per_household and, if enabled, bedrooms_per_room.
        """
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing, applied in order: fill missing values with the
# column median, append the ratio features, then standardize every column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns routed to the numeric pipeline. Their order fixes the positional
# indices that the numeric pipeline's feature adder relies on.
num_attribs = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income']
# Columns that are one-hot encoded.
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    # handle_unknown='ignore' encodes a category that was not seen during fit
    # as an all-zero row instead of raising at transform time; a random split
    # can leave a rare category out of the training set.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])

In [None]:
# Fit the preprocessing on the training inputs only, then reuse the fitted
# transformer (no refit) on the test inputs.
train_prepared = full_pipeline.fit_transform(X=train_set_input)
test_prepared = full_pipeline.transform(X=test_set_input)

# Preview the first three prepared rows.
train_prepared[:3]

array([[-0.06668853, -0.56216755,  0.8263536 , -0.52604788, -0.16712639,
        -0.59283784, -0.23833773, -0.62194201, -0.72775627, -0.10814739,
         1.59278879,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.57767424, -0.75424262,  0.50808456, -0.41563262, -0.22706246,
        -0.23707807, -0.20938063, -0.0772852 , -0.53230161, -0.02960142,
         0.66761929,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.12713086, -0.57153706, -1.16282788,  0.17340082,  0.25482355,
         0.1848901 ,  0.21707842, -1.23022524, -0.09631827, -0.02512953,
        -0.00953592,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ]])

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
svm_reg = SVR()

# Candidate settings: a linear kernel tuned over C only (5 candidates) and an
# RBF kernel tuned over C and gamma (5 * 4 = 20 candidates).
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100, 1000]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.1, 0.01, 0.001, 0.0001]}
]

# 5-fold cross-validation scored with negated MSE (higher is better);
# n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
# The target is a one-column DataFrame; flatten it to the 1-D array SVR expects.
grid_search.fit(train_prepared, train_set_output.values.ravel())
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)
# The test set is deliberately not used here: the best estimator is pulled
# from grid_search and scored once, in the evaluation cell.



Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best Hyperparameters: {'C': 1000, 'kernel': 'linear'}


In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt


# Score the tuned model on the held-out test set.
best_svm_reg = grid_search.best_estimator_
test_predictions = best_svm_reg.predict(test_prepared)

# Take the square root of the MSE so the error is in the target's own units.
mse = mean_squared_error(y_true=test_set_output, y_pred=test_predictions)
rmse = sqrt(mse)

print("RMSE:", rmse)


RMSE: 71636.67736418292
