# Get the Data

## Download the Data

In [1]:
import os
import tarfile
import urllib

# Remote location of the housing archive and the local directory it is
# downloaded and unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and the extracted members.
        It is created (including parents) when it does not exist yet.

    Returns
    -------
    None
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly before using it.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, reject members that would be
            # written outside ``housing_path``.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [2]:
# Download and unpack the dataset using the default URL and target directory.
fetch_housing_data()

In [3]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [4]:
# Load the extracted CSV from the default dataset directory.
housing = load_housing_data()

In [5]:
# Preview the first rows to check that the columns were read as expected.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# Summarise dtypes and non-null counts per column; in the recorded output
# total_bedrooms has fewer non-null entries than the other columns.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [7]:
# Keep the full frame (target column included) reachable under a second name
# for the later train/test split. This is an alias of the same object, not a
# copy.
final_housing = housing

## Transformation Pipelines

In [8]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Feature frame: every column except the target. It is derived from
# `final_housing` (the full frame) rather than from `housing` itself so the
# cell can be re-run safely; a self-referential `housing = housing.drop(...)`
# raises KeyError on a second execution because the column is already gone.
housing = final_housing.drop("median_house_value", axis=1)

In [10]:
# Split the features by kind: ocean_proximity is the only text column, the
# remaining ones are numeric.
housing_cat = housing[["ocean_proximity"]]
housing_num = housing.drop(columns="ocean_proximity")

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions read by CombinedAttributesAdder.transform. They match
# total_rooms, total_bedrooms, population and households in the column order
# reported by housing.info().
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio attributes as extra columns.

    The input is indexed as a 2-D array using the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix`` positions.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a bedrooms-per-room column is appended after the two
        per-household columns.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right."""
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [
            rooms / households,                # rooms per household
            X[:, population_ix] / households,  # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room
        return np.c_[(X, *extra_columns)]

# Try the transformer on its own, without the optional bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# fit() is a no-op for this transformer, so transform() is called directly.
# `housing.values` still includes the text ocean_proximity column here; the
# transformer only reads the numeric positions defined by the *_ix constants.
housing_extra_attribs = attr_adder.transform(housing.values)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values with the column median, append
# the ratio attributes, then scale the result.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

In [13]:
from sklearn.compose import ColumnTransformer

# Route columns by kind: the numeric ones go through `num_pipeline`, the
# categorical one through a one-hot encoder.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

## Splitting data into train and test datasets

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Separate the target from the features, then hold out a test set. The fixed
# random_state keeps the split repeatable.
y = final_housing['median_house_value']
X = final_housing.drop(columns='median_house_value')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

## Building Pipeline

### Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression
# Baseline model: the shared preprocessing followed by linear regression.
pipe = Pipeline([
    ("preprocessor", full_pipeline),
    ("regressor", LinearRegression()),
])
# fit() returns the pipeline itself, so the training-set score is chained on.
pipe.fit(X_train, y_train).score(X_train, y_train)

0.6548106094521062

### Decision Tree

In [17]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree on top of the shared preprocessing. The estimator is seeded
# (same seed value as the train/test split) so that re-running the notebook
# builds the same tree.
pipe = Pipeline(steps=[
    ('preprocessor', full_pipeline),
    ('regressor', DecisionTreeRegressor(random_state=0))])
pipe.fit(X_train, y_train)
# Scored on the data the model was trained on, so a very high value here
# likely reflects overfitting rather than real predictive quality.
pipe.score(X_train, y_train)

1.0

### Random Forest

In [18]:
from sklearn.ensemble import RandomForestRegressor
# Random forest on top of the shared preprocessing. The forest is seeded
# (same seed value as the train/test split) so that this score, and the grid
# search that reuses `pipe` afterwards, are repeatable across runs.
pipe = Pipeline(steps=[
    ('preprocessor', full_pipeline),
    ('regressor', RandomForestRegressor(random_state=0))])
pipe.fit(X_train, y_train)
# Scored on the training data, so this is an optimistic estimate.
pipe.score(X_train, y_train)

0.9736799554458297

## Using Pipelines in Grid Searches

In [19]:
from sklearn.model_selection import GridSearchCV

# Two search spaces for the pipeline's `regressor` step: the first leaves
# `bootstrap` untouched, the second turns it off over a smaller grid.
param_grid = [
    dict(regressor__n_estimators=[3, 10, 30],
         regressor__max_features=[2, 4, 6, 8]),
    dict(regressor__bootstrap=[False],
         regressor__n_estimators=[3, 10],
         regressor__max_features=[2, 3, 4]),
]

In [20]:
# 5-fold grid search over the whole pipeline (preprocessing + regressor).
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
# `best_score_` is the mean cross-validated score of a regressor, not a
# classification accuracy, so it is reported as a score.
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))

Best cross-validation accuracy: 0.81
Test set score: 0.81
Best parameters: {'regressor__max_features': 8, 'regressor__n_estimators': 30}


## Convenient Pipeline Creation with make_pipeline

### Support Vector Machine Regressor

In [21]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

In [22]:
# make_pipeline takes the steps without explicit names. `memory` points the
# pipeline at the "cache_folder" directory (see the scikit-learn Pipeline
# documentation for what gets cached there).
pipe = make_pipeline(full_pipeline,SVR(),memory="cache_folder")

In [23]:
# Fit the SVR pipeline and report its score on the training data; fit()
# returns the pipeline, so both steps are chained.
pipe.fit(X_train, y_train).score(X_train, y_train)

-0.04702238096456268

## Grid-Searching Which Model To Use

In [24]:
# Template pipeline for the model search. The regressor given here is only a
# starting value whenever the search grid supplies its own 'regressor'
# candidates.
pipe = Pipeline(
    steps=[
        ('preprocessor', full_pipeline),
        ('regressor', RandomForestRegressor()),
    ],
    memory="cache_folder",
)

In [25]:
# Search space covering two model families. Each entry swaps the pipeline's
# 'regressor' step for the listed estimator and tunes that estimator's own
# hyperparameters. The random forests are seeded (same seed value as the
# train/test split) so that the selected model is repeatable across runs.
param_grid = [
    # Random forest with the default bootstrap setting.
    {'regressor': [RandomForestRegressor(random_state=0)],
     'regressor__n_estimators': [3, 10, 30],
     'regressor__max_features': [2, 4, 6, 8]},
    # Random forest with bootstrapping turned off, over a smaller grid.
    {'regressor': [RandomForestRegressor(random_state=0)],
     'regressor__bootstrap': [False],
     'regressor__n_estimators': [3, 10],
     'regressor__max_features': [2, 3, 4]},
    # Support vector regression, linear kernel.
    {'regressor': [SVR()],
     'regressor__kernel': ['linear'],
     'regressor__C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
    # Support vector regression, RBF kernel.
    {'regressor': [SVR()],
     'regressor__kernel': ['rbf'],
     'regressor__C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
     'regressor__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]}]

In [26]:
# 5-fold search across every candidate model and hyperparameter combination.
# fit() returns the fitted search object, so construction and fitting chain.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)

# Report the winning configuration, its cross-validated score and its score
# on the held-out test set.
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'regressor': RandomForestRegressor(max_features=8, n_estimators=30), 'regressor__max_features': 8, 'regressor__n_estimators': 30}

Best cross-validation score: 0.81
Test-set score: 0.81
