In [1]:
# Exercises for chapter two of Hands-On Machine Learning


In [2]:
#Housing dataset

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
import os

In [5]:
import sklearn

In [6]:
import seaborn as sb

In [7]:
import tarfile
from six.moves import urllib

# Location of the archived housing dataset and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing tarball and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and its extracted contents;
        it is created (including parents) when it does not exist yet.

    Every call downloads the archive again and overwrites the previous copy.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only point this function at sources you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

fetch_housing_data()

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))


housing = load_housing_data()

In [8]:
#explore the data

In [9]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [10]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [11]:
#missing values for total_bedrooms (count is 20433 vs 20640 for the other columns)
#one categorical feature (ocean_proximity)
#features need to be combined to be useful: total_bedrooms, total_rooms, households

In [12]:
# Plain random split: hold out 20% of the rows as a test set (fixed seed for repeatability).
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)

In [13]:
#stratified sampling for median income
# Bucket median_income into five categories; the column exists only to drive the stratification.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so take the single (train, test) pair directly instead of looping.
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# split() yields positional row numbers, so select with .iloc; .loc would only
# be right while the index happens to be 0..n-1. The explicit copies make the
# two subsets independent frames that later cells can modify safely.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [14]:
strat_train_set.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'income_cat'],
      dtype='object')

In [15]:
# Remove the helper stratification column from both subsets.
# Rebinding (rather than inplace=True) plus errors="ignore" keeps this cell
# idempotent: running it a second time no longer raises KeyError.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [16]:
strat_train_set.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [17]:
housing = strat_train_set.copy()

In [18]:
# Try out a few ratio features derived from the raw count columns.
housing = housing.assign(
    rooms_per_household=housing["total_rooms"] / housing["households"],
    bedrooms_per_room=housing["total_bedrooms"] / housing["total_rooms"],
    population_per_household=housing["population"] / housing["households"],
)

In [19]:
# Start again from the clean training set: predictors in `housing`, target in `housing_labels`.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns="median_house_value")

In [20]:
# Fill in missing numeric values with the per-column median.
from sklearn.impute import SimpleImputer

# The text column is set aside so only the remaining columns reach the imputer.
housing_num = housing.drop(columns="ocean_proximity")

imputer = SimpleImputer(strategy="median")
# fit() returns the imputer itself, so the fitted object stays available in
# `imputer` and can be reused on other data later.
X = imputer.fit(housing_num).transform(housing_num)

# Wrap the resulting array back into a DataFrame with the original labels.
housing_tr = pd.DataFrame(
    X,
    index=housing_num.index,
    columns=housing_num.columns,
)

In [21]:
#process categorical feature
# Double brackets keep a 2-D DataFrame, which is what the encoder is fitted on.
housing_cat = housing[["ocean_proximity"]]

from sklearn.preprocessing import OneHotEncoder

# Request a dense array. The keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was later removed,
# so try the new spelling first and fall back for older installations.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

cat_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [None]:
# Make custom transformers

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin

In [26]:
#number/index the columns
# Look the positions up by name instead of hard-coding 3, 4, 5, 6, so they
# stay correct if the column order of `housing` ever changes.
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    housing.columns.get_loc(name)
    for name in ("total_rooms", "total_bedrooms", "population", "households")
]
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'],
      dtype='object')

In [28]:
# Custom transformer implementing fit() and transform(); fit_transform() is not
# defined in this class.
# Always appends rooms_per_household and population_per_household;
# optionally appends bedrooms_per_room.
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features as extra columns on the right of a 2-D array.

    The source columns are located through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a third column (bedrooms / rooms) is appended after the
        two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended.

        ``X`` may be a NumPy array or anything ``np.asarray`` can convert
        (for example a DataFrame); the result is always built with ``np.c_``.
        """
        # Positional slicing below needs an ndarray; a DataFrame would reject X[:, i].
        X = np.asarray(X)
        # Average number of rooms per household.
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        # Average number of people per household.
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [30]:
#initialize and utilize the class object
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [31]:
# Re-wrap the transformed array as a DataFrame: original column names first,
# then the two ratio columns the transformer appended.
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    index=housing.index,
    columns=[*housing.columns, "rooms_per_household", "population_per_household"],
)
housing_extra_attribs.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household
17606,-121.89,37.29,38,1568,351,710,339,2.7042,<1H OCEAN,4.62537,2.0944
18632,-121.93,37.05,14,679,108,306,113,6.4214,<1H OCEAN,6.00885,2.70796
14650,-117.2,32.77,31,1952,471,936,462,2.8621,NEAR OCEAN,4.22511,2.02597
3230,-119.61,36.31,25,1847,371,1460,353,1.8839,INLAND,5.23229,4.13598
3555,-118.59,34.23,17,6592,1525,4459,1463,3.0347,<1H OCEAN,4.50581,3.04785


# preprocess numerical attributes

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [33]:
# Numeric preprocessing pipeline; the steps run in the order listed:
#   1. imputer       - replace missing values with the column median
#   2. attribs_adder - append the combined ratio attributes
#   3. std_scaler    - standardize every column

num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

In [34]:
#run pipeline
housing_num_tr = num_pipeline.fit_transform(housing_num)

(16512, 8)

In [35]:
from sklearn.compose import ColumnTransformer

In [38]:
# Column names routed to each branch of the preprocessing.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

# Numeric columns go through num_pipeline; the categorical column is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

# Fit on the training predictors and produce the combined, preprocessed feature array.
housing_prepared = full_pipeline.fit_transform(housing)

In [39]:
(housing_prepared.shape,housing.shape)

((16512, 16), (16512, 9))

In [40]:
housing_labels.shape

(16512,)

# exercises 

#1 Try an SVM with various hyperparameters for kernel (linear vs rbf), C, gamma. How does the best SVR perform? 

In [43]:
#load metric: RMSE MAE
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn import svm

In [None]:
#SV_Reg = svm.SVR(kernel='rbf', C=1.0, gamma='scale')  # hyperparameters belong to the constructor, not fit()
#SV_Reg.fit(housing_prepared, housing_labels)

In [None]:
#SV_Reg = svm.SVR(kernel='linear', C=1.0)  # hyperparameters belong to the constructor, not fit()
#SV_Reg.fit(housing_prepared, housing_labels)

In [44]:
from sklearn.model_selection import GridSearchCV

In [45]:
# Search space for the grid search: one sub-grid per kernel. Both sweep the
# same C values; only the rbf kernel additionally varies gamma.
param_grid = [
    dict(kernel=['linear'], C=[0.1, 0.5, 1.0, 2.0, 5]),
    dict(kernel=['rbf'], C=[0.1, 0.5, 1.0, 2.0, 5], gamma=['scale', 'auto']),
]

In [50]:
# Base estimator whose hyperparameters are filled in from param_grid.
SV_Reg = svm.SVR()

# 5-fold cross-validated search scored by negated MSE; training-fold scores
# are kept alongside the validation scores.
grid_search = GridSearchCV(
    estimator=SV_Reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

# Run the search on the prepared training data.
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid=[{'C': [0.1, 0.5, 1.0, 2.0, 5], 'kernel': ['linear']},
                         {'C': [0.1, 0.5, 1.0, 2.0, 5],
                          'gamma': ['scale', 'auto'], 'kernel': ['rbf']}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [51]:
#return best parameters
grid_search.best_params_

{'C': 5, 'kernel': 'linear'}

In [None]:
# Report the cross-validated RMSE for every hyperparameter combination tried.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    # The search was scored with 'neg_mean_squared_error', so the stored value
    # is the negated MSE: flip the sign before taking the square root.
    print(np.sqrt(-mean_score), params)