## Import data

In [1]:
import os
import pandas as pd
HOUSING_PATH = "datasets/housing"
def load_housing_data(housing_path=None):
    """Read ``housing.csv`` from a directory into a pandas DataFrame.

    Parameters
    ----------
    housing_path : str or path-like, optional
        Directory that contains ``housing.csv``. When omitted, the
        module-level ``HOUSING_PATH`` is used (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``housing.csv``.
    """
    if housing_path is None:
        housing_path = HOUSING_PATH
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

# Load the full dataset into a pandas DataFrame named `housing`
housing = load_housing_data()

## Stratified sampling

In [2]:
import numpy as np

# Bucket median income into categories and cap every category >= 5 at 5.0.
# The capped Series is assigned back explicitly: calling .where(..., inplace=True)
# on housing["income_cat"] is a chained in-place update, which does not modify
# `housing` once pandas Copy-on-Write is active.
income_cat = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"] = income_cat.where(income_cat < 5, 5.0)

# Stratified sampling on the income category (single 80/20 split, fixed seed)
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional row indices, hence .iloc.
    # Dropping "income_cat" here returns new frames, so nothing is mutated in
    # place and no loop variable has to shadow the builtin `set`.
    strat_train_set = housing.iloc[train_index].drop(columns=["income_cat"])
    strat_test_set = housing.iloc[test_index].drop(columns=["income_cat"])

In [3]:
# Work on a copy of the stratified training set so strat_train_set itself stays untouched.
# NOTE: `housing` is rebound again in the next cell, when the label column is dropped.
housing = strat_train_set.copy()

## Data Cleaning

In [4]:
# Separate the features from the target: "median_house_value" is the label to
# predict, so it is kept on its own and removed from the feature frame.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns=["median_house_value"])

In [5]:
# Medians are computed on the numeric columns only, so the text attribute
# "ocean_proximity" is dropped before the imputer is fitted.
from sklearn.impute import SimpleImputer
housing_num = housing.drop(columns="ocean_proximity")
imputer = SimpleImputer(strategy="median")
# Learn the per-column medians and fill the missing values with them.
X = imputer.fit_transform(housing_num)

In [7]:
# Encode the text attribute "ocean_proximity" to binary values.
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelBinarizer

# MyLabelBinarizer wraps LabelBinarizer so that it can be used as a step of the
# full pipeline without raising an error there.
class MyLabelBinarizer(TransformerMixin):
    """Transformer-style wrapper around ``LabelBinarizer``.

    All constructor arguments are forwarded unchanged to ``LabelBinarizer``.
    ``fit`` and ``transform`` accept an optional ``y`` that is ignored; it
    defaults to ``None`` like the other transformers in this notebook.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=None):
        """Fit the wrapped encoder on ``x`` and return ``self``."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=None):
        """Return the binarized encoding of ``x`` from the wrapped encoder."""
        return self.encoder.transform(x)

# Stand-alone binarization of "ocean_proximity", done outside the pipeline.
# NOTE: `encoder` is fitted here and read again later, via `encoder.classes_`,
# when the feature importances are labelled.
housing_cat = housing["ocean_proximity"]
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       ...,
       [0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0]])

In [8]:
# Create a custom transformer that adds combined attributes (bedrooms_per_room is optional)
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions within X of the attributes that are combined below.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns to a 2-D array.

    Always appends rooms-per-household and population-per-household; when
    ``add_bedrooms_per_room`` is true, bedrooms-per-room is appended as a
    third extra column.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Stateless transformer: there is nothing to learn.
        return self

    def transform(self, X, y=None):
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # Indexing np.c_ with a tuple is the same as writing np.c_[X, col1, col2, ...].
        return np.c_[(X, *extra_columns)]

In [9]:
# Apply the custom transformer directly to the raw values of `housing`.
# With add_bedrooms_per_room=False only the two per-household ratios are appended.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [10]:
# Convert Pandas DataFrames to NumPy Array for Scikit-Learn to handle
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the given columns out of a DataFrame and return them as a NumPy array."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless: selection needs no fitting.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values

## The whole pipeline to preprocess the data

In [11]:
# The whole pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numerical branch: select columns -> impute medians -> add ratio attributes -> z-score scaling
num_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(num_attribs)),
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

# Text branch: select the text column -> transform it to binary indicator columns
cat_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(cat_attribs)),
    ("label_binarizer", MyLabelBinarizer()),
])

# Combine the two branches: numerical output first, categorical output second
full_pipeline = FeatureUnion(
    transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]
)

In [12]:
# Fit the full pipeline on the training features and transform them in one pass
housing_prepared=full_pipeline.fit_transform(housing)
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

## Select and Train models

**Random Forest Regressor** was found to work best, compared with **Linear Regression** and **Decision Tree Regressor**.

In [13]:
# Train a random forest regressor model on the prepared training data.
# NOTE(review): no random_state is passed, so the fitted forest (and every score
# derived from it) can differ between runs — consider fixing a seed.
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

RandomForestRegressor()

In [14]:
# RMSE of the random forest on the very data it was trained on.
# This is an in-sample error; compare it with the cross-validated RMSE computed next.
from sklearn.metrics import mean_squared_error
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

18743.062606634485

In [15]:
# 10-fold cross-validation of the random forest regressor
from sklearn.model_selection import cross_val_score
scores = cross_val_score(
    forest_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer returns negated MSE values, so flip the sign before taking the root.
forest_rmse_scores = np.sqrt(-scores)

In [16]:
# Display the scores
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods.
    """
    print("Scores:", scores)
    # Each statistic is looked up and computed only when its line is printed.
    for label, method_name in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, method_name)())

# Summarise the 10-fold cross-validation RMSEs of the random forest
display_scores(forest_rmse_scores)

Scores: [49438.73796543 47694.77847793 50067.05396792 51802.82440341
 49395.38235167 53179.61785841 48697.52617636 48133.77775024
 52699.93659918 50151.31839433]
Mean: 50126.09539448796
Standard deviation: 1780.2259290693362


## Fine tune the model

In [17]:
# Grid search over random-forest hyperparameters (not over input features):
#   first grid : 3 n_estimators x 4 max_features = 12 combinations
#   second grid: 2 n_estimators x 3 max_features =  6 combinations, with bootstrap=False
from sklearn.model_selection import GridSearchCV
param_grid = [
     {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
     {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
     ]

forest_reg = RandomForestRegressor()

# Each of the 18 combinations is scored with 5-fold cross-validation
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             scoring='neg_mean_squared_error')

In [18]:
# Print the cross-validated RMSE of each of the 18 parameter combinations
cvres = grid_search.cv_results_
rmse_per_trial = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(rmse_per_trial, cvres["params"]):
    print(rmse, params)

# Only the last expression of a cell is rendered, so best_params_ is evaluated
# here without being displayed; best_estimator_ is what appears in the output.
grid_search.best_params_

grid_search.best_estimator_

63996.42031670111 {'max_features': 2, 'n_estimators': 3}
55382.27392107797 {'max_features': 2, 'n_estimators': 10}
53089.86076658032 {'max_features': 2, 'n_estimators': 30}
61600.72313352143 {'max_features': 4, 'n_estimators': 3}
53144.33440366666 {'max_features': 4, 'n_estimators': 10}
50353.971614412934 {'max_features': 4, 'n_estimators': 30}
59028.866781527584 {'max_features': 6, 'n_estimators': 3}
52207.13364267813 {'max_features': 6, 'n_estimators': 10}
50124.34007816165 {'max_features': 6, 'n_estimators': 30}
58329.99478766437 {'max_features': 8, 'n_estimators': 3}
52389.19454296688 {'max_features': 8, 'n_estimators': 10}
49918.45430227175 {'max_features': 8, 'n_estimators': 30}
63343.04748405627 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54069.767647860506 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59914.45584325411 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
53253.96803854077 {'bootstrap': False, 'max_features': 3, 'n_estimators':

RandomForestRegressor(max_features=8, n_estimators=30)

## Analyze the best model and its errors

In [19]:
# Pair each feature importance with its column name, most important first.
# The name order mirrors the pipeline output: the numeric columns, then the
# attributes appended by CombinedAttributesAdder, then the binarized categories.
# NOTE(review): `encoder` is the stand-alone LabelBinarizer fitted earlier, not the
# one inside full_pipeline; the category order is assumed to match — confirm.
feature_importances = grid_search.best_estimator_.feature_importances_
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.3493072586136655, 'median_income'),
 (0.18935009400466538, 'INLAND'),
 (0.11282515458312607, 'pop_per_hhold'),
 (0.06596868053145558, 'longitude'),
 (0.06108288002145537, 'latitude'),
 (0.0595204455192039, 'rooms_per_hhold'),
 (0.052624673263719564, 'bedrooms_per_room'),
 (0.043538273462270266, 'housing_median_age'),
 (0.015916078195180077, 'total_rooms'),
 (0.014913039390783828, 'population'),
 (0.01474036012517624, 'total_bedrooms'),
 (0.01308212710098051, 'households'),
 (0.0031168965689471776, 'NEAR OCEAN'),
 (0.002371399619274514, '<1H OCEAN'),
 (0.0015365620823453994, 'NEAR BAY'),
 (0.00010607691775060844, 'ISLAND')]

## Evaluate the System on the Test Set

In [20]:
# The best estimator found by the grid search is the final model
final_model = grid_search.best_estimator_

# Split the held-out test set into label and features
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns=["median_house_value"])

# Reuse the already-fitted pipeline: transform only, never re-fit on test data
X_test_prepared = full_pipeline.transform(X_test)

# Predict on the prepared test features
final_predictions = final_model.predict(X_test_prepared)

# RMSE between the true labels and the predictions
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

final_rmse

47747.40157214418