In [None]:
from pathlib import Path
import os

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cartopy.io.shapereader as shpreader
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
%matplotlib inline

In [None]:
SEED = 51  # Random seed for reproducible results (used by the train/test split)

# The datasets root directory is read from the environment. Check it
# explicitly: os.environ.get() returns None when the variable is unset, and
# Path(None) would fail with a TypeError that never names the missing variable.
_datasets_base = os.environ.get('BOOK_FILES_DATASETS_BASE')
if _datasets_base is None:
    raise RuntimeError(
        'Environment variable BOOK_FILES_DATASETS_BASE is not set; '
        'point it at the directory that contains housing/housing.csv.'
    )

BOOK_FILES_DATASETS_BASE = Path(_datasets_base)
HOUSING_DATA_PATH = BOOK_FILES_DATASETS_BASE/'housing/housing.csv'

In [None]:
def load_housing_data(housing_data_path=HOUSING_DATA_PATH):
    """Read the housing CSV into a DataFrame.

    Parameters
    ----------
    housing_data_path : path-like
        Location of the CSV file; defaults to ``HOUSING_DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    housing_frame = pd.read_csv(housing_data_path)
    return housing_frame

In [None]:
# Load the raw housing table from its configured location.
df_housing = load_housing_data(housing_data_path=HOUSING_DATA_PATH)

# EDA

In [None]:
# Peek at the first five rows.
df_housing.head(n=5)

In [None]:
# Print the per-column summary (dtypes and non-null counts) of the raw frame.
df_housing.info()

In [None]:
# How many rows fall into each ocean_proximity value.
df_housing['ocean_proximity'].value_counts()

In [None]:
# Descriptive statistics of the raw frame's columns.
df_housing.describe()

In [None]:
# Histogram grid of the raw frame, 50 bins per plot.
df_housing.hist(
    bins=50,
    figsize=(20, 15),
    grid=False,
)
plt.show()

In [None]:
# Bucket median_income into five labelled ranges; the resulting income_cat
# column is what the train/test split stratifies on.
df_housing['income_cat'] = pd.cut(
    df_housing['median_income'],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=list(range(1, 6)),
)

In [None]:
# Distribution of rows over the income categories.
df_housing['income_cat'].hist(grid=False)
plt.show()

In [None]:
# Single stratified 80/20 split that preserves the income_cat proportions.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SEED)

# n_splits=1, so take the one (train, test) pair directly instead of looping.
train_indices, test_indices = next(splitter.split(df_housing, df_housing.income_cat))

# split() yields *positional* row numbers, so select with .iloc. Label-based
# .loc only picks the same rows while the index is exactly 0..n-1.
df_housing_train = df_housing.iloc[train_indices]
df_housing_test = df_housing.iloc[test_indices]

In [None]:
def get_income_cat_distribution(df):
    """Return, per ``income_cat`` value, its count divided by the row count of ``df``."""
    n_rows = len(df.index)
    category_counts = df.income_cat.value_counts()
    return category_counts / n_rows


# Side-by-side income_cat proportions for the full data and both splits.
pd.DataFrame({
    column_name: get_income_cat_distribution(frame)
    for column_name, frame in (
        ('raw', df_housing),
        ('train', df_housing_train),
        ('test', df_housing_test),
    )
})

In [None]:
# income_cat was only needed for stratification; remove it from both splits.
df_housing_train = df_housing_train.drop(columns='income_cat')
df_housing_test = df_housing_test.drop(columns='income_cat')

In [None]:
# Independent copy of the training split, for messing around with.
df_housing_train_copy = df_housing_train.copy(deep=True)

In [None]:
# Map of the training districts: marker colour encodes median_house_value,
# marker size is population / 100.
fig = plt.figure(figsize=(10,10))

ax = fig.add_axes([0,0,1,1], projection=ccrs.LambertConformal(), frameon=False)

# Bounding box given as [x0, x1, y0, y1].
ax.set_extent([-125, -113, 32, 43])
ax.add_feature(cfeature.OCEAN)
ax.add_feature(cfeature.LAND)
ax.add_feature(cfeature.RIVERS)
ax.add_feature(cfeature.LAKES)

# Natural Earth state/province outlines, used to highlight California.
shapename = 'admin_1_states_provinces'
states_shp = shpreader.natural_earth(
    resolution='50m',
    category='cultural',
    name=shapename,
)
reader = shpreader.Reader(states_shp)
states = reader.records()

for state in states:
    if state.attributes['name'] == 'California':
        # add_geometries expects an iterable of geometries; wrap the single
        # geometry in a list so this does not depend on the geometry object
        # itself being iterable.
        ax.add_geometries([state.geometry], ccrs.PlateCarree(), color='w', alpha=0.5)

ax.scatter(
    df_housing_train_copy.longitude,
    df_housing_train_copy.latitude,
    c=df_housing_train_copy.median_house_value,
    s=df_housing_train_copy.population/100,
    alpha=0.3,
    transform=ccrs.PlateCarree(),  # longitude/latitude are plain lon/lat values
    zorder=2,
)
# Render the figure explicitly, as the other plotting cells do.
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of the training split.
# Non-numeric columns (ocean_proximity holds strings) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently and corr() raises on them.
housing_train_corr_matrix = df_housing_train.select_dtypes(include='number').corr()
housing_train_corr_matrix

In [None]:
# Correlation of every column with the target, strongest positive first.
housing_train_corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Columns to inspect pairwise in a scatter-plot grid.
potential_features = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
]
scatter_matrix(df_housing_train.loc[:, potential_features], figsize=(12, 8))
plt.show()

In [None]:
# Scatter plot of median_income against median_house_value.
df_housing_train.plot.scatter(x='median_income', y='median_house_value', alpha=0.3)

In [None]:
# Candidate ratio features. The first column name was misspelled
# ('rooms_per_houseold'); it is now 'rooms_per_household'.
# NOTE(review): these columns are added to the full frame, which also contains
# the test rows; df_housing_train_copy may have been the intended target.
df_housing['rooms_per_household'] = df_housing.total_rooms / df_housing.households
df_housing['bedrooms_per_room'] = df_housing.total_bedrooms / df_housing.total_rooms
df_housing['population_per_household'] = df_housing.population / df_housing.households

In [None]:
# Correlation matrix over the numeric columns only: df_housing also holds the
# string column ocean_proximity and the categorical income_cat, and corr()
# raises on non-numeric data in pandas >= 2.0.
df_housing_corr_matrix = df_housing.select_dtypes(include='number').corr()
# Rank columns by the absolute strength of their correlation with the target.
df_housing_corr_matrix.median_house_value.abs().sort_values(ascending=False)

# Modelling

In [None]:
# Separate the target column from the training features.
df_housing_train_labels = df_housing_train['median_house_value']
df_housing_train = df_housing_train.drop('median_house_value', axis=1)

In [None]:
# Training features without ocean_proximity.
df_housing_train_numerical = df_housing_train.drop(columns='ocean_proximity')

# Fit a median imputer on those columns.
imputer = SimpleImputer(strategy='median')
imputer.fit(df_housing_train_numerical)

In [None]:
# Apply the fitted imputer, then restore the original row and column labels
# on the transformed values.
X = imputer.transform(df_housing_train_numerical)
df_housing_train_numerical_imputed = pd.DataFrame(
    X,
    index=df_housing_train_numerical.index,
    columns=df_housing_train_numerical.columns,
)

In [None]:
# Keep ocean_proximity as a one-column DataFrame (not a Series).
df_housing_train_categorical = df_housing_train.loc[:, ['ocean_proximity']]

In [None]:
# First five rows of the categorical subset.
df_housing_train_categorical.head(n=5)

In [None]:
one_hot_encoder = OneHotEncoder()
# categories_ holds one array per encoded input column. Only ocean_proximity
# is encoded here, so use its array as the column labels. Passing the whole
# list would make pandas build a MultiIndex instead of plain labels.
df_housing_train_categorical_one_hot_encoded = pd.DataFrame(
    one_hot_encoder.fit_transform(df_housing_train_categorical).toarray().astype('uint'),
    columns=one_hot_encoder.categories_[0],
    index=df_housing_train_categorical.index,
)
df_housing_train_categorical_one_hot_encoded

In [None]:
# Column positions read by CombinedAttributesAdder.transform.
# NOTE(review): presumably these match the column order of
# df_housing_train_numerical - confirm if the columns ever change.
ROOMS_INDEX = 3
BEDROOMS_INDEX = 4
POPULATION_INDEX = 5
HOUSEHOLDS_INDEX = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as well when ``add_bedrooms_per_room`` is true.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, HOUSEHOLDS_INDEX]
        rooms = X[:, ROOMS_INDEX]

        # Order of the appended columns: rooms/household, population/household,
        # then (optionally) bedrooms/room.
        extra_columns = [
            rooms / households,
            X[:, POPULATION_INDEX] / households,
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, BEDROOMS_INDEX] / rooms)

        return np.c_[(X, *extra_columns)]

In [None]:
# Column groups routed to each preprocessing branch.
numerical_attributes = df_housing_train_numerical.columns
categorical_attributes = df_housing_train_categorical.columns

# Numerical branch: fill missing values, append ratio features, standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attributes_adder', CombinedAttributesAdder()),
    ('standard_scaler', StandardScaler()),
])

# Categorical branch. handle_unknown='ignore' keeps transform() from raising
# when a category is absent from the data the encoder was fitted on (for
# example a cross-validation training fold) but appears in the data it is
# later applied to.
categorical_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessing_pipeline = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_attributes),
    ('categorical', categorical_pipeline, categorical_attributes),
])

# Seed the forest so repeated runs of the notebook give the same model.
end_to_end_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('model', RandomForestRegressor(random_state=SEED)),
])

In [None]:
# Two sub-grids. Keys use the pipeline's step__parameter naming.
param_grid = [
    # Forest size x number of features considered per split.
    {
        'model__n_estimators': [3, 10, 30],
        'model__max_features': [2, 4, 6, 8],
    },
    # No bootstrapping, and without the bedrooms_per_room feature.
    {
        'model__bootstrap': [False],
        'model__n_estimators': [3, 10],
        'preprocessing__numerical__attributes_adder__add_bedrooms_per_room': [False],
    },
]

# 5-fold cross-validated search scored by negative mean squared error.
grid_search = GridSearchCV(
    estimator=end_to_end_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

In [None]:
# Run the search on the training features and labels, then show the winner.
grid_search.fit(X=df_housing_train, y=df_housing_train_labels)
grid_search.best_estimator_