In [1]:
%%html
<style>
.container{width: 100%}
</style>

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import warnings 
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
import os
os.sys.path.insert(0, "../")

### Load Data

In [6]:
from tools import load_boston
data, desc = load_boston("../data_base")
data = data.rename(columns = {"target": "MEDV"})
features = data.drop("MEDV", axis = 1)
prices = data.MEDV

In [7]:
print(desc)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

### Feature Engineering

#### Train Test Splitting

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size = .3, random_state = 42)
X_select, X_train, y_select, y_train = train_test_split(features, prices, test_size = .4, random_state = 42)

#### DIS

In [9]:
from tools import cal_benchmark_perf
from sklearn.model_selection import train_test_split
from tools import cal_perf

ALPHA = np.power(10, np.linspace(-1.5, 0.5, 20))
param_grid = {
    "alpha": ALPHA
}

In [10]:
_X_train, _X_test, _y_train, _y_test = train_test_split(X_select, y_select, test_size = .5, random_state = 42)
before_perf = cal_perf(_X_train, _y_train, _X_test, _y_test, param_grid, return_grid = False)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.6s finished


In [11]:
def add_logged_dis(features):
    features["logged_dis"] = np.log(features.DIS)
    return

add_logged_dis(X_select)

In [12]:
_X_train, _X_test, _y_train, _y_test = train_test_split(X_select, y_select, test_size = .5, random_state = 42)
after_perf = cal_perf(_X_train, _y_train, _X_test, _y_test, param_grid, return_grid = False)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished


In [13]:
print(after_perf / before_perf)

0.8589130044539982


#### Add Features

In [14]:
for X in [X_select, X_train, X_test]:
    add_logged_dis(X)

#### Calculate Benchmark

In [15]:
from tools import cal_benchmark_perf
benchmark = cal_benchmark_perf(X_train, y_train, X_test, y_test)

##### Lasso

In [16]:
ALPHA = np.power(10, np.linspace(-1.5, 0.5, 20))
param_grid = {
    "alpha": ALPHA
}

In [17]:
from tools import cal_perf
mod_perf, grid = cal_perf(X_train, y_train, X_test, y_test, param_grid, return_grid=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished


In [18]:
grid.best_estimator_.coef_

array([-1.43203767e-01,  3.92145073e-02, -8.06737738e-02,  8.39251196e-01,
       -1.72424972e+01,  3.74004767e+00,  9.52801719e-03,  5.71369455e-01,
        3.80746994e-01, -1.90651849e-02, -8.84241179e-01,  6.18531597e-03,
       -5.01548006e-01, -9.22170237e+00])

In [19]:
grid.best_params_

{'alpha': 0.03162277660168379}