Boston Housing Dataset Test
===========================

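In this notebook we test revrand's ARD basis functions on the Boston housing dataset, comparing revrand's standard and generalized linear models against Gaussian process, random forest and support vector regression baselines under k-fold cross-validation.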

In [58]:
from functools import partialmethod

import numpy as np
from scipy.stats import gamma

from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, DotProduct
from sklearn.svm import SVR

from revrand import StandardLinearModel, GeneralizedLinearModel
import revrand.basis_functions as bf
from revrand.likelihoods import Gaussian
from revrand.metrics import smse, msll
from revrand.btypes import Parameter, Positive, Bound
from revrand.optimize import Adam

# Emit INFO-level log records so that revrand's optimisation progress is
# reported while the models below are being fitted.
# `logging` must be imported here: the call below raises NameError otherwise.
import logging

logging.basicConfig(level=logging.INFO)

## Settings

In [59]:
# Dataset to benchmark on. To use the diabetes data instead:
#     data = load_diabetes()
# NOTE(review): load_boston is believed to have been removed from recent
# scikit-learn releases -- confirm the installed version still provides it.
data = load_boston()

# Cross-validation: number of folds and the seed shared by every estimator
folds = 5
random_state = 100

# Revrand
nbases = 800                      # number of random RBF bases
lenscale = gamma(1., scale=20)    # distribution used to initialise the length scales
maxiter = 5000                    # passed to the GLM as maxiter
nsamples = 50                     # passed to the GLM as nsamples
updater = Adam(epsilon=1e-8)      # stochastic optimiser handed to the GLM

# GP
length_scales = 1.                # initial RBF length scale (replicated per dimension)
n_restarts = 1                    # optimiser restarts for the GP hyperparameters

# Random Forest
n_estimators = 40

# SVR: grid searched over the 'svr' step of the pipeline
svr_params = dict(
    svr__gamma=np.logspace(-1, 2, num=20),
    svr__epsilon=np.logspace(-5, 0, num=10),
)

## Load data

In [60]:
# Features are used as-is; targets are centred on their mean over the full dataset
X, y = data.data, data.target - data.target.mean()

# Number of samples and number of input dimensions
N, D = X.shape

## Construct bases and kernels

In [61]:
# Basis functions: random RBF features whose length scale parameter has one
# positive entry per input dimension, concatenated with a linear basis
# (onescol=True).
lenscale_param = Parameter(lenscale, Positive(), shape=(D,))
rbf_basis = bf.RandomRBF(
    Xdim=D,
    nbases=nbases,
    lenscale_init=lenscale_param,
    random_state=random_state,
)
base = rbf_basis + bf.LinearBasis(onescol=True)

# GP kernel: scaled RBF with a separate length scale per dimension, plus
# dot-product and white-noise terms.
ard_rbf = RBF(
    length_scale=length_scales * np.ones(D),
    length_scale_bounds=(1e-2, 1e3),
)
kern = 1**2 * ard_rbf + DotProduct() + WhiteKernel()

## Construct Estimators

In [62]:
# Revrand
class SLM(StandardLinearModel):
    """StandardLinearModel whose ``predict`` returns the predictive moments."""

    def predict(self, X):
        """Return whatever ``predict_moments`` returns for the query inputs ``X``.

        The cross-validation loop unpacks the result as ``(mean, variance)``.
        """
        moments = super().predict_moments(X)
        return moments


slm = make_pipeline(
    StandardScaler(),
    SLM(base, random_state=random_state),
)

class GLM(GeneralizedLinearModel):
    """GeneralizedLinearModel whose ``predict`` returns mean and variance."""

    def predict(self, X):
        """Return ``(mean, variance)`` for the query inputs ``X``.

        The variance from ``predict_moments`` is incremented in place by
        ``self.like_hypers_`` -- presumably the fitted noise variance of the
        Gaussian likelihood (TODO confirm against revrand).
        """
        mean, variance = super().predict_moments(X)
        variance += self.like_hypers_
        return mean, variance


glm = make_pipeline(
    StandardScaler(),
    GLM(
        Gaussian(),
        base,
        random_state=random_state,
        nsamples=nsamples,
        maxiter=maxiter,
        updater=updater,
    ),
)

# Gaussian Process
class GP(GaussianProcessRegressor):
    """GaussianProcessRegressor whose ``predict`` always requests the std."""

    def predict(self, X):
        """Return the result of the parent ``predict`` with ``return_std=True``.

        The cross-validation loop unpacks the result as
        ``(mean, standard deviation)``.
        """
        prediction = super().predict(X, return_std=True)
        return prediction


gp = make_pipeline(
    StandardScaler(),
    GP(kernel=kern, n_restarts_optimizer=n_restarts, random_state=random_state),
)

# Random forest on standardised inputs
rf = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(n_estimators=n_estimators, random_state=random_state),
)

# SVR on standardised inputs, with gamma/epsilon chosen by grid search
# (n_jobs=-1 is passed to GridSearchCV)
svr = GridSearchCV(
    make_pipeline(StandardScaler(), SVR()),
    svr_params,
    n_jobs=-1,
)

## Validation

In [63]:
# Convenience functions
def score(y_true, y_pred, y_var, y_train, scores):
    """Evaluate one fold and append ``(SMSE, MSLL, R2)`` to ``scores``.

    Parameters
    ----------
    y_true : targets of the held-out fold.
    y_pred : predicted values for the held-out fold.
    y_var : predictive variance handed to ``msll``.
    y_train : training targets handed to ``msll``.
    scores : list that is extended in place with one 3-tuple.
    """
    fold_result = (
        smse(y_true, y_pred),
        msll(y_true, y_pred, y_var, y_train),
        r2_score(y_true, y_pred),
    )
    scores.append(fold_result)
    
    
def print_score(alg_name, scores):
    """Print the mean and standard deviation of each metric across folds.

    Parameters
    ----------
    alg_name : label printed on the first line.
    scores : sequence of ``(SMSE, MSLL, R2)`` tuples, one per fold.
    """
    table = np.array(scores)
    smse_col, msll_col, r2_col = table[:, 0], table[:, 1], table[:, 2]

    template = ("{}:\n\tR2 = {:.4f} ({:.4f}),\n\tSMSE = {:.4f} ({:.4f}),"
                "\n\tMSLL = {:.4f} ({:.4f})")

    # Report order is R2, SMSE, MSLL -- each as "mean (std)"
    summary = []
    for column in (r2_col, smse_col, msll_col):
        summary.append(np.mean(column))
        summary.append(np.std(column))

    print(template.format(alg_name, *summary))

In [64]:
# Cross val: each list collects one (SMSE, MSLL, R2) tuple per fold
slm_score, glm_score, gp_score, rf_score, svr_score = [], [], [], [], []

foldgen = KFold(n_splits=folds, shuffle=True, random_state=random_state)
for fold_no, (train_idx, test_idx) in enumerate(foldgen.split(X), start=1):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # Revrand: predict() on both wrappers is unpacked as (mean, variance)
    slm.fit(X_train, y_train)
    pred_mean, pred_var = slm.predict(X_test)
    score(y_test, pred_mean, pred_var, y_train, slm_score)

    glm.fit(X_train, y_train)
    pred_mean, pred_var = glm.predict(X_test)
    score(y_test, pred_mean, pred_var, y_train, glm_score)

    # GP: predict() is unpacked as (mean, std); square the std to get a variance
    gp.fit(X_train, y_train)
    pred_mean, pred_std = gp.predict(X_test)
    score(y_test, pred_mean, pred_std**2, y_train, gp_score)

    # RF and SVR give point predictions only, so the variance of the training
    # targets is passed to score() as a constant predictive variance
    train_var = y_train.var()

    rf.fit(X_train, y_train)
    pred_mean = rf.predict(X_test)
    score(y_test, pred_mean, train_var, y_train, rf_score)

    svr.fit(X_train, y_train)
    pred_mean = svr.predict(X_test)
    score(y_test, pred_mean, train_var, y_train, svr_score)

    print("Fold {}".format(fold_no))

INFO:revrand.optimize.decorators:Evaluating random starts...
INFO:revrand.slm:ELBO = -1487.3814613897607, var = 3.845476409979971, reg = 1.6696293576221093, bparams = [  3.77341773  33.88083457   6.40613699  11.30226385  56.27815668
  34.0364791    8.19283487   3.85739072   9.33081811   0.11409497
   5.8184491   31.75964564   0.30745051].
INFO:revrand.slm:ELBO = -3028.840266487172, var = 0.9134033467068555, reg = 0.9258476012266578, bparams = [  2.22193172   9.62350625   0.74315882  44.22046832  79.18319062
   1.23627383  44.24500809  17.2030053   27.13313369  19.89499045
  17.43792775   0.41301803   4.71511953].
INFO:revrand.slm:ELBO = -4983.079543278535, var = 0.7867655243193307, reg = 1.4658362690057338, bparams = [  5.77218949   6.73452471  38.26432174  73.78278464  43.2309651
   8.91036809  18.26884315   8.76376262   8.31607381   3.92226832
   5.42815006   0.91799484  14.08138943].
INFO:revrand.slm:ELBO = -8137.484805291579, var = 0.47200956666942107, reg = 0.898464077527026, bpar

Fold 1


INFO:revrand.slm:ELBO = -9425.116665541362, var = 0.24262295711093504, reg = 0.4121776859681468, bparams = [  3.50418394e-02   7.23497976e+00   1.60188513e+01   3.23744179e+00
   4.04134823e+01   1.30848467e+01   5.32931701e+00   4.00734265e-01
   6.94358098e+00   2.52074227e+00   4.66396626e+01   4.63858246e+00
   1.71277530e+01].
INFO:revrand.slm:ELBO = -2980.1112438306313, var = 1.1590130247265413, reg = 1.3574287334425805, bparams = [ 12.44763359  27.20859999   7.88619705   4.02898502   1.03687681
   5.82216514  52.92268359  23.41387929   7.68646568  13.53519126
   8.68927724  38.47835008   5.84139751].
INFO:revrand.slm:ELBO = -10237.706231104174, var = 0.33373424971853016, reg = 0.21167556338224736, bparams = [ 15.96575861  14.76131812   3.22542796  23.32740807  48.26593672
  13.13084561  17.12473403  16.37014351  41.20074159   0.27619267
   0.33558638  40.52839374  17.94398365].
INFO:revrand.slm:ELBO = -5752.409237488475, var = 0.8605263335941454, reg = 0.029103638715044372, bpar

Fold 2


INFO:revrand.slm:ELBO = -12796.720667410225, var = 0.22134688723763402, reg = 0.41372063838890644, bparams = [ 38.81643807  42.12346963  65.66779447   3.34224721  10.005692
   0.99174649  49.03635505   7.36376569  27.60624701  22.76385519
   3.49028287  37.0373912   20.77440126].
INFO:revrand.slm:ELBO = -6181.98262532504, var = 0.8820613892926031, reg = 0.03494877991076508, bparams = [   5.29580402    7.43341137   27.1071965    39.88644753   23.92480836
   14.15032124   61.35864108   27.26532421    9.5761381   115.68533611
   17.41960106   16.27331364   74.43293116].
INFO:revrand.slm:ELBO = -2944.3048993704797, var = 1.9198020576111403, reg = 0.14443572100663163, bparams = [  9.21485243   0.95970795  10.83571485   2.87483253   4.33626014
   8.56687749   4.21432509   0.58903109  16.08661263  54.49697002
  46.31234404  13.41993481  12.2466448 ].
INFO:revrand.slm:ELBO = -11185.367056695286, var = 0.34732879738892464, reg = 1.382438160080974, bparams = [ 13.23356     62.57366068  47.291040

Fold 3


INFO:revrand.slm:ELBO = -5930.265166388011, var = 0.7106563412257295, reg = 0.3743551373399284, bparams = [   7.49114754    3.4934305    16.46284939    0.79963835    6.84146182
   43.82136114    1.59937996    0.4576036     2.369129    114.30656714
   12.71903977   92.68177032   26.65004793].
INFO:revrand.slm:ELBO = -2451.4372126047665, var = 2.03646076109248, reg = 0.3568326576817861, bparams = [  11.54806102   20.39267164    2.90303021    1.87848696   23.70135271
    2.47353983    8.42183871   11.23157423  112.48961534  104.54461437
   13.26828743   27.6122855     7.44702428].
INFO:revrand.slm:ELBO = -2796.3766406648156, var = 1.788857669204362, reg = 0.8597232222333621, bparams = [  0.98807433  12.43888231   3.51306775  27.04302006   1.3095598
  10.18633107   9.90590051   5.68212327   0.95428366   5.55253865
   6.07006884  50.77930887   7.23066382].
INFO:revrand.slm:ELBO = -10580.126635419756, var = 0.37695535489107745, reg = 0.04922332568223432, bparams = [  7.97481026e+00   1.25672

Fold 4


INFO:revrand.slm:ELBO = -2517.5966619307296, var = 2.275063571642428, reg = 0.24305749158667078, bparams = [ 38.58986844  15.58425185  54.18159027   6.34193825   2.20914108
   6.46483851  10.36424349  23.21183556  36.12130618  18.02182115
  97.58195149   2.05728924  66.7246266 ].
INFO:revrand.slm:ELBO = -1930.5427919213116, var = 3.6155508432301313, reg = 0.30259862113176583, bparams = [ 40.9522391   59.60697751  35.78443548  51.17209197  62.75360675
  19.1939402    4.28638651   9.43245604   5.73079963  34.51498326
  29.18769495  49.72556527  20.70192593].
INFO:revrand.slm:ELBO = -3284.896210767693, var = 1.6056746129643247, reg = 0.2268660117879829, bparams = [ 30.83641285  12.03917971  62.79970151  14.43277283  40.98710984
  50.82218202   1.9721744   27.15229105  18.87867874  44.28289215
  13.74716906   3.72926324  21.49942712].
INFO:revrand.slm:ELBO = -5204.147587828982, var = 0.7463257825136054, reg = 0.5374545793297326, bparams = [  6.00353547e-02   2.15038602e+01   2.70034805e+01

Fold 5


## Report

In [65]:
# Print results: one summary block per estimator, in a fixed order
for alg_name, alg_scores in (
    ('SLM', slm_score),
    ('GLM', glm_score),
    ('GP', gp_score),
    ('RF', rf_score),
    ('SVR', svr_score),
):
    print_score(alg_name, alg_scores)

SLM:
	R2 = 0.8524 (0.0885),
	SMSE = 0.1476 (0.0885),
	MSLL = -1.0492 (0.2400)
GLM:
	R2 = 0.8340 (0.0491),
	SMSE = 0.1660 (0.0491),
	MSLL = -0.8763 (0.1242)
GP:
	R2 = 0.9064 (0.0134),
	SMSE = 0.0936 (0.0134),
	MSLL = -1.1821 (0.1633)
RF:
	R2 = 0.8467 (0.0709),
	SMSE = 0.1533 (0.0709),
	MSLL = -0.4241 (0.0529)
SVR:
	R2 = 0.6295 (0.0863),
	SMSE = 0.3705 (0.0863),
	MSLL = -0.3142 (0.0388)
