Boston Housing Dataset Test
===========================

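In this notebook we test revrand's ARD (automatic relevance determination) basis functions on the Boston housing dataset. revrand's standard and generalized linear models are compared against scikit-learn's Gaussian process, random forest and support vector regressors using 5-fold cross-validation, and each is scored by R2, SMSE and MSLL.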

In [10]:
from functools import partialmethod

import numpy as np
from scipy.stats import gamma

from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, DotProduct
from sklearn.svm import SVR

from revrand import StandardLinearModel, GeneralizedLinearModel
import revrand.basis_functions as bf
from revrand.likelihoods import Gaussian
from revrand.metrics import smse, msll
from revrand.btypes import Parameter, Positive, Bound
from revrand.optimize import Adam

# Emit INFO-level log records (e.g. revrand's ELBO reports) while the models
# fit. basicConfig attaches a stream handler to the root logger, so the
# records are rendered in the cell output of this notebook.
import logging
logging.basicConfig(level=logging.INFO)

## Settings

In [11]:
# Dataset under test. Swap in load_diabetes() to run the same comparison on
# the diabetes data instead.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm which scikit-learn version this notebook is meant to run on.
data = load_boston()

# Cross-validation: number of folds, and the seed shared by the fold
# generator and every estimator that accepts one.
folds = 5
random_state = 100

# Revrand
nbases = 100                      # number of random RBF bases
lenscale = gamma(1.0, scale=20)   # distribution handed to the length-scale Parameter
maxiter = 5000                    # passed to the GLM only
nsamples = 50                     # passed to the GLM only
updater = Adam(epsilon=1e-8)      # passed to the GLM only

# GP
length_scales = 1.0   # initial RBF length scale, repeated for every input dimension
n_restarts = 1        # forwarded as n_restarts_optimizer

# Random Forest
n_estimators = 40

# SVR: grid searched over by GridSearchCV; the `svr__` prefix targets the SVR
# step of the pipeline.
svr_params = {
    'svr__gamma': np.logspace(-1, 2, num=20),
    'svr__epsilon': np.logspace(-5, 0, num=10),
}

## Load data

In [12]:
# Features are used as-is. Targets are centred on the mean of the *whole*
# dataset, i.e. before any train/test split is made.
X = data.data
y = data.target - np.mean(data.target)
N, D = X.shape  # rows and columns of X; D is used as the input dimension below

## Construct bases and kernels

In [13]:
# Construct basis functions: a random RBF basis with one length-scale
# parameter per input dimension (shape=(D,)), added to a linear basis that
# carries a ones column.
base = (
    bf.RandomRBF(
        Xdim=D,
        nbases=nbases,
        lenscale_init=Parameter(lenscale, Positive(), shape=(D,)),
        random_state=random_state,
    )
    + bf.LinearBasis(onescol=True)
)

# Construct Kernel for the scikit-learn GP: scaled RBF with one length scale
# per input dimension, plus a dot-product term, plus a white-noise term.
kern = (
    1**2 * RBF(
        length_scale=np.ones(D) * length_scales,
        length_scale_bounds=(1e-2, 1e3),
    )
    + DotProduct()
    + WhiteKernel()
)

## Construct Estimators

In [14]:
# Revrand
class SLM(StandardLinearModel):
    """StandardLinearModel whose ``predict`` exposes ``predict_moments``.

    The cross-validation loop unpacks the result of ``predict`` into a
    predictive mean and variance, so ``predict`` is redirected here.
    """

    def predict(self, X):
        """Return the result of ``predict_moments(X)`` unchanged."""
        moments = super().predict_moments(X)
        return moments


# Inputs are standardised before they reach the model.
slm = make_pipeline(
    StandardScaler(),
    SLM(base, random_state=random_state),
)

class GLM(GeneralizedLinearModel):
    """GeneralizedLinearModel whose ``predict`` returns a mean and a variance."""

    def predict(self, X):
        """Return ``(mean, variance)`` for ``X``.

        The variance from ``predict_moments`` is incremented in place by the
        fitted ``like_hypers_`` before being returned.
        """
        mean, var = super().predict_moments(X)
        # presumably like_hypers_ is the Gaussian likelihood's noise variance
        # -- verify against the revrand documentation
        var += self.like_hypers_
        return mean, var


# Inputs are standardised before they reach the model.
glm = make_pipeline(
    StandardScaler(),
    GLM(
        Gaussian(),
        base,
        random_state=random_state,
        nsamples=nsamples,
        maxiter=maxiter,
        updater=updater,
    ),
)

# Gaussian Process
class GP(GaussianProcessRegressor):
    """GaussianProcessRegressor whose ``predict`` always requests the std."""

    def predict(self, X):
        """Return the parent's ``predict(X, return_std=True)`` result."""
        mean_and_std = super().predict(X, return_std=True)
        return mean_and_std


# Inputs are standardised before they reach the model.
gp = make_pipeline(
    StandardScaler(),
    GP(
        kernel=kern,
        n_restarts_optimizer=n_restarts,
        random_state=random_state,
    ),
)

# RandomForest: standardised inputs feeding a seeded forest.
rf = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
    ),
)

# SVR: the scaler + SVR pipeline is wrapped in a grid search over svr_params,
# using all available cores (n_jobs=-1).
svr = GridSearchCV(
    make_pipeline(StandardScaler(), SVR()),
    svr_params,
    n_jobs=-1,
)

## Validation

In [15]:
# Convenience functions
def score(y_true, y_pred, y_var, y_train, scores):
    """Evaluate one fold and append its metrics to ``scores``.

    Appends a single ``(SMSE, MSLL, R2)`` tuple, in that order. ``scores`` is
    modified in place and nothing is returned.

    Parameters
    ----------
    y_true : test targets of the fold.
    y_pred : predictions for the test inputs.
    y_var : predictive variance handed to ``msll``.
    y_train : training targets of the fold, handed to ``msll``.
    scores : list collecting one tuple per fold.
    """
    fold_metrics = (
        smse(y_true, y_pred),
        msll(y_true, y_pred, y_var, y_train),
        r2_score(y_true, y_pred),
    )
    scores.append(fold_metrics)
    
    
def print_score(alg_name, scores):
    """Print the mean and standard deviation of each metric across folds.

    Parameters
    ----------
    alg_name : label printed on the first line of the report.
    scores : sequence of per-fold rows whose first three entries are
        SMSE, MSLL and R2, in that order (the layout ``score`` appends).

    The report lists R2 first, then SMSE, then MSLL, each as
    ``mean (std)`` with four decimals.
    """
    fold_table = np.array(scores)

    # Column order in the table is SMSE, MSLL, R2; the report order is
    # R2, SMSE, MSLL. Local names avoid shadowing the imported metric
    # functions of the same name.
    smse_col = fold_table[:, 0]
    msll_col = fold_table[:, 1]
    r2_col = fold_table[:, 2]

    summary = []
    for column in (r2_col, smse_col, msll_col):
        summary.append(np.mean(column))
        summary.append(np.std(column))

    template = "{}:\n\tR2 = {:.4f} ({:.4f}),\n\tSMSE = {:.4f} ({:.4f}),\n\tMSLL = {:.4f} ({:.4f})"
    print(template.format(alg_name, *summary))

In [16]:
# Cross val
# One list per estimator; each receives one (SMSE, MSLL, R2) tuple per fold.
slm_score, glm_score, gp_score, rf_score, svr_score = [], [], [], [], []

foldgen = KFold(n_splits=folds, shuffle=True, random_state=random_state)
for i, (tr_ind, ts_ind) in enumerate(foldgen.split(X)):
    # Slice the fold once; every estimator sees the same train/test split.
    X_tr, y_tr = X[tr_ind], y[tr_ind]
    X_ts, y_ts = X[ts_ind], y[ts_ind]

    # Revrand: both models return a predictive mean and variance.
    slm.fit(X_tr, y_tr)
    Ey, Vy = slm.predict(X_ts)
    score(y_ts, Ey, Vy, y_tr, slm_score)

    glm.fit(X_tr, y_tr)
    Ey, Vy = glm.predict(X_ts)
    score(y_ts, Ey, Vy, y_tr, glm_score)

    # GP: predict returns a standard deviation, squared here into a variance.
    gp.fit(X_tr, y_tr)
    Ey, Sy = gp.predict(X_ts)
    score(y_ts, Ey, Sy**2, y_tr, gp_score)

    # RF: point predictions only, so the variance of the training targets
    # is passed as the predictive variance.
    rf.fit(X_tr, y_tr)
    Ey = rf.predict(X_ts)
    score(y_ts, Ey, y_tr.var(), y_tr, rf_score)

    # SVR: same stand-in variance as for the random forest.
    svr.fit(X_tr, y_tr)
    Ey = svr.predict(X_ts)
    score(y_ts, Ey, y_tr.var(), y_tr, svr_score)

    print("Fold {}".format(i + 1))

INFO:revrand.optimize.decorators:Evaluating random starts...
INFO:revrand.slm:ELBO = -1515.2293114095332, var = 3.845476409979971, reg = 1.6696293576221093, bparams = [  3.77341773  33.88083457   6.40613699  11.30226385  56.27815668
  34.0364791    8.19283487   3.85739072   9.33081811   0.11409497
   5.8184491   31.75964564   0.30745051].
INFO:revrand.slm:ELBO = -3082.676313823451, var = 0.9134033467068555, reg = 0.9258476012266578, bparams = [  2.22193172   9.62350625   0.74315882  44.22046832  79.18319062
   1.23627383  44.24500809  17.2030053   27.13313369  19.89499045
  17.43792775   0.41301803   4.71511953].
INFO:revrand.slm:ELBO = -4963.293393616644, var = 0.7867655243193307, reg = 1.4658362690057338, bparams = [  5.77218949   6.73452471  38.26432174  73.78278464  43.2309651
   8.91036809  18.26884315   8.76376262   8.31607381   3.92226832
   5.42815006   0.91799484  14.08138943].
INFO:revrand.slm:ELBO = -8235.350722562962, var = 0.47200956666942107, reg = 0.898464077527026, bpar

Fold 1


INFO:revrand.slm:ELBO = -5754.659673984091, var = 0.8605263335941454, reg = 0.029103638715044372, bparams = [ 57.78563005  16.792824    43.53237655  28.0241825   20.32208072
  84.47447012   4.23746552   2.72553798  22.69428256  19.90922355
  16.39912532  26.50120264  17.69721534].
INFO:revrand.slm:ELBO = -4483.164088723457, var = 0.4386495881784375, reg = 3.8061861727313726, bparams = [  9.10646964  50.06941595  15.7677411    8.66757031  40.91701272
   2.78134436  13.02235287   9.57140513  43.590697     1.26585758
   9.15268507  37.56589985  13.70730943].
INFO:revrand.slm:ELBO = -46467.85014676287, var = 0.06566172459903148, reg = 0.09747259933421927, bparams = [ 23.57536391  15.70710722  17.69103964  11.77903194  34.24634074
   7.82290488  15.11600616   3.85208772  25.0787921   16.13770626
   1.67453577   2.62876744   9.65840937].
INFO:revrand.slm:ELBO = -8134.959029013735, var = 0.17143123625625808, reg = 1.4978495661941742, bparams = [  0.53367028   6.45516809   9.10398265  28.24013

Fold 2


INFO:revrand.slm:ELBO = -10769.840062915502, var = 0.34732879738892464, reg = 1.382438160080974, bparams = [ 13.23356     62.57366068  47.29104081   1.99902665  22.1124954
  32.35718309  20.10139362  42.93322468  19.1200411    0.90713719
  27.39794224  44.68335723   7.86735381].
INFO:revrand.slm:ELBO = -2916.978071316487, var = 1.135330382797292, reg = 2.170497093532917, bparams = [  44.36182712   16.11685678    7.81234247    4.50792636   16.8001875
   12.00723087    0.30773993    0.36431812  131.37279882   12.11778795
   46.0805418    33.72768373    2.26378903].
INFO:revrand.slm:ELBO = -2311.3274623476327, var = 2.5514037147807587, reg = 0.5425230511317778, bparams = [ 16.50568803   7.07778872   2.23833591  16.97337477  35.52826687
  13.61864221  22.94379539  15.27273626  10.73340318   5.83202095
   6.70462434   3.60352498   4.11766457].
INFO:revrand.slm:ELBO = -17911.120420340147, var = 0.2570031550665879, reg = 0.035788476448922725, bparams = [  6.22616579   4.65750744  21.06543506 

Fold 3


INFO:revrand.slm:ELBO = -10740.509397983376, var = 0.37695535489107745, reg = 0.04922332568223432, bparams = [  7.97481026e+00   1.25672464e+01   1.09387362e+00   2.97077490e+01
   3.44018885e-04   8.04896107e+00   1.02179822e+00   1.26245132e+00
   2.73758234e+01   2.75190650e+00   3.12235271e+01   1.06767389e+01
   3.91941149e+01].
INFO:revrand.slm:ELBO = -11376.91546741741, var = 0.3604618900358043, reg = 0.05027012467392113, bparams = [  3.33826324   5.31374327  22.9198528    4.60967933  42.78111327
  12.56585973  46.01176333  44.86807686   1.20781736  13.38075601
  51.87961313   9.32827241   1.19631916].
INFO:revrand.slm:ELBO = -9147.018819268156, var = 0.44785860442728104, reg = 1.0408041815784952, bparams = [ 16.37702784   9.78778374  20.51490409  42.50407249  35.28629433
   9.81878011  12.74278784   8.00247568  20.22019973  18.15704049
  43.90631736   3.46456727   7.96320164].
INFO:revrand.slm:ELBO = -8789.245501511932, var = 0.35125972197039856, reg = 5.433406193592758, bparam

Fold 4


INFO:revrand.slm:ELBO = -3287.884484243933, var = 1.6056746129643247, reg = 0.2268660117879829, bparams = [ 30.83641285  12.03917971  62.79970151  14.43277283  40.98710984
  50.82218202   1.9721744   27.15229105  18.87867874  44.28289215
  13.74716906   3.72926324  21.49942712].
INFO:revrand.slm:ELBO = -5202.7893189204415, var = 0.7463257825136054, reg = 0.5374545793297326, bparams = [  6.00353547e-02   2.15038602e+01   2.70034805e+01   7.25196127e+00
   6.18063394e+00   9.38681817e+00   2.27011180e+01   3.40378743e+01
   3.63337835e+01   8.82084070e+00   5.69226815e+01   6.01188910e+01
   5.42982045e+01].
INFO:revrand.slm:ELBO = -2712.6093844450606, var = 2.055599884490895, reg = 0.23055263420840988, bparams = [  0.37780523  46.45976342  30.85961446  35.24155624   7.3142982
  18.83489626  25.99632203  65.20641197  25.04202897  13.64399696
  49.10935478  10.22995352   8.74216388].
INFO:revrand.slm:ELBO = -23332.047820481603, var = 0.1785243068447388, reg = 0.06354932262653648, bparams 

Fold 5


## Report

In [17]:
# Print results: one summary per estimator, in the order they were fitted.
for alg_name, alg_scores in (
    ('SLM', slm_score),
    ('GLM', glm_score),
    ('GP', gp_score),
    ('RF', rf_score),
    ('SVR', svr_score),
):
    print_score(alg_name, alg_scores)

SLM:
	R2 = 0.8342 (0.0713),
	SMSE = 0.1658 (0.0713),
	MSLL = -0.9138 (0.2762)
GLM:
	R2 = 0.8431 (0.0401),
	SMSE = 0.1569 (0.0401),
	MSLL = -0.9122 (0.1062)
GP:
	R2 = 0.9064 (0.0134),
	SMSE = 0.0936 (0.0134),
	MSLL = -1.1821 (0.1633)
RF:
	R2 = 0.8467 (0.0709),
	SMSE = 0.1533 (0.0709),
	MSLL = -0.4241 (0.0529)
SVR:
	R2 = 0.6295 (0.0863),
	SMSE = 0.3705 (0.0863),
	MSLL = -0.3142 (0.0388)
