Boston Housing Dataset Test
===========================

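In this notebook we test revrand's ARD (one length scale per input dimension) basis functions on the Boston housing dataset. The revrand standard and generalized linear models are compared against a Gaussian process, a random forest and a support vector regressor using k-fold cross-validation, reporting R2, SMSE and MSLL for each.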

In [60]:
from functools import partialmethod

import numpy as np
from scipy.stats import gamma

from sklearn.datasets import load_boston, load_diabetes
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, DotProduct
from sklearn.svm import SVR

from revrand import StandardLinearModel, GeneralizedLinearModel
import revrand.basis_functions as bf
from revrand.likelihoods import Gaussian
from revrand.metrics import smse, msll
from revrand.btypes import Parameter, Positive, Bound
from revrand.optimize import Adam

# Surface INFO-level log records (revrand reports its optimiser progress at this level)
import logging
# basicConfig installs a handler on the root logger (stderr by default). The lighter
# alternative of only raising the root logger's level,
#   logging.getLogger().setLevel(logging.INFO)
# is intentionally left disabled.
logging.basicConfig(level=logging.INFO)

## Settings

In [61]:
# Dataset under test. Swap in load_diabetes() to run the same comparison on the
# diabetes dataset instead.
# NOTE(review): load_boston was removed from recent scikit-learn releases -- confirm
# the pinned scikit-learn version still provides it.
data = load_boston()

# Cross-validation: number of folds, and the seed shared by every seeded component
folds = 5
random_state = 100

# Revrand SLM and GLM: number of random RBF bases for the SLM, plus the frozen
# gamma distributions handed to revrand's Parameter for the length scales and
# the regularizers
nbases = 400
lenscale = gamma(1.0, scale=20)
regularizer = gamma(1.0, scale=10)

# Revrand GLM only: its own (smaller) basis count and the optimiser settings
nbases_g = 100
maxiter = 40000
nsamples = 20
updater = Adam(epsilon=1e-08)

# GP: initial RBF length scale (replicated per input dimension when the kernel
# is built) and the number of optimiser restarts
length_scales = 1.0
n_restarts = 1

# Random Forest
n_estimators = 40

# SVR: grid searched by GridSearchCV. The 'svr__' prefix targets the SVR step
# of the pipeline the grid search wraps.
svr_params = {
    'svr__gamma': np.logspace(-1, 2, num=20),
    'svr__epsilon': np.logspace(-5, 0, num=10),
}

## Load data

In [62]:
# Features, and targets shifted to have zero mean over the whole dataset
X = data.data
y = data.target - np.mean(data.target)

# N samples, D input dimensions (D sizes the per-dimension length scales below)
N, D = np.shape(X)

## Construct bases and kernels

In [63]:
# Construct basis functions
def _rbf_plus_linear_basis(n_rbf):
    """Build a random RBF basis concatenated with a linear basis.

    The RBF basis gets a length-scale Parameter of shape ``(D,)`` (one entry per
    input dimension), and each basis gets its own regularizer Parameter. Fresh
    Parameter objects are created on every call, so the returned bases share
    no parameter instances.

    Parameters
    ----------
    n_rbf : int
        Value passed as ``nbases`` to ``bf.RandomRBF``.
    """
    rbf_part = bf.RandomRBF(
        Xdim=D,
        nbases=n_rbf,
        lenscale=Parameter(lenscale, Positive(), shape=(D,)),
        random_state=random_state,
        regularizer=Parameter(regularizer, Positive()),
    )
    linear_part = bf.LinearBasis(
        onescol=True,
        regularizer=Parameter(regularizer, Positive()),
    )
    return rbf_part + linear_part


# Same basis layout for both models; only the number of RBF bases differs
base_slm = _rbf_plus_linear_basis(nbases)
base_glm = _rbf_plus_linear_basis(nbases_g)


# Construct Kernel: scaled per-dimension RBF + scaled dot product + white noise
ard_rbf = RBF(
    length_scale=length_scales * np.ones(D),
    length_scale_bounds=(1e-2, 1e3),
)
kern = 1**2 * ard_rbf + 1**2 * DotProduct() + WhiteKernel()

## Construct Estimators

In [64]:
# Revrand standard linear model
class SLM(StandardLinearModel):
    """StandardLinearModel whose ``predict`` exposes ``predict_moments``.

    Routing ``predict`` to ``predict_moments`` lets the model sit at the end of
    a scikit-learn pipeline while still handing back what the cross-validation
    loop unpacks as ``(Ey, Vy)``.
    """

    def predict(self, X):
        """Return the parent's ``predict_moments(X)`` result unchanged."""
        moments = super().predict_moments(X)
        return moments


slm = make_pipeline(
    StandardScaler(),
    SLM(base_slm, random_state=random_state),
)

# Revrand generalized linear model
class GLM(GeneralizedLinearModel):
    """GeneralizedLinearModel whose ``predict`` returns a mean and a variance.

    The second value from ``predict_moments`` is increased by the fitted
    ``like_hypers_`` before being returned.
    """

    def predict(self, X):
        """Return ``(mean, variance + like_hypers_)`` for the rows of ``X``."""
        mean, variance = super().predict_moments(X)
        # In-place add, as in the original. Presumably like_hypers_ is the
        # Gaussian likelihood's noise variance -- TODO confirm against revrand.
        variance += self.like_hypers_
        return mean, variance


glm = make_pipeline(
    StandardScaler(),
    GLM(
        Gaussian(),
        base_glm,
        random_state=random_state,
        nsamples=nsamples,
        maxiter=maxiter,
        updater=updater,
    ),
)

# Gaussian Process
class GP(GaussianProcessRegressor):
    """GaussianProcessRegressor whose ``predict`` always requests the std.

    Note that the second returned value is a standard deviation, not a
    variance; the cross-validation loop squares it before scoring.
    """

    def predict(self, X):
        """Return the parent's ``predict(X, return_std=True)`` result."""
        mean_and_std = super().predict(X, return_std=True)
        return mean_and_std


gp = make_pipeline(
    StandardScaler(),
    GP(
        kernel=kern,
        n_restarts_optimizer=n_restarts,
        random_state=random_state,
    ),
)

# RandomForest (point predictions only)
rf = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(n_estimators=n_estimators, random_state=random_state),
)

# SVR: the grid search wraps the whole pipeline, so scaling is refit inside each
# inner split; n_jobs=-1 uses all available cores
svr = GridSearchCV(
    make_pipeline(StandardScaler(), SVR()),
    svr_params,
    n_jobs=-1,
)

## Validation

In [65]:
# Convenience functions
def score(y_true, y_pred, y_var, y_train, scores):
    """Evaluate one set of predictions and record the result in ``scores``.

    Parameters
    ----------
    y_true : array-like
        Held-out targets.
    y_pred : array-like
        Predicted values for ``y_true``.
    y_var : array-like or scalar
        Predictive variance, forwarded to ``msll``.
    y_train : array-like
        Training targets, forwarded to ``msll``.
    scores : list
        Mutated in place: a ``(smse, msll, r2)`` tuple is appended.
    """
    fold_metrics = (
        smse(y_true, y_pred),
        msll(y_true, y_pred, y_var, y_train),
        r2_score(y_true, y_pred),
    )
    scores.append(fold_metrics)
    
    
def print_score(alg_name, scores):
    """Print the mean and standard deviation of each metric across entries.

    Parameters
    ----------
    alg_name : str
        Label printed on the first line of the report.
    scores : sequence of (smse, msll, r2) tuples
        One entry per fold, laid out as appended by ``score``.
    """
    per_fold = np.array(scores)

    # Stored column order is (SMSE, MSLL, R2); the report lists R2 first.
    summary = []
    for column in (2, 0, 1):
        metric = per_fold[:, column]
        summary.extend((np.mean(metric), np.std(metric)))

    print("{}:\n\tR2 = {:.4f} ({:.4f}),\n\tSMSE = {:.4f} ({:.4f}),\n\tMSLL = {:.4f} ({:.4f})"
          .format(alg_name, *summary))

In [66]:
# Cross val: one list of (smse, msll, r2) tuples per estimator, one tuple per fold
slm_score, glm_score, gp_score, rf_score, svr_score = [], [], [], [], []

foldgen = KFold(n_splits=folds, shuffle=True, random_state=random_state)
for fold, (tr_ind, ts_ind) in enumerate(foldgen.split(X), start=1):
    X_tr, y_tr = X[tr_ind], y[tr_ind]
    X_ts, y_ts = X[ts_ind], y[ts_ind]

    # Revrand: both models return (mean, variance)
    for estimator, fold_scores in ((slm, slm_score), (glm, glm_score)):
        estimator.fit(X_tr, y_tr)
        Ey, Vy = estimator.predict(X_ts)
        score(y_ts, Ey, Vy, y_tr, fold_scores)

    # GP: returns (mean, std), so square the std to get a variance
    gp.fit(X_tr, y_tr)
    Ey, Sy = gp.predict(X_ts)
    score(y_ts, Ey, Sy**2, y_tr, gp_score)

    # RF and SVR: point predictions only, so the variance of the training
    # targets stands in as the predictive variance
    for estimator, fold_scores in ((rf, rf_score), (svr, svr_score)):
        estimator.fit(X_tr, y_tr)
        Ey = estimator.predict(X_ts)
        score(y_ts, Ey, y_tr.var(), y_tr, fold_scores)

    print("Fold {}".format(fold))

INFO:revrand.optimize.decorators:Evaluating random starts...
INFO:revrand.slm:ELBO = -2061.889855490841, var = 1.6696293576221093, reg = [1.8867088647325465, 16.940417284305244], bparams = [  6.40613699  11.30226385  56.27815668  34.0364791    8.19283487
   3.85739072   9.33081811   0.11409497   5.8184491   31.75964564
   0.30745051  18.26806693  18.51695202].
INFO:revrand.slm:ELBO = -19327.81775835102, var = 0.11109658595678165, reg = [4.8117531267447005, 0.37157940987670507], bparams = [ 44.22046832  79.18319062   1.23627383  44.24500809  17.2030053
  27.13313369  19.89499045  17.43792775   0.41301803   4.71511953
  15.73531049  29.31672538   5.77218949].
INFO:revrand.slm:ELBO = -4056.999438805942, var = 0.336726235445794, reg = [19.132160871615874, 36.891392321168212], bparams = [ 43.2309651    8.91036809  18.26884315   8.76376262   8.31607381
   3.92226832   5.42815006   0.91799484  14.08138943   9.44019133
  17.96928155  19.88190385   3.07702183].
INFO:revrand.slm:ELBO = -1574.781

Fold 1


INFO:revrand.slm:ELBO = -4120.34430476915, var = 0.8874465172043137, reg = [9.335474563450795, 13.088512367662485], bparams = [  7.30926335  28.63236302  54.88361912  10.12382476  37.47440647
  20.97149939  16.55711166  31.7124309    7.18005575   6.45715967
  16.19997667   1.32427608  13.72392777].
INFO:revrand.slm:ELBO = -5306.762580564469, var = 0.4734777656017338, reg = [14.167644189530201, 3.5542594265732532], bparams = [  0.94272594   8.92664074  12.79081992  16.64303858  46.54783013
   9.92768655  31.76606934  37.25935997  27.29654458   8.08529726
   6.10232037  26.68663729  25.96955307].
INFO:revrand.slm:ELBO = -5143.068749895418, var = 0.4969032104897519, reg = [7.916031525349152, 6.3092030592850792], bparams = [  9.5692369   14.03594382  16.82402953  22.86503606  11.53924198
   4.52293368  15.2510674    7.27882808  12.39813775  22.82753235
  17.1859708   21.0860587   49.65344093].
INFO:revrand.slm:ELBO = -4512.847559919318, var = 0.23053470207756216, reg = [10.620515466868813,

Fold 2


INFO:revrand.slm:ELBO = -2191.6583024946126, var = 2.727882771346696, reg = [11.543245414434711, 17.203419063382341], bparams = [ 12.12203235  24.50310723  22.47461895  23.96655288  34.32039954
  55.16014778   5.47323438  23.96860883   9.78552863   2.84986909
   2.31549047  15.69857929  25.82052197].
INFO:revrand.slm:ELBO = -8251.827201889073, var = 0.2220941318822922, reg = [5.9357054631978521, 13.977073846488073], bparams = [ 49.62357977  25.58499005  16.83718876   3.10733827   1.14051725
   8.14332396   5.05131395  26.74684244  25.94448119  12.9715019
  52.19599182  26.92025419   1.27620872].
INFO:revrand.slm:ELBO = -4364.878173415001, var = 0.5937603872257542, reg = [2.0443357456862095, 15.014126028129716], bparams = [  2.65045761  15.52951628  19.89381871  20.70244451  27.08147159
   0.82211092  25.38414767   8.69939375   1.29506595   5.69944036
  18.92133686  12.75080535   6.14030303].
INFO:revrand.slm:ELBO = -3129.1340403274653, var = 1.2104080067055005, reg = [0.776461488736621

Fold 3


INFO:revrand.slm:ELBO = -13434.119818394674, var = 0.2725833877130137, reg = [2.6399133261283012, 1.8948138993198071], bparams = [  8.68916526  10.90902181  38.68281726  27.04330733  29.75199538
  25.34593977   1.35689275  33.39136638   0.57703609  68.60449635
   9.36316021  35.09841878  37.09750902].
INFO:revrand.slm:ELBO = -3858.544481424893, var = 1.0368120533649543, reg = [6.6443965326813768, 5.4449098912780238], bparams = [ 21.23816823  53.28929687   9.6003747   14.8652779   10.06433465
  20.76320757   5.12487428  26.90948038  13.4303451    1.38640734
  28.27231374  35.49103716  14.04994954].
INFO:revrand.slm:ELBO = -78458.86695327511, var = 0.03211343262920676, reg = [0.70950925378239038, 11.973630676370226], bparams = [  4.40907685   2.52402882   2.81271817  16.27468012  10.38200448
   4.07736802   2.37918345   5.91939991  14.16637349  19.87531641
  46.52561614   2.94848093  26.19146792].
INFO:revrand.slm:ELBO = -5793.070404713948, var = 0.6005721132949257, reg = [14.99943538010

Fold 4


INFO:revrand.slm:ELBO = -2051.891794408337, var = 2.8005857804069634, reg = [2.7975877458625558, 6.0241560020991756], bparams = [  9.745329    70.85842725   0.92931227  11.33547834   4.77258214
  30.40186871   6.63537503   3.23236896  30.41316216   6.8346375
  19.98004912  17.87476712  18.95699391].
INFO:revrand.slm:ELBO = -10640.737276168013, var = 0.2725591027777337, reg = [2.7191171163071104, 0.15510251494085753], bparams = [ 10.09409133  14.39434026  19.57317597   7.47856456  22.74590972
  36.26795683  29.05147989   7.01180506  38.15878815  20.76814552
  30.4885489   10.58702156   1.83153054].
INFO:revrand.slm:ELBO = -18280.50845918745, var = 0.14460336685101266, reg = [10.986010247561016, 5.0440454228675513], bparams = [  48.80489169   12.44821091  168.69254771    8.53983582   10.79871644
   23.32846578   14.21435114   20.5172087    19.64835466   27.85667407
   35.95227078    9.4943008     5.44905138].
INFO:revrand.slm:ELBO = -10285.240582998385, var = 0.25238216926765294, reg = [

Fold 5


## Report

In [67]:
# Print results: one summary per estimator, in the order they were fitted
for label, fold_scores in (
    ('SLM', slm_score),
    ('GLM', glm_score),
    ('GP', gp_score),
    ('RF', rf_score),
    ('SVR', svr_score),
):
    print_score(label, fold_scores)

SLM:
	R2 = 0.9018 (0.0134),
	SMSE = 0.0982 (0.0134),
	MSLL = -1.1504 (0.1191)
GLM:
	R2 = 0.8411 (0.0491),
	SMSE = 0.1589 (0.0491),
	MSLL = -0.9209 (0.1530)
GP:
	R2 = 0.9027 (0.0137),
	SMSE = 0.0973 (0.0137),
	MSLL = -1.1792 (0.1581)
RF:
	R2 = 0.8467 (0.0709),
	SMSE = 0.1533 (0.0709),
	MSLL = -0.4241 (0.0529)
SVR:
	R2 = 0.6295 (0.0863),
	SMSE = 0.3705 (0.0863),
	MSLL = -0.3142 (0.0388)
