In [1]:
#! pip install statsmodels==0.13.2

In [2]:
import numpy as np
import pandas as pd
from scipy.linalg import lstsq
from scipy.sparse.linalg import lsmr
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d, griddata, LinearNDInterpolator, NearestNDInterpolator
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split as tts
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
import matplotlib.pyplot as plt
from matplotlib import pyplot

In [3]:
from statsmodels.nonparametric.kernel_regression import KernelReg

# reference for the Nadaraya-Watson regressor: https://bookdown.org/egarpor/PM-UC3M/npreg-kre.html

#model_KernReg = KernelReg(endog=dat_train[:,-1],exog=dat_train[:,:-1],var_type='ccc',ckertype='gaussian')

In [4]:
import lightgbm as lgb

ModuleNotFoundError: No module named 'lightgbm'

In [5]:
# import libraries for creating a neural network
# imports for creating a Neural Network
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.metrics import r2_score
from tensorflow.keras.optimizers import Adam, SGD, RMSprop # they recently updated Tensorflow
from keras.callbacks import EarlyStopping

In [6]:
# Tricubic Kernel
def Tricubic(x):
  if len(x.shape) == 1:
    x = x.reshape(-1,1)
  d = np.sqrt(np.sum(x**2,axis=1))
  return np.where(d>1,0,70/81*(1-d**3)**3)

# Quartic Kernel
def Quartic(x):
  if len(x.shape) == 1:
    x = x.reshape(-1,1)
  d = np.sqrt(np.sum(x**2,axis=1))
  return np.where(d>1,0,15/16*(1-d**2)**2)

# Epanechnikov Kernel
def Epanechnikov(x):
  if len(x.shape) == 1:
    x = x.reshape(-1,1)
  d = np.sqrt(np.sum(x**2,axis=1))
  return np.where(d>1,0,3/4*(1-d**2)) 

In [7]:
#Defining the kernel local regression model

def lw_reg(X, y, xnew, kern, tau, intercept):
    # tau is called bandwidth K((x-x[i])/(2*tau))
    n = len(X) # the number of observations
    yest = np.zeros(n)

    if len(y.shape)==1: # here we make column vectors
      y = y.reshape(-1,1)

    if len(X.shape)==1:
      X = X.reshape(-1,1)
    
    if intercept:
      X1 = np.column_stack([np.ones((len(X),1)),X])
    else:
      X1 = X

    w = np.array([kern((X - X[i])/(2*tau)) for i in range(n)]) # here we compute n vectors of weights

    #Looping through all X-points
    for i in range(n):          
        W = np.diag(w[:,i])
        b = np.transpose(X1).dot(W).dot(y)
        A = np.transpose(X1).dot(W).dot(X1)
        #A = A + 0.001*np.eye(X1.shape[1]) # if we want L2 regularization
        #theta = linalg.solve(A, b) # A*theta = b
        beta, res, rnk, s = lstsq(A, b)
        yest[i] = np.dot(X1[i],beta)
    if X.shape[1]==1:
      f = interp1d(X.flatten(),yest,fill_value='extrapolate')
    else:
      f = LinearNDInterpolator(X, yest)
    output = f(xnew) # the output may have NaN's where the data points from xnew are outside the convex hull of X
    if sum(np.isnan(output))>0:
      g = NearestNDInterpolator(X,y.ravel()) 
      # output[np.isnan(output)] = g(X[np.isnan(output)])
      output[np.isnan(output)] = g(xnew[np.isnan(output)])
    return output

In [8]:
def boosted_lwr(X, y, xnew, kern, tau, intercept):
  # we need decision trees
  # for training the boosted method we use X and y
  Fx = lw_reg(X,y,X,kern,tau,intercept) # we need this for training the Decision Tree
  # Now train the Decision Tree on y_i - F(x_i)
  new_y = y - Fx
  #model = DecisionTreeRegressor(max_depth=2, random_state=123)
  model = RandomForestRegressor(n_estimators=100,max_depth=2)
  #model = model_xgb
  model.fit(X,new_y)
  output = model.predict(xnew) + lw_reg(X,y,xnew,kern,tau,intercept)
  return output 

In [9]:
import xgboost as xgb

In [11]:
# we design a Neural Network for regression
# We have to decide how many layers we want, how many neurons per layer and the type of activation functions
# Create a Neural Network model
model_nn = Sequential()
model_nn.add(Dense(128, activation="relu", input_dim=3))
model_nn.add(Dense(128, activation="relu"))

# Since the regression is performed, a Dense layer containing a single neuron with a linear activation function.
# Typically ReLu-based activation are used but since it is performed regression, it is needed a linear activation.
model_nn.add(Dense(1, activation="linear")) # we need this b/c we predict a continuous random variable

# Compile model: The model is initialized with the Adam optimizer and then it is compiled.
model_nn.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=1e-2)) # lr=1e-3, decay=1e-3 / 200)

# Patient early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=800)

In [14]:
# import the data
cars = pd.read_csv('Data/cars.csv')

In [15]:
X = cars[['ENG','CYL','WGT']].values
y = cars['MPG'].values

In [16]:
scale = StandardScaler()

In [17]:
dat = np.concatenate([X,y.reshape(-1,1)],axis=1)

In [18]:
dat[:,-1] # this is the y values

array([18.      , 15.      , 18.      , 16.      , 17.      , 15.      ,
       14.      , 14.      , 14.      , 15.      , 15.      , 14.      ,
       15.      , 14.      , 24.      , 22.      , 18.      , 21.      ,
       27.      , 26.      , 25.      , 24.      , 25.      , 26.      ,
       21.      , 10.      , 10.      , 11.      ,  9.      , 27.      ,
       28.      , 25.      , 19.      , 16.      , 17.      , 19.      ,
       18.      , 14.      , 14.      , 14.      , 14.      , 12.      ,
       13.      , 13.      , 18.      , 22.      , 19.      , 18.      ,
       23.      , 28.      , 30.      , 30.      , 31.      , 35.      ,
       27.      , 26.      , 24.      , 25.      , 23.      , 20.      ,
       21.      , 13.      , 14.      , 15.      , 14.      , 17.      ,
       11.      , 13.      , 12.      , 13.      , 19.      , 15.      ,
       13.      , 13.      , 14.      , 18.      , 22.      , 21.      ,
       26.      , 22.      , 28.      , 23.      , 

In [19]:
dat[:,:-1] # this is X values

array([[ 307.,    8., 3504.],
       [ 350.,    8., 3693.],
       [ 318.,    8., 3436.],
       ...,
       [ 135.,    4., 2295.],
       [ 120.,    4., 2625.],
       [ 119.,    4., 2720.]])

In [21]:
model_KernReg = KernelReg(endog=dat[:,-1],exog=dat[:,:-1],var_type='ccc',ckertype='gaussian')

In [22]:
yhat_sm, y_std = model_KernReg.fit(dat[:,:-1])

In [23]:
mse(y,yhat_sm)

15.333316197606733

In [24]:
# we want more nested cross-validations

mse_lwr = []
mse_blwr = []
mse_rf = []
mse_xgb = []
mse_nn = []
mse_NW = []
for i in [1234]:
  kf = KFold(n_splits=10,shuffle=True,random_state=i)
  # this is the Cross-Validation Loop
  for idxtrain, idxtest in kf.split(X):
    xtrain = X[idxtrain]
    ytrain = y[idxtrain]
    ytest = y[idxtest]
    xtest = X[idxtest]
    xtrain = scale.fit_transform(xtrain)
    xtest = scale.transform(xtest)
    dat_train = np.concatenate([xtrain,ytrain.reshape(-1,1)],axis=1)
    dat_test = np.concatenate([xtest,ytest.reshape(-1,1)],axis=1)
    yhat_lwr = lw_reg(xtrain,ytrain, xtest,Epanechnikov,tau=0.9,intercept=True)
    yhat_blwr = boosted_lwr(xtrain,ytrain, xtest,Epanechnikov,tau=0.9,intercept=True)
    model_rf = RandomForestRegressor(n_estimators=100,max_depth=3)
    model_rf.fit(xtrain,ytrain)
    yhat_rf = model_rf.predict(xtest)
    model_xgb = xgb.XGBRegressor(objective ='reg:squarederror',n_estimators=100,reg_lambda=20,alpha=1,gamma=10,max_depth=3)
    model_xgb.fit(xtrain,ytrain)
    yhat_xgb = model_xgb.predict(xtest)
    model_nn.fit(xtrain,ytrain,validation_split=0.2, epochs=500, batch_size=10, verbose=0, callbacks=[es])
    yhat_nn = model_nn.predict(xtest)
    # here is the application of the N-W regressor
    model_KernReg = KernelReg(endog=dat_train[:,-1],exog=dat_train[:,:-1],var_type='ccc',ckertype='gaussian')
    yhat_sm, yhat_std = model_KernReg.fit(dat_test[:,:-1])
    mse_lwr.append(mse(ytest,yhat_lwr))
    mse_blwr.append(mse(ytest,yhat_blwr))
    mse_rf.append(mse(ytest,yhat_rf))
    mse_xgb.append(mse(ytest,yhat_xgb))
    mse_nn.append(mse(ytest,yhat_nn))
    mse_NW.append(mse(ytest,yhat_sm))
print('The Cross-validated Mean Squared Error for LWR is : '+str(np.mean(mse_lwr)))
print('The Cross-validated Mean Squared Error for BLWR is : '+str(np.mean(mse_blwr)))
print('The Cross-validated Mean Squared Error for RF is : '+str(np.mean(mse_rf)))
print('The Cross-validated Mean Squared Error for XGB is : '+str(np.mean(mse_xgb)))
print('The Cross-validated Mean Squared Error for NN is : '+str(np.mean(mse_nn)))
print('The Cross-validated Mean Squared Error for Nadarya-Watson Regressor is : '+str(np.mean(mse_NW)))

KeyboardInterrupt: 

In [None]:
# apply this setup to the Boston Housing data set and also the Concrete Strength

### Repeated Boosting
Update the algorithms we have for repeated boosting.
Compare with more boosting algorithms such as LightGBM (and, time allows, Catboost)