In [15]:
import numpy as np
import pandas as pd
from scipy.linalg import lstsq
from scipy.sparse.linalg import lsmr
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d, griddata, LinearNDInterpolator, NearestNDInterpolator
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split as tts
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
import matplotlib.pyplot as plt
from matplotlib import pyplot

# import libraries for creating a neural network
# imports for creating a Neural Network
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.metrics import r2_score
from tensorflow.keras.optimizers import Adam, SGD, RMSprop # they recently updated Tensorflow
from keras.callbacks import EarlyStopping

#XGB Boosting
import xgboost as xgb 

In [16]:
# Tricubic Kernel
def Tricubic(x):
  if len(x.shape) == 1:
    x = x.reshape(-1,1)
  d = np.sqrt(np.sum(x**2,axis=1))
  return np.where(d>1,0,70/81*(1-d**3)**3)

# Quartic Kernel
def Quartic(x):
  if len(x.shape) == 1:
    x = x.reshape(-1,1)
  d = np.sqrt(np.sum(x**2,axis=1))
  return np.where(d>1,0,15/16*(1-d**2)**2)

# Epanechnikov Kernel
def Epanechnikov(x):
  if len(x.shape) == 1:
    x = x.reshape(-1,1)
  d = np.sqrt(np.sum(x**2,axis=1))
  return np.where(d>1,0,3/4*(1-d**2)) 

In [17]:
#Defining the kernel local regression model

def lw_reg(X, y, xnew, kern, tau, intercept):
    # tau is called bandwidth K((x-x[i])/(2*tau))
    n = len(X) # the number of observations
    yest = np.zeros(n)

    if len(y.shape)==1: # here we make column vectors
      y = y.reshape(-1,1)

    if len(X.shape)==1:
      X = X.reshape(-1,1)
    
    if intercept:
      X1 = np.column_stack([np.ones((len(X),1)),X])
    else:
      X1 = X

    w = np.array([kern((X - X[i])/(2*tau)) for i in range(n)]) # here we compute n vectors of weights

    #Looping through all X-points
    for i in range(n):          
        W = np.diag(w[:,i])
        b = np.transpose(X1).dot(W).dot(y)
        A = np.transpose(X1).dot(W).dot(X1)
        #A = A + 0.001*np.eye(X1.shape[1]) # if we want L2 regularization
        #theta = linalg.solve(A, b) # A*theta = b
        beta, res, rnk, s = lstsq(A, b)
        yest[i] = np.dot(X1[i],beta)
    if X.shape[1]==1:
      f = interp1d(X.flatten(),yest,fill_value='extrapolate')
    else:
      f = LinearNDInterpolator(X, yest)
    output = f(xnew) # the output may have NaN's where the data points from xnew are outside the convex hull of X
    if sum(np.isnan(output))>0:
      g = NearestNDInterpolator(X,y.ravel()) 
      # output[np.isnan(output)] = g(X[np.isnan(output)])
      output[np.isnan(output)] = g(xnew[np.isnan(output)])
    return output

#Boosted LWR with decision tree or random forest boosting
def boosted_lwr(X, y, xnew, kern, tau, intercept): 
  # we need decision trees
  # for training the boosted method we use X and y
  Fx = lw_reg(X,y,X,kern,tau,intercept) # we need this for training the Decision Tree
  # Now train the Decision Tree on y_i - F(x_i)
  new_y = y - Fx
  #model = DecisionTreeRegressor(max_depth=2, random_state=123)
  model = RandomForestRegressor(n_estimators=100,max_depth=2)
  #model = model_xgb
  model.fit(X,new_y)
  output = model.predict(xnew) + lw_reg(X,y,xnew,kern,tau,intercept)
  return output    

### Now we design a Nerual Network for regression

How many layers? How many neurons in each layer? All dense layers (MLP)? What other types? Which activation functions do we want?

In [27]:
cars = pd.read_csv('Data/cars.csv')

X = cars[['ENG','CYL','WGT']].values
y = cars['MPG'].values

kf = KFold(n_splits=10,shuffle=True,random_state=1234)
scale = StandardScaler()

data = np.concatenate([X,y.reshape(-1,1)], axis=1)

In [19]:
model_nn = Sequential() #Making a tensorflow sequential network

#Arbitrary number of layers, try 5? Lets do 128-neuron dense layers with relu activation layers
model_nn.add(Dense(128, activation="relu", input_dim=3))
model_nn.add(Dropout(0.1))
model_nn.add(Dense(64, activation="relu"))
model_nn.add(Dropout(0.1))


#Doing a linear regression, so linear activation in the last layer (and only one output dim 1)
#It's solving a regression problem, so it's predicting a continuous random variable, meaning the activation of the last
#neuron is the continuous random variable to be predicted.
model_nn.add(Dense(1, activation="linear"))

#Optimizer is doing gradient descent on the weights and biases and updating them via backpropogation
model_nn.compile(loss='mean_squared_error', optimizer=Adam(learning_rate= 0.001, decay = 0.0001))
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=600) #Stops the training loop if it's taking too long(gets stuck)

In [20]:
#The XGB model definition
model_xgb = xgb.XGBRegressor(objective ='reg:squarederror',n_estimators=100,reg_lambda=20,alpha=1,gamma=10,max_depth=3)

In [21]:
# we want more nested cross-validations
#This block will take an hour of more (long time)
#Neural networks are usually no match for boosted algorithms when it comes to regression on a continuos variable

mse_lwr = []
mse_blwr = []
mse_rf = []
mse_xgb = []
mse_nn = []

for i in range(1):
  print('KFold Validation Number: ' + str(i+1) + '\n')
  kf = KFold(n_splits=10,shuffle=True,random_state=i)
  # this is the Cross-Validation Loop
  j = 0
  for idxtrain, idxtest in kf.split(X):
    j += 1
    #Split the train and test data
    xtrain = X[idxtrain]
    ytrain = y[idxtrain]
    ytest = y[idxtest]
    xtest = X[idxtest]
    xtrain = scale.fit_transform(xtrain)
    xtest = scale.transform(xtest)

    #Train and predict for LWR and boosted LWR
    yhat_lwr = lw_reg(xtrain,ytrain, xtest,Tricubic,tau=1.2,intercept=True)
    yhat_blwr = boosted_lwr(xtrain,ytrain, xtest,Tricubic,tau=1.2,intercept=True)

    #Train and predict with random forest
    model_rf = RandomForestRegressor(n_estimators=100,max_depth=3)
    model_rf.fit(xtrain,ytrain)
    yhat_rf = model_rf.predict(xtest)

    #Train and predict for XGB
    model_xgb.fit(xtrain,ytrain)
    yhat_xgb = model_xgb.predict(xtest)

    #Train and predict for neural network
    #Batch size devides the train data into batches, trains on each batch then updates the gradients until it gets through the whole data
    #Then moves onto the next epoch
    print('Split Number: ' + str(j))
    model_nn.fit(xtrain,ytrain,validation_split=0.3, epochs=100, batch_size=20, verbose=0, callbacks=[es])
    yhat_nn = model_nn.predict(xtest)

    #Append each model's MSE
    mse_lwr.append(mse(ytest,yhat_lwr))
    mse_blwr.append(mse(ytest,yhat_blwr))
    mse_rf.append(mse(ytest,yhat_rf))
    mse_xgb.append(mse(ytest,yhat_xgb))
    mse_nn.append(mse(ytest,yhat_nn))

print('\n The Cross-validated Mean Squared Error for LWR is : '+str(np.mean(mse_lwr)))
print('The Cross-validated Mean Squared Error for BLWR is : '+str(np.mean(mse_blwr)))
print('The Cross-validated Mean Squared Error for RF is : '+str(np.mean(mse_rf)))
print('The Cross-validated Mean Squared Error for XGB is : '+str(np.mean(mse_xgb)))
print('The Cross-validated Mean Squared Error for NN is : '+str(np.mean(mse_nn)))

KFold Validation Number: 1

Split Number: 1
Split Number: 2
Split Number: 3
Split Number: 4
Split Number: 5
Split Number: 6
Split Number: 7
Split Number: 8
Split Number: 9
Split Number: 10

 The Cross-validated Mean Squared Error for LWR is : 16.901595996974294
The Cross-validated Mean Squared Error for BLWR is : 16.64113938021411
The Cross-validated Mean Squared Error for RF is : 16.77570050694267
The Cross-validated Mean Squared Error for XGB is : 16.25007368020466
The Cross-validated Mean Squared Error for NN is : 21.709172153294126


In [22]:
#Next apply this to boston housing dataset and concrete strength

### Feb 23rd -- Nadaraya-Watson Estimator

Generally, we provide an estimation of Y based on X being a specific x, written $m(x) = E[Y | X=x]$

For Nadaraya-Watson estimators, we use $m(x) = E[Y|X=x] = \int y f_{Y|X=x}(y)dy = \frac{\int y f(x,y)dy}{f_{X}(x)}$

So basically, the regression function m(x) can be estimated by joint probablity density function of X and Y, called $f$, and "the marginal" $f_x$

This becomes:

$\hat f(x,y;h) = \frac{1}{n} \sum_{i=1}^n K_{h_1}(x-X_i) K_{h_2}(y-Y_i)$ where $K_{h_1}, K_{h_2}$ are the bandwiths in X and Y (like tau for loess)

In [33]:
from statsmodels.nonparametric.kernel_regression import KernelReg
model_KernReg = KernelReg(endog = data[:,-1], exog = data[:,:-1], var_type='ccc', ckertype='gaussian')

In [36]:
yhat, ystd = model_KernReg.fit(data[:,:-1]) #predictions
mse_kern = mse(y,yhat)
print(mse_kern)

15.333316197606733


In [55]:
# we want more nested cross-validations
#This block will take an hour of more (long time)
#Neural networks are usually no match for boosted algorithms when it comes to regression on a continuos variable

mse_lwr = []
mse_blwr = []
mse_rf = []
mse_xgb = []
mse_nn = []
mse_kern = []

for i in range(1):
  print('KFold Validation Number: ' + str(i+1) + '\n')
  kf = KFold(n_splits=2,shuffle=True,random_state=i)
  # this is the Cross-Validation Loop
  j = 0
  for idxtrain, idxtest in kf.split(X):
    j += 1
    #Split the train and test data
    xtrain = X[idxtrain]
    ytrain = y[idxtrain]
    ytest = y[idxtest]
    xtest = X[idxtest]
    xtrain = scale.fit_transform(xtrain)
    xtest = scale.transform(xtest)

    #Train and predict for LWR and boosted LWR
    yhat_lwr = lw_reg(xtrain,ytrain, xtest,Tricubic,tau=1.2,intercept=True)
    yhat_blwr = boosted_lwr(xtrain,ytrain, xtest,Tricubic,tau=1.2,intercept=True)

    #Train and predict with random forest
    model_rf = RandomForestRegressor(n_estimators=100,max_depth=3)
    model_rf.fit(xtrain,ytrain)
    yhat_rf = model_rf.predict(xtest)

    #Train and predict for XGB
    model_xgb.fit(xtrain,ytrain)
    yhat_xgb = model_xgb.predict(xtest)

    #Train and predict for neural network
    #Batch size devides the train data into batches, trains on each batch then updates the gradients until it gets through the whole data
    #Then moves onto the next epoch
    print('Split Number: ' + str(j))
    model_nn.fit(xtrain,ytrain,validation_split=0.3, epochs=100, batch_size=20, verbose=0, callbacks=[es])
    yhat_nn = model_nn.predict(xtest)

    #KernReg training and predictions
    model_KernReg = KernelReg(endog = ytrain.reshape(-1,1), exog = xtrain, var_type='ccc', ckertype='gaussian')
    yhat_kern, y_kern_std = model_KernReg.fit(xtest) 

    #Append each model's MSE
    mse_lwr.append(mse(ytest,yhat_lwr))
    mse_blwr.append(mse(ytest,yhat_blwr))
    mse_rf.append(mse(ytest,yhat_rf))
    mse_xgb.append(mse(ytest,yhat_xgb))
    mse_nn.append(mse(ytest,yhat_nn))
    mse_kern.append(mse(ytest,yhat_kern))

dict = {np.mean(mse_lwr) : 'LWR', np.mean(mse_blwr): "BLWR", np.mean(mse_rf): "RFR", np.mean(mse_xgb): "XGB",
                              np.mean(mse_nn): "NN", np.mean(mse_kern): "KernReg"}

print('\n')
print('The Cross-validated Mean Squared Error for LWR is : '+str(np.mean(mse_lwr)))
print('The Cross-validated Mean Squared Error for BLWR is : '+str(np.mean(mse_blwr)))
print('The Cross-validated Mean Squared Error for RF is : '+str(np.mean(mse_rf)))
print('The Cross-validated Mean Squared Error for XGB is : '+str(np.mean(mse_xgb)))
print('The Cross-validated Mean Squared Error for NN is : '+str(np.mean(mse_nn)))
print('The Cross-validated Mean Squared Error for KernReg is : '+str(np.mean(mse_kern)))

#print('The winner is :' + dict[np.min(dict.keys())])

KFold Validation Number: 1

Split Number: 1
Split Number: 2


The Cross-validated Mean Squared Error for LWR is : 18.424762629746425
The Cross-validated Mean Squared Error for BLWR is : 19.12014119219971
The Cross-validated Mean Squared Error for RF is : 17.636823308639023
The Cross-validated Mean Squared Error for XGB is : 17.88009879064699
The Cross-validated Mean Squared Error for NN is : 20.20466981812718
The Cross-validated Mean Squared Error for KernReg is : 17.73572469703669
