In [3]:
import numpy as np
import pandas as pd
from scipy.linalg import lstsq
from scipy.interpolate import interp1d, LinearNDInterpolator, NearestNDInterpolator
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import xgboost as xgb 
from sklearn.model_selection import train_test_split as tts

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping

In [9]:
#Tricubic Kernel, Lowess, and general Boosting definitions

def Tricubic(x):
  """Tricubic kernel weight for every row of `x`.

  A 1-D input is treated as a column of scalar observations. For each row the
  Euclidean norm d is computed; rows with d > 1 get weight 0, all others get
  70/81 * (1 - d**3)**3.

  Returns a 1-D array with one weight per row.
  """
  rows = x.reshape(-1, 1) if x.ndim == 1 else x
  dist = np.sqrt((rows**2).sum(axis=1))
  inside_weight = 70/81 * (1 - dist**3)**3
  return np.where(dist > 1, 0, inside_weight)

def lw_reg(X, y, xnew, kern, tau, intercept):
    """Locally weighted linear regression fitted on (X, y), evaluated at `xnew`.

    A separate weighted least-squares fit is solved around every training
    point; the fitted values at the training points are then interpolated to
    `xnew`.

    Parameters
    ----------
    X : ndarray, shape (n,) or (n, p)
        Training inputs. A 1-D array is treated as a single feature.
    y : ndarray, shape (n,) or (n, 1)
        Training targets.
    xnew : ndarray
        Points at which predictions are wanted.
    kern : callable
        Kernel called as ``kern((X - X[i]) / (2*tau))``; it must return one
        weight per training row.
    tau : float
        Bandwidth of the kernel.
    intercept : bool
        If truthy, a column of ones is prepended to the design matrix.

    Returns
    -------
    ndarray
        Interpolated predictions at `xnew`, in the shape produced by the
        interpolator.

    Notes
    -----
    With a single feature, ``interp1d`` with linear extrapolation is used.
    With several features, ``LinearNDInterpolator`` is used; it yields NaN
    outside the convex hull of X, and those entries are replaced by the *raw*
    target of the nearest training point (not the smoothed fit).
    """
    n = len(X)  # number of training observations
    yest = np.zeros(n)

    # Work with a column-vector target and a 2-D feature matrix throughout.
    if len(y.shape) == 1:
        y = y.reshape(-1, 1)
    if len(X.shape) == 1:
        X = X.reshape(-1, 1)
    X1 = np.column_stack([np.ones((len(X), 1)), X]) if intercept else X

    # w[k, j] is the kernel weight of training point j relative to centre X[k].
    w = np.array([kern((X - X[i])/(2*tau)) for i in range(n)])

    # One weighted least-squares problem per training point.
    for i in range(n):
        # Scaling the columns of X1.T by the weights equals X1.T @ diag(w[:, i])
        # without materialising an n-by-n diagonal matrix on every iteration.
        XtW = X1.T * w[:, i]
        b = XtW.dot(y)
        A = XtW.dot(X1)
        # A = A + 0.001*np.eye(X1.shape[1])  # optional L2 (ridge) regularization
        beta = lstsq(A, b)[0]
        # beta is a (p, 1) column; flatten it so the fitted value is a true
        # scalar (assigning a size-1 array to an element is deprecated in NumPy).
        yest[i] = np.dot(X1[i], beta.ravel())

    if X.shape[1] == 1:
        f = interp1d(X.flatten(), yest, fill_value='extrapolate')
    else:
        f = LinearNDInterpolator(X, yest)
    output = f(xnew)  # may contain NaN where xnew lies outside the convex hull of X

    missing = np.isnan(output)
    if missing.any():
        g = NearestNDInterpolator(X, y.ravel())
        output[missing] = g(xnew[missing])
    return output

def rep_boosted_lwr(X, y, xtest, kern, tau, booster, nboost, intercept):
  """Locally weighted regression boosted `nboost` times by `booster`.

  The base fit comes from `lw_reg`. On every round `booster` is re-fitted on
  the current training residuals, and its predictions are added to both the
  training and the test predictions.

  Returns the boosted predictions for `xtest`.
  """
  train_fit = lw_reg(X, y, X, kern, tau, intercept)     # base fit on the training points
  test_fit = lw_reg(X, y, xtest, kern, tau, intercept)  # base fit evaluated on the test points
  residual = y - train_fit                              # what the booster learns from
  for _ in range(nboost):
    booster.fit(X, residual)
    train_fit += booster.predict(X)
    test_fit += booster.predict(xtest)
    residual = y - train_fit
  return test_fit

def _as_flat_float(pred):
  """Copy predictions into a fresh 1-D float array (e.g. Keras returns shape (n, 1))."""
  return np.array(pred, dtype=float).ravel()


def _base_predictions(X, y, xtest, model, kern, tau, intercept, n_estimators, max_depth, model_nn):
  """Fit the base learner named by `model`; return its flat (train, test) predictions."""
  if model == 'LWR':
    train_pred = lw_reg(X, y, X, kern, tau, intercept)
    test_pred = lw_reg(X, y, xtest, kern, tau, intercept)
  elif model == 'RF' or model == 'RFR':
    model_rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    model_rf.fit(X, y)
    train_pred = model_rf.predict(X)
    test_pred = model_rf.predict(xtest)
  elif model == 'NN':
    # `es` is the module-level EarlyStopping callback; it must exist before this runs.
    model_nn.fit(X, y, validation_split=0.3, epochs=100, batch_size=20, verbose=0, callbacks=[es])
    train_pred = model_nn.predict(X)
    test_pred = model_nn.predict(xtest)
  else:
    raise ValueError("Unknown model %r; expected 'LWR', 'RF', 'RFR' or 'NN'." % (model,))
  return _as_flat_float(train_pred), _as_flat_float(test_pred)


def _boost_step(X, residual, xtest, booster, kern, tau_b, intercept):
  """Fit one booster round on `residual`; return its flat (train, test) corrections."""
  if isinstance(booster, str) and booster == 'LWR':
    train_step = lw_reg(X, residual, X, kern, tau_b, intercept)
    test_step = lw_reg(X, residual, xtest, kern, tau_b, intercept)
  else:
    booster.fit(X, residual)
    train_step = booster.predict(X)
    test_step = booster.predict(xtest)
  return np.ravel(train_step), np.ravel(test_step)


def n_boost(X, y, xtest, model, nboost, booster, kern = None, tau = None, tau_b = None,
            intercept = None, n_estimators=None , max_depth=None, model_nn = None):
  """Fit a base model, then boost it `nboost` times on its training residuals.

  Parameters
  ----------
  X, y : training inputs and targets. `y` is flattened to 1-D when residuals
      are computed.
  xtest : inputs at which the boosted prediction is returned.
  model : 'LWR' (locally weighted regression via `lw_reg`), 'RF' / 'RFR'
      (a new RandomForestRegressor) or 'NN' (the supplied `model_nn`).
  nboost : number of boosting rounds.
  booster : the string 'LWR' to boost with `lw_reg`, otherwise an object with
      `fit(X, residual)` and `predict(X)` that is re-fitted each round.
  kern, tau, intercept : forwarded to `lw_reg` for an 'LWR' base model;
      `kern` and `intercept` are also used by an 'LWR' booster.
  tau_b : bandwidth used by an 'LWR' booster.
  n_estimators, max_depth : forwarded to the random-forest base model.
  model_nn : compiled network used when `model == 'NN'`.

  Returns
  -------
  1-D float array of boosted predictions for `xtest`.

  Raises
  ------
  ValueError
      If `model` is not one of the supported names (this used to return
      None silently).

  Notes
  -----
  All predictions are flattened before being combined. Without that, (n, 1)
  network output subtracted from a 1-D `y` broadcasts to an (n, n) residual
  matrix.
  """
  yhat, yhat_test = _base_predictions(X, y, xtest, model, kern, tau, intercept,
                                      n_estimators, max_depth, model_nn)
  target = np.ravel(y)
  residual = target - yhat  # training residuals; these are what the booster trains on
  for _ in range(nboost):
    train_step, test_step = _boost_step(X, residual, xtest, booster, kern, tau_b, intercept)
    yhat += train_step
    yhat_test += test_step
    residual = target - yhat
  return yhat_test

In [5]:
#Neural Network and XGB Architectures
def _build_mlp(hidden_units, dropout_rate, n_features=6):
  """Build and compile a ReLU multilayer perceptron with one linear output unit.

  Each entry of `hidden_units` adds a Dense(relu) layer followed by
  Dropout(dropout_rate); the first Dense layer declares `n_features` inputs.
  The network is compiled with MSE loss and Adam (learning rate 0.001,
  decay 0.0001).
  """
  net = Sequential()
  for depth, units in enumerate(hidden_units):
    if depth == 0:
      net.add(Dense(units, activation="relu", input_dim=n_features))
    else:
      net.add(Dense(units, activation="relu"))
    net.add(Dropout(dropout_rate))
  net.add(Dense(1, activation="linear"))
  net.compile(loss='mean_squared_error', optimizer=Adam(learning_rate= 0.001, decay = 0.0001))
  return net

# Base network (128-64) and a smaller, more heavily regularised booster network (32-16).
model_nn = _build_mlp(hidden_units=(128, 64), dropout_rate=0.1)
booster_nn = _build_mlp(hidden_units=(32, 16), dropout_rate=0.2)

# One shared early-stopping callback (it was previously defined twice with identical settings).
# NOTE(review): n_boost trains for epochs=100, so patience=600 can never trigger - confirm intent.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=600)

model_xgb = xgb.XGBRegressor(objective ='reg:squarederror',n_estimators=100,reg_lambda=20,alpha=1,gamma=10,max_depth=3)


In [6]:
# Read the CSV, then keep its first six columns plus the last column as one
# 2-D array (the cross-validation cell uses columns 0-5 as inputs and the
# final column as the target).
data = pd.read_csv("Data/Concrete_Data.csv")
data = np.hstack([data.iloc[:, :6].values, data.iloc[:, [-1]].values])
data.shape

(1030, 7)

In [10]:
#data Cross-Validation
# One list of per-fold test MSEs for every base-model / booster pairing.
mse_lwr_d = []
mse_lwr_rf = []
mse_rf_d = []
mse_rf_lwr = []
mse_lwr_lwr = []

scale = StandardScaler()
# Outer loop over random states, so the comparison does not hinge on one lucky or unlucky shuffle.
for i in range(12345,12346):
  print('Random State: ' + str(i))
  # The forests below are built without random_state and therefore draw from
  # NumPy's global RNG; seeding it makes the reported numbers repeatable.
  np.random.seed(i)
  kf = KFold(n_splits=10,shuffle=True,random_state=i)
  for j, (idxtrain, idxtest) in enumerate(kf.split(data[:,:2]), start=1):
    #Split the train and test data
    xtrain = data[:,:6][idxtrain]
    ytrain = data[:,-1][idxtrain]
    ytest = data[:,-1][idxtest]
    xtest = data[:,:6][idxtest]
    # Fit the scaler on the training fold only, then apply it to the test fold.
    xtrain = scale.fit_transform(xtrain)
    xtest = scale.transform(xtest)
    print('Split Number: ' + str(j))

    #LWR boosted with decision tree
    booster = DecisionTreeRegressor(max_depth=2)
    yhat_lwr_d = n_boost(xtrain, ytrain, xtest, model = 'LWR', nboost=3, booster=booster,
                   kern = Tricubic, tau = 1.2, intercept = True)

    #LWR boosted with RF
    booster = RandomForestRegressor(n_estimators=25, max_depth=2)
    yhat_lwr_rf = n_boost(xtrain, ytrain, xtest, model = 'LWR', nboost=3, booster=booster,
                   kern = Tricubic, tau = 1.2, intercept = True)

    #LWR boosted with LWR
    booster = 'LWR'
    yhat_lwr_lwr = n_boost(xtrain, ytrain, xtest, model = 'LWR', nboost=3, booster=booster,
                   kern = Tricubic, tau = 1.2, tau_b=0.5, intercept = True)

    #RF boosted with decision tree
    booster = DecisionTreeRegressor(max_depth=2)
    yhat_rf_d = n_boost(xtrain, ytrain, xtest, model = 'RFR', nboost=3, booster=booster, n_estimators=100 , max_depth=3)

    #RF boosted with LWR
    # Kept in its own variable: it used to overwrite yhat_rf_d, so the
    # "Random Forest with Decision Tree" score actually reported this model.
    booster = 'LWR'
    yhat_rf_lwr = n_boost(xtrain, ytrain, xtest, model = 'RFR', nboost=3, booster=booster,
                   kern = Tricubic, tau = 1.2, tau_b=0.5, intercept = True, n_estimators=100 , max_depth=3)

    #Append each model's MSE
    mse_lwr_lwr.append(mse(ytest,yhat_lwr_lwr))
    mse_lwr_d.append(mse(ytest,yhat_lwr_d))
    mse_lwr_rf.append(mse(ytest,yhat_lwr_rf))
    mse_rf_d.append(mse(ytest,yhat_rf_d))
    mse_rf_lwr.append(mse(ytest,yhat_rf_lwr))

print('\n The Results for the Concrete Compressive Strength Dataset were:')
print('The Cross-validated Mean Squared Error for LWR with Decision Tree is : '+str(np.mean(mse_lwr_d)))
print('The Cross-validated Mean Squared Error for LWR with Random Forest is : '+str(np.mean(mse_lwr_rf)))
print('The Cross-validated Mean Squared Error for Random Forest with Decision Tree is : '+str(np.mean(mse_rf_d)))
print('The Cross-validated Mean Squared Error for Random Forest with LWR is : '+str(np.mean(mse_rf_lwr)))
print('The Cross-validated Mean Squared Error for LWR with LWR : '+str(np.mean(mse_lwr_lwr)))



Random State: 12345
Split Number: 1
Split Number: 2
Split Number: 3
Split Number: 4
Split Number: 5
Split Number: 6


## LightGBM

In [None]:
import lightgbm as lgb

# Settings passed to LGBMRegressor as keyword arguments.
hyper_params = {

    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l1','l2'],
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 8,
    "num_leaves": 128,  
    "max_bin": 512,
    "num_iterations": 10000
}

# The module keeps its own alias (`lgb`) so that `lgbm` can hold the estimator
# without overwriting the import; this cell can therefore be re-run safely.
lgbm = lgb.LGBMRegressor(**hyper_params)

In [None]:
# Imported by name because `lgbm` refers to the estimator, not the lightgbm module.
from lightgbm import early_stopping

X_train, X_test, y_train, y_test = tts(data[:,:6], data[:,-1], test_size=0.2, random_state=42)

# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of fit() was removed in LightGBM 4.
# NOTE(review): the held-out split doubles as the early-stopping set, so scores
# on X_test are optimistic - confirm this is acceptable.
lgbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[early_stopping(stopping_rounds=100)])

In [None]:
# Predict on the training split at the early-stopped iteration
# (the estimator is named `lgbm`; `gbm` was never defined).
y_pred = lgbm.predict(X_train, num_iteration=lgbm.best_iteration_)
# Basic RMSE: square root of the mean squared error (`mse` is sklearn's
# mean_squared_error, imported at the top), with y_true passed first.
# NOTE(review): this is a training-set RMSE - confirm that is what should be reported.
print('The rmse of prediction is:', round(mse(y_train, y_pred) ** 0.5, 5))