In [7]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = xgb.XGBRegressor()

model = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1, max_depth=10, alpha=10, n_estimators=500, reg_lambda=2) #  0.2259856363283106

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")
# 1 all R2 = 0.9275
# Average 100 R2: 0.8509

Fold 1: MAPE = 0.1252673864218194 , RMSE = 3.424656302045579, R2 = 0.8794065069652048
Fold 2: MAPE = 0.10963158774716479 , RMSE = 3.6298198118573013, R2 = 0.8598460383672222
Fold 3: MAPE = 0.12011879982876723 , RMSE = 3.1943478368042073, R2 = 0.8089803221851577
Fold 4: MAPE = 0.12679219867487057 , RMSE = 3.97427999139764, R2 = 0.857783478698739
Fold 5: MAPE = 0.10141986732015476 , RMSE = 2.9030853139503088, R2 = 0.8605302475856391
Average MAPE: 0.11664596799855535
Average RMSE: 3.4252378512110075
Average R2: 0.8533093187603924
Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.1151875654377852 , RMSE = 3.711050214285561, R2 = 0.8579636945465061
Fold 2: MAPE = 0.10737725641430071 , RMSE = 3.055647030630349, R2 = 0.8716818503116642
Fold 3: MAPE = 0.135044388594586 , RMSE = 4.079542608760064, R2 = 0.8440580700438972
Fold 4: MAPE = 0.10254393503626208 , RMSE 

Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.10980711981835696 , RMSE = 3.8418343682453435, R2 = 0.8232353191642912
Fold 2: MAPE = 0.12182112760042357 , RMSE = 3.3972280466170406, R2 = 0.8592905004604298
Fold 3: MAPE = 0.11316589365795382 , RMSE = 3.774964824519957, R2 = 0.8011962687048672
Fold 4: MAPE = 0.11585338842413057 , RMSE = 2.6155260135904603, R2 = 0.8964466804309328
Fold 5: MAPE = 0.11217513708117088 , RMSE = 3.4234700127035866, R2 = 0.8974327717678604
Average MAPE: 0.11456453331640715
Average RMSE: 3.4106046531352776
Average R2: 0.8555203081056764
Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.10371041511806409 , RMSE = 3.1830286400365426, R2 = 0.8569870657807711
Fold 2: MAPE = 0.1199559567159372 , RMSE 

Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.11961716409982681 , RMSE = 3.618533579774967, R2 = 0.860445961038233
Fold 2: MAPE = 0.11253777398578294 , RMSE = 3.0357996340871956, R2 = 0.9100932794959113
Fold 3: MAPE = 0.10881365684974839 , RMSE = 2.9779242003927946, R2 = 0.8761887144564746
Fold 4: MAPE = 0.11727065419428993 , RMSE = 3.8471741343172203, R2 = 0.8346503174520625
Fold 5: MAPE = 0.12684659502448042 , RMSE = 3.6255478804834764, R2 = 0.7943874789896143
Average MAPE: 0.11701716883082569
Average RMSE: 3.4209958858111316
Average R2: 0.8551531502864593
Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.1432857496130468 , RMSE = 3.8993309166376537, R2 = 0.8639706658001625
Fold 2: MAPE = 0.11168178065335126 , RMSE =

Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.10515703629410846 , RMSE = 3.664655967813085, R2 = 0.8312590551080105
Fold 2: MAPE = 0.11062523591345548 , RMSE = 3.1509149355793094, R2 = 0.8333323175932031
Fold 3: MAPE = 0.12715549814580837 , RMSE = 4.409204716034758, R2 = 0.8217092597802308
Fold 4: MAPE = 0.11394997129284709 , RMSE = 3.420251334249093, R2 = 0.8709782438509741
Fold 5: MAPE = 0.12739094470000992 , RMSE = 2.9521602641498363, R2 = 0.8874834219431338
Average MAPE: 0.11685573726924585
Average RMSE: 3.5194374435652165
Average R2: 0.8489524596551105
Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.14034652458534513 , RMSE = 3.3865421410647207, R2 = 0.8118525518510005
Fold 2: MAPE = 0.11252528011826232 , RMSE =

Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.11892878528249724 , RMSE = 3.7691308529646075, R2 = 0.8518138159848956
Fold 2: MAPE = 0.12219065707130337 , RMSE = 3.4857726884199565, R2 = 0.8437134367750174
Fold 3: MAPE = 0.12194754803574256 , RMSE = 2.803522642045654, R2 = 0.9057379898330751
Fold 4: MAPE = 0.11242612963118567 , RMSE = 4.174439640566605, R2 = 0.8211224088071571
Fold 5: MAPE = 0.108517292235222 , RMSE = 3.3683885406252045, R2 = 0.8209989463251013
Average MAPE: 0.11680208245119017
Average RMSE: 3.520250872924405
Average R2: 0.8486773195450492
Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.11228021240383594 , RMSE = 4.813995602351683, R2 = 0.81060383240745
Fold 2: MAPE = 0.11471731474670595 , RMSE = 3.32

Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.12206147412345116 , RMSE = 4.009052351938175, R2 = 0.8498243294866151
Fold 2: MAPE = 0.11547958136132719 , RMSE = 3.402642246847527, R2 = 0.8303756337467662
Fold 3: MAPE = 0.11462175729201146 , RMSE = 3.3585051718592003, R2 = 0.8667430555423293
Fold 4: MAPE = 0.11919934055823682 , RMSE = 3.320583747020157, R2 = 0.8622761640893021
Fold 5: MAPE = 0.11869088294741902 , RMSE = 3.588371748298327, R2 = 0.8365532834246636
Average MAPE: 0.11801060725648913
Average RMSE: 3.5358310531926778
Average R2: 0.8491544932579351
Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.10971815436098095 , RMSE = 3.826057015326327, R2 = 0.7851316760905944
Fold 2: MAPE = 0.124702531526496 , RMSE = 3.3

Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.12200862624653734 , RMSE = 3.717717834035543, R2 = 0.861657259433217
Fold 2: MAPE = 0.13534784474534395 , RMSE = 3.502638271000172, R2 = 0.8521544767935554
Fold 3: MAPE = 0.10473521591419804 , RMSE = 3.698714217979462, R2 = 0.8341906499959733
Fold 4: MAPE = 0.12537245380739498 , RMSE = 3.549370707791369, R2 = 0.859507904134697
Fold 5: MAPE = 0.1118694143060394 , RMSE = 3.0784526226119637, R2 = 0.8542676547824263
Average MAPE: 0.11986671100390274
Average RMSE: 3.5093787306837014
Average R2: 0.8523555890279738
Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.12367062997196464 , RMSE = 3.026686074281285, R2 = 0.8825098445473354
Fold 2: MAPE = 0.10841919935682846 , RMSE = 4.11

Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.12208853822188896 , RMSE = 3.910727246870147, R2 = 0.8495380133086762
Fold 2: MAPE = 0.13099791172199463 , RMSE = 3.3602759286720887, R2 = 0.8335216607100879
Fold 3: MAPE = 0.10797536747970354 , RMSE = 3.193909653870505, R2 = 0.8535005725119338
Fold 4: MAPE = 0.1050951661748732 , RMSE = 2.833811231457163, R2 = 0.9064843208725466
Fold 5: MAPE = 0.13428416655661812 , RMSE = 3.899370713938077, R2 = 0.8426305961502583
Average MAPE: 0.12008823003101568
Average RMSE: 3.4396189549615963
Average R2: 0.8571350327107006
Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.12204029123668203 , RMSE = 3.635559027181285, R2 = 0.8439625552208134
Fold 2: MAPE = 0.11769229568078472 , RMSE = 3.

Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.13089713333746145 , RMSE = 4.433263299566432, R2 = 0.8200889620676173
Fold 2: MAPE = 0.11717935918917236 , RMSE = 3.4123366591154496, R2 = 0.8510828271971678
Fold 3: MAPE = 0.09386428758473297 , RMSE = 2.750192475141399, R2 = 0.8993977443024477
Fold 4: MAPE = 0.12030060269421017 , RMSE = 3.745801937193644, R2 = 0.8297206537493527
Fold 5: MAPE = 0.130090247171423 , RMSE = 3.1621514153523553, R2 = 0.8643816445271348
Average MAPE: 0.1184663259954
Average RMSE: 3.500749157273856
Average R2: 0.852934366368744
Feature Importance: [0.05599637 0.01292447 0.08155847 0.05063647 0.09661203 0.10583141
 0.0140959  0.03377081 0.03976984 0.19804266 0.00792919 0.05726223
 0.2455702 ]
Fold 1: MAPE = 0.09924097088366148 , RMSE = 3.3121552009272506, R2 = 0.8688571451748155
Fold 2: MAPE = 0.10593214792628754 , RMSE = 2.86389

In [8]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
data = data.drop(columns=['CHAS'])
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = xgb.XGBRegressor()

model = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1, max_depth=10, alpha=10, n_estimators=500, reg_lambda=2) #  0.2259856363283106

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")
# drop CHAS R2 = 0.9282
# Average 100 R2: 0.8570

Fold 1: MAPE = 0.0888556682528051 , RMSE = 2.64156664414244, R2 = 0.9177388199415338
Fold 2: MAPE = 0.10832006962740844 , RMSE = 3.9286912213152396, R2 = 0.821976484479668
Fold 3: MAPE = 0.1509597189223838 , RMSE = 3.81046573675899, R2 = 0.804987794750469
Fold 4: MAPE = 0.09976482183847253 , RMSE = 2.8462066101816705, R2 = 0.898579019888078
Fold 5: MAPE = 0.1141691982502569 , RMSE = 2.9113161782500456, R2 = 0.9092656172050285
Average MAPE: 0.11241389537826536
Average RMSE: 3.2276492781296775
Average R2: 0.8705095472529555
Feature Importance: [0.01965473 0.00777573 0.14210896 0.09458861 0.18452501 0.0253098
 0.05322867 0.03646983 0.0206613  0.16758561 0.01835125 0.22974053]
Fold 1: MAPE = 0.12855444359276133 , RMSE = 3.273301933838811, R2 = 0.8714806793685549
Fold 2: MAPE = 0.10722867270838331 , RMSE = 3.4268729890449676, R2 = 0.8683228645817681
Fold 3: MAPE = 0.0996274573854415 , RMSE = 3.0331670778302007, R2 = 0.8419510577559699
Fold 4: MAPE = 0.12165438175395599 , RMSE = 2.8835216212

Fold 1: MAPE = 0.13586261742668373 , RMSE = 4.29581588906832, R2 = 0.8187356303986455
Fold 2: MAPE = 0.09648592985916754 , RMSE = 3.1333131200872817, R2 = 0.8656388676809683
Fold 3: MAPE = 0.11755874838198825 , RMSE = 3.4890196144402634, R2 = 0.8336377113377923
Fold 4: MAPE = 0.11235123572872896 , RMSE = 2.6600073970888145, R2 = 0.9225981496249336
Fold 5: MAPE = 0.11451477364368301 , RMSE = 2.901221410771977, R2 = 0.896715132972445
Average MAPE: 0.1153546610080503
Average RMSE: 3.2958754862913318
Average R2: 0.8674650984029568
Feature Importance: [0.01965473 0.00777573 0.14210896 0.09458861 0.18452501 0.0253098
 0.05322867 0.03646983 0.0206613  0.16758561 0.01835125 0.22974053]
Fold 1: MAPE = 0.10532051838577265 , RMSE = 3.528996497430797, R2 = 0.8661118321648975
Fold 2: MAPE = 0.10118757670423065 , RMSE = 2.6008987453390096, R2 = 0.9105519338239887
Fold 3: MAPE = 0.10777644017530362 , RMSE = 3.034067601366913, R2 = 0.888289615832369
Fold 4: MAPE = 0.11024767202277143 , RMSE = 3.226923

Fold 1: MAPE = 0.11393899500856003 , RMSE = 3.664539318314096, R2 = 0.8397438377476227
Fold 2: MAPE = 0.13555651980805516 , RMSE = 4.356361374211584, R2 = 0.8091664962024477
Fold 3: MAPE = 0.12840526361069374 , RMSE = 3.029943296815297, R2 = 0.9013929260132302
Fold 4: MAPE = 0.12268395214470401 , RMSE = 3.2117453542990875, R2 = 0.8513330571710233
Fold 5: MAPE = 0.10639874603199584 , RMSE = 3.606543150392928, R2 = 0.8273655574699321
Average MAPE: 0.12139669532080177
Average RMSE: 3.5738264988065986
Average R2: 0.8458003749208511
Feature Importance: [0.01965473 0.00777573 0.14210896 0.09458861 0.18452501 0.0253098
 0.05322867 0.03646983 0.0206613  0.16758561 0.01835125 0.22974053]
Fold 1: MAPE = 0.15486105402177594 , RMSE = 3.828236855610014, R2 = 0.8662224691713609
Fold 2: MAPE = 0.11417516846124263 , RMSE = 4.0337243738496475, R2 = 0.7945358284421549
Fold 3: MAPE = 0.1106275970676094 , RMSE = 2.7593145230045124, R2 = 0.8943304042166459
Fold 4: MAPE = 0.08949107492591797 , RMSE = 2.7763

Fold 1: MAPE = 0.10305291705717505 , RMSE = 2.519479944759843, R2 = 0.8966966899899871
Fold 2: MAPE = 0.13055065426653742 , RMSE = 3.9353139835963136, R2 = 0.8437569818451321
Fold 3: MAPE = 0.11380859408101546 , RMSE = 2.851953260703096, R2 = 0.9000522858016388
Fold 4: MAPE = 0.14294679170241315 , RMSE = 4.957670919935467, R2 = 0.715044559155019
Fold 5: MAPE = 0.12862788891843469 , RMSE = 3.4433811650872035, R2 = 0.8696090582914916
Average MAPE: 0.12379736920511515
Average RMSE: 3.541559854816385
Average R2: 0.8450319150166538
Feature Importance: [0.01965473 0.00777573 0.14210896 0.09458861 0.18452501 0.0253098
 0.05322867 0.03646983 0.0206613  0.16758561 0.01835125 0.22974053]
Fold 1: MAPE = 0.14336339725813482 , RMSE = 3.6121808703662044, R2 = 0.8224760334041366
Fold 2: MAPE = 0.12028087041685945 , RMSE = 4.362988047924847, R2 = 0.8213618607663088
Fold 3: MAPE = 0.10285505399228237 , RMSE = 2.469181684485291, R2 = 0.9044235059552054
Fold 4: MAPE = 0.12733974044777177 , RMSE = 3.07627

Fold 1: MAPE = 0.11783095804810138 , RMSE = 3.959611234372988, R2 = 0.7886102145521956
Fold 2: MAPE = 0.11860118871552536 , RMSE = 3.3378369302869024, R2 = 0.8871000663328827
Fold 3: MAPE = 0.10010363603410774 , RMSE = 3.111768840858596, R2 = 0.8866415075250212
Fold 4: MAPE = 0.12529844536894513 , RMSE = 3.1771972842703398, R2 = 0.8716019834348947
Fold 5: MAPE = 0.1182497127240424 , RMSE = 3.35780591913588, R2 = 0.8666441055105594
Average MAPE: 0.1160167881781444
Average RMSE: 3.3888440417849415
Average R2: 0.8601195754711106
Feature Importance: [0.01965473 0.00777573 0.14210896 0.09458861 0.18452501 0.0253098
 0.05322867 0.03646983 0.0206613  0.16758561 0.01835125 0.22974053]
Fold 1: MAPE = 0.10132070484220003 , RMSE = 4.0792018510292944, R2 = 0.7921153861020378
Fold 2: MAPE = 0.12182120053819388 , RMSE = 3.7285360940643435, R2 = 0.8177113040705555
Fold 3: MAPE = 0.11598018154269531 , RMSE = 3.395271779587808, R2 = 0.843495112808162
Fold 4: MAPE = 0.11634846733933775 , RMSE = 3.092811

Fold 1: MAPE = 0.10878354486300985 , RMSE = 3.1564013727180202, R2 = 0.8782843451034184
Fold 2: MAPE = 0.1108556441601845 , RMSE = 3.371423355920233, R2 = 0.8677021433066283
Fold 3: MAPE = 0.10186258101481392 , RMSE = 2.810437770881892, R2 = 0.9066581584288088
Fold 4: MAPE = 0.1293140498319982 , RMSE = 3.081300383511169, R2 = 0.8855703483342551
Fold 5: MAPE = 0.11721198057805819 , RMSE = 3.844961182280473, R2 = 0.822343006014801
Average MAPE: 0.11360556008961294
Average RMSE: 3.252904813062358
Average R2: 0.8721116002375823
Feature Importance: [0.01965473 0.00777573 0.14210896 0.09458861 0.18452501 0.0253098
 0.05322867 0.03646983 0.0206613  0.16758561 0.01835125 0.22974053]
Fold 1: MAPE = 0.1381796798090126 , RMSE = 4.3572516615083625, R2 = 0.8214429588473553
Fold 2: MAPE = 0.09824962582277448 , RMSE = 2.7442391341956363, R2 = 0.8694626609426533
Fold 3: MAPE = 0.10823851454271939 , RMSE = 4.037389424696743, R2 = 0.8486749333606821
Fold 4: MAPE = 0.10925827311353278 , RMSE = 3.01821898

Fold 1: MAPE = 0.10240892275379047 , RMSE = 3.0040029118392444, R2 = 0.8542297466179893
Fold 2: MAPE = 0.09750004585703675 , RMSE = 2.9937492799509857, R2 = 0.8940431851147232
Fold 3: MAPE = 0.13243436462943417 , RMSE = 4.027268345051579, R2 = 0.7880396042971238
Fold 4: MAPE = 0.1591597731738236 , RMSE = 4.289603261441639, R2 = 0.81845719783351
Fold 5: MAPE = 0.11145254644422824 , RMSE = 3.4632552773712812, R2 = 0.876142675732714
Average MAPE: 0.12059113057166264
Average RMSE: 3.5555758151309456
Average R2: 0.8461824819192121
Feature Importance: [0.01965473 0.00777573 0.14210896 0.09458861 0.18452501 0.0253098
 0.05322867 0.03646983 0.0206613  0.16758561 0.01835125 0.22974053]
Fold 1: MAPE = 0.11826379689492332 , RMSE = 3.307952017405438, R2 = 0.8534977664069304
Fold 2: MAPE = 0.09196236843651616 , RMSE = 2.9889354436924163, R2 = 0.8906457851526148
Fold 3: MAPE = 0.1314270528080402 , RMSE = 3.4081060864647252, R2 = 0.8751311155722936
Fold 4: MAPE = 0.11961303836635152 , RMSE = 3.906087

Fold 1: MAPE = 0.1104883458449427 , RMSE = 4.461653729614571, R2 = 0.7678426091065207
Fold 2: MAPE = 0.12719188864503614 , RMSE = 3.260500307621827, R2 = 0.842466530717596
Fold 3: MAPE = 0.10834525943082148 , RMSE = 3.12400798653196, R2 = 0.8818641591666175
Fold 4: MAPE = 0.12363370415804893 , RMSE = 3.5787112566634134, R2 = 0.8796537127784927
Fold 5: MAPE = 0.09731180516235231 , RMSE = 2.744480566360947, R2 = 0.9006361187105091
Average MAPE: 0.11339420064824031
Average RMSE: 3.4338707693585433
Average R2: 0.8544926260959473
Feature Importance: [0.01965473 0.00777573 0.14210896 0.09458861 0.18452501 0.0253098
 0.05322867 0.03646983 0.0206613  0.16758561 0.01835125 0.22974053]
Fold 1: MAPE = 0.10094543634796548 , RMSE = 4.049429336391036, R2 = 0.8010490768836092
Fold 2: MAPE = 0.12868209572473335 , RMSE = 3.8988910487450705, R2 = 0.8277564477643977
Fold 3: MAPE = 0.12643727819948558 , RMSE = 3.0842054647138224, R2 = 0.8733707516018219
Fold 4: MAPE = 0.0891034134779859 , RMSE = 2.6729289

Fold 1: MAPE = 0.10935518724693354 , RMSE = 3.914009204190567, R2 = 0.776464814803485
Fold 2: MAPE = 0.11043915601959857 , RMSE = 2.981129868316215, R2 = 0.9216749638583503
Fold 3: MAPE = 0.12037061553139276 , RMSE = 2.9229651175694995, R2 = 0.8913292577098032
Fold 4: MAPE = 0.11787987792090573 , RMSE = 3.3356030116298356, R2 = 0.8446233816750296
Fold 5: MAPE = 0.10794124681579585 , RMSE = 3.008710643402702, R2 = 0.8967538328183551
Average MAPE: 0.11319721670692529
Average RMSE: 3.232483569021764
Average R2: 0.8661692501730046
Feature Importance: [0.01965473 0.00777573 0.14210896 0.09458861 0.18452501 0.0253098
 0.05322867 0.03646983 0.0206613  0.16758561 0.01835125 0.22974053]
Fold 1: MAPE = 0.14164908830157608 , RMSE = 4.249684021569547, R2 = 0.7915583848981357
Fold 2: MAPE = 0.13663209300191986 , RMSE = 3.1479740340181377, R2 = 0.8858895991231264
Fold 3: MAPE = 0.09463483070871276 , RMSE = 2.836205374687566, R2 = 0.8459799323250374
Fold 4: MAPE = 0.11557650747749054 , RMSE = 3.29650

In [9]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
data = data.drop(columns=['CRIM'])
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = xgb.XGBRegressor()

model = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1, max_depth=10, alpha=10, n_estimators=500, reg_lambda=2) #  0.2259856363283106

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")
# drop CRIM R2 = 0.9273
# Average 100 R2: 0.8464

Fold 1: MAPE = 0.10452247803473487 , RMSE = 3.1135601525690957, R2 = 0.8810938413604017
Fold 2: MAPE = 0.13244176187042625 , RMSE = 3.483742588704122, R2 = 0.8553311143970165
Fold 3: MAPE = 0.13808084018281894 , RMSE = 3.0714069229429195, R2 = 0.8950775507588797
Fold 4: MAPE = 0.12703024816387978 , RMSE = 4.127985936919848, R2 = 0.7676667330304893
Fold 5: MAPE = 0.12024030732219398 , RMSE = 4.262248692493748, R2 = 0.7990923700613313
Average MAPE: 0.12446312711481075
Average RMSE: 3.611788858725947
Average R2: 0.8396523219216236
Feature Importance: [0.03512119 0.07520266 0.06383844 0.09084431 0.17027879 0.02291364
 0.04686253 0.03354695 0.02131039 0.20055445 0.01809146 0.22143519]
Fold 1: MAPE = 0.11818192060583255 , RMSE = 4.718950196993466, R2 = 0.7585802796384233
Fold 2: MAPE = 0.11525830996159211 , RMSE = 2.876948338404671, R2 = 0.8870934078381784
Fold 3: MAPE = 0.09902471712382815 , RMSE = 3.282910794544572, R2 = 0.8769442018905359
Fold 4: MAPE = 0.1389266402865549 , RMSE = 3.64555

Fold 1: MAPE = 0.1369660264325887 , RMSE = 4.271638049909384, R2 = 0.8329450661484923
Fold 2: MAPE = 0.11252194869033647 , RMSE = 3.07151925373871, R2 = 0.8749921244184631
Fold 3: MAPE = 0.14514816664637065 , RMSE = 4.339201637117754, R2 = 0.8197149672643945
Fold 4: MAPE = 0.09581293583535336 , RMSE = 2.8167961227205325, R2 = 0.9120381221038709
Fold 5: MAPE = 0.0970058080582763 , RMSE = 2.5610762063997226, R2 = 0.8432524123094749
Average MAPE: 0.11749097713258509
Average RMSE: 3.4120462539772207
Average R2: 0.8565885384489391
Feature Importance: [0.03512119 0.07520266 0.06383844 0.09084431 0.17027879 0.02291364
 0.04686253 0.03354695 0.02131039 0.20055445 0.01809146 0.22143519]
Fold 1: MAPE = 0.12596900657329718 , RMSE = 3.228386488993288, R2 = 0.8883250170214039
Fold 2: MAPE = 0.10990043596744571 , RMSE = 3.0153797714446386, R2 = 0.8519164363915736
Fold 3: MAPE = 0.09662684905526367 , RMSE = 4.011920147059609, R2 = 0.807548593976331
Fold 4: MAPE = 0.11980405080475878 , RMSE = 3.099698

Fold 1: MAPE = 0.11936872685429967 , RMSE = 3.4089304614799065, R2 = 0.8813982784029427
Fold 2: MAPE = 0.11947025738476716 , RMSE = 3.5007223275654398, R2 = 0.8725470654875629
Fold 3: MAPE = 0.11309873771198414 , RMSE = 3.7740834455359193, R2 = 0.8145916181715086
Fold 4: MAPE = 0.12338262940385228 , RMSE = 3.661958600720108, R2 = 0.7945201643856776
Fold 5: MAPE = 0.1217140507026778 , RMSE = 3.5060273559225164, R2 = 0.8562474618391576
Average MAPE: 0.11940688041151622
Average RMSE: 3.5703444382447778
Average R2: 0.8438609176573699
Feature Importance: [0.03512119 0.07520266 0.06383844 0.09084431 0.17027879 0.02291364
 0.04686253 0.03354695 0.02131039 0.20055445 0.01809146 0.22143519]
Fold 1: MAPE = 0.10953926874995268 , RMSE = 4.016313707577926, R2 = 0.8274579360299494
Fold 2: MAPE = 0.10542180500534111 , RMSE = 2.9541657516657334, R2 = 0.8660234146774863
Fold 3: MAPE = 0.13707102910729307 , RMSE = 3.5998485214510794, R2 = 0.8628051696688399
Fold 4: MAPE = 0.12929498494421726 , RMSE = 3.

Fold 1: MAPE = 0.10454385274523952 , RMSE = 3.945801389478825, R2 = 0.8287465187719314
Fold 2: MAPE = 0.12314326694271402 , RMSE = 3.551640543320917, R2 = 0.8601137222060206
Fold 3: MAPE = 0.10044085722884627 , RMSE = 2.918639359733264, R2 = 0.8981895665519134
Fold 4: MAPE = 0.1392240574908335 , RMSE = 3.6720317209161926, R2 = 0.8293037338882343
Fold 5: MAPE = 0.11487472262494472 , RMSE = 3.1648306735509655, R2 = 0.8684681689136622
Average MAPE: 0.1164453514065156
Average RMSE: 3.4505887374000332
Average R2: 0.8569643420663524
Feature Importance: [0.03512119 0.07520266 0.06383844 0.09084431 0.17027879 0.02291364
 0.04686253 0.03354695 0.02131039 0.20055445 0.01809146 0.22143519]
Fold 1: MAPE = 0.1124111197876982 , RMSE = 3.465125676800845, R2 = 0.8481870990980571
Fold 2: MAPE = 0.10835667741772295 , RMSE = 2.642107997848351, R2 = 0.8937733354498866
Fold 3: MAPE = 0.12649828086515158 , RMSE = 3.7075037230139762, R2 = 0.8259388034331194
Fold 4: MAPE = 0.10870251360257642 , RMSE = 3.78904

Fold 1: MAPE = 0.12640960575142882 , RMSE = 3.6597908436516517, R2 = 0.7836241429683035
Fold 2: MAPE = 0.1331306527352299 , RMSE = 3.632558215368664, R2 = 0.8176316238883765
Fold 3: MAPE = 0.11342183562844958 , RMSE = 3.5549679063994954, R2 = 0.8802786937383568
Fold 4: MAPE = 0.11244562368087724 , RMSE = 2.9693531015544, R2 = 0.8939118545151928
Fold 5: MAPE = 0.122573188877014 , RMSE = 3.9652541338216882, R2 = 0.8314958004031596
Average MAPE: 0.1215961813345999
Average RMSE: 3.55638484015918
Average R2: 0.8413884231026778
Feature Importance: [0.03512119 0.07520266 0.06383844 0.09084431 0.17027879 0.02291364
 0.04686253 0.03354695 0.02131039 0.20055445 0.01809146 0.22143519]
Fold 1: MAPE = 0.10825326342224431 , RMSE = 3.2513105674679355, R2 = 0.8256988813988613
Fold 2: MAPE = 0.133548179478216 , RMSE = 4.199885713554187, R2 = 0.8325797382655349
Fold 3: MAPE = 0.1245205184169519 , RMSE = 3.498057840011317, R2 = 0.840888979100269
Fold 4: MAPE = 0.12989674699981255 , RMSE = 3.5747233002738

Fold 1: MAPE = 0.11206955077546212 , RMSE = 3.9702259568786817, R2 = 0.8607776828042524
Fold 2: MAPE = 0.11741435456064335 , RMSE = 3.6282173332278, R2 = 0.8321235516246054
Fold 3: MAPE = 0.13925203839843345 , RMSE = 3.289118931572144, R2 = 0.8655487970393192
Fold 4: MAPE = 0.11403680764474249 , RMSE = 3.101213348957802, R2 = 0.8608665441173331
Fold 5: MAPE = 0.1032260958794938 , RMSE = 3.3803963355861804, R2 = 0.8519728070261283
Average MAPE: 0.11719976945175503
Average RMSE: 3.4738343812445214
Average R2: 0.8542578765223278
Feature Importance: [0.03512119 0.07520266 0.06383844 0.09084431 0.17027879 0.02291364
 0.04686253 0.03354695 0.02131039 0.20055445 0.01809146 0.22143519]
Fold 1: MAPE = 0.12127771176180902 , RMSE = 3.795477207678655, R2 = 0.8464827764371623
Fold 2: MAPE = 0.12129002684009445 , RMSE = 3.3992184796128093, R2 = 0.8344429922705754
Fold 3: MAPE = 0.1110165919995123 , RMSE = 4.894859434235426, R2 = 0.7638359533837511
Fold 4: MAPE = 0.14965630446670686 , RMSE = 3.580592

Fold 1: MAPE = 0.12012375163879661 , RMSE = 3.3329733617244313, R2 = 0.865431257017291
Fold 2: MAPE = 0.12049115360333318 , RMSE = 3.270071797140658, R2 = 0.8522177527533198
Fold 3: MAPE = 0.13280479746432605 , RMSE = 4.40396091913692, R2 = 0.7915681166686992
Fold 4: MAPE = 0.11989340444862277 , RMSE = 3.584055996029339, R2 = 0.8285730464274909
Fold 5: MAPE = 0.1125342445864223 , RMSE = 3.986398892363766, R2 = 0.8387707553679926
Average MAPE: 0.1211694703483002
Average RMSE: 3.7154921932790232
Average R2: 0.8353121856469586
Feature Importance: [0.03512119 0.07520266 0.06383844 0.09084431 0.17027879 0.02291364
 0.04686253 0.03354695 0.02131039 0.20055445 0.01809146 0.22143519]
Fold 1: MAPE = 0.10484886099529954 , RMSE = 3.0557698058846148, R2 = 0.8787025736692423
Fold 2: MAPE = 0.10871168298284219 , RMSE = 3.4456427630071502, R2 = 0.8870736397204944
Fold 3: MAPE = 0.12176248598699481 , RMSE = 3.487143253128826, R2 = 0.8430083550175197
Fold 4: MAPE = 0.1340188950087546 , RMSE = 3.2161093

Fold 1: MAPE = 0.12385629286067147 , RMSE = 2.99892609597839, R2 = 0.8882028593546515
Fold 2: MAPE = 0.13817309682969517 , RMSE = 3.911826230625511, R2 = 0.8431165676838581
Fold 3: MAPE = 0.09861991491207737 , RMSE = 2.578537417687938, R2 = 0.9232369555316219
Fold 4: MAPE = 0.10449230844164822 , RMSE = 3.991662396923156, R2 = 0.8138445948578977
Fold 5: MAPE = 0.11849964895128494 , RMSE = 3.5912583769314415, R2 = 0.8154932673740155
Average MAPE: 0.11672825239907544
Average RMSE: 3.4144421036292876
Average R2: 0.8567788489604089
Feature Importance: [0.03512119 0.07520266 0.06383844 0.09084431 0.17027879 0.02291364
 0.04686253 0.03354695 0.02131039 0.20055445 0.01809146 0.22143519]
Fold 1: MAPE = 0.12161353524117087 , RMSE = 3.398633268295242, R2 = 0.8829875124858738
Fold 2: MAPE = 0.10869886427567814 , RMSE = 4.7609892099866125, R2 = 0.7952975870079746
Fold 3: MAPE = 0.10286081855228568 , RMSE = 2.7615908058156546, R2 = 0.8901360447737428
Fold 4: MAPE = 0.11792944002863623 , RMSE = 3.193

Fold 1: MAPE = 0.10989794627006881 , RMSE = 4.229261078426754, R2 = 0.812061876424583
Fold 2: MAPE = 0.1296980517261569 , RMSE = 3.435323380881566, R2 = 0.85475581460225
Fold 3: MAPE = 0.1360202253792597 , RMSE = 3.602151909844044, R2 = 0.8354529129847088
Fold 4: MAPE = 0.12090812541346821 , RMSE = 3.3183673450447126, R2 = 0.8604945473312076
Fold 5: MAPE = 0.11368377959468134 , RMSE = 3.3327009622086057, R2 = 0.8704417570258536
Average MAPE: 0.12204162567672698
Average RMSE: 3.583560935281137
Average R2: 0.8466413816737205
Feature Importance: [0.03512119 0.07520266 0.06383844 0.09084431 0.17027879 0.02291364
 0.04686253 0.03354695 0.02131039 0.20055445 0.01809146 0.22143519]
Fold 1: MAPE = 0.11561632807574566 , RMSE = 3.0210698416171016, R2 = 0.8695080211227427
Fold 2: MAPE = 0.13459887130393455 , RMSE = 4.356895851221198, R2 = 0.8098137292935018
Fold 3: MAPE = 0.13431727166523302 , RMSE = 4.759216820419634, R2 = 0.7433378658973593
Fold 4: MAPE = 0.10759099916817637 , RMSE = 2.73532596

In [10]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
data = data.drop(columns=['ZN'])
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = xgb.XGBRegressor()

model = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1, max_depth=10, alpha=10, n_estimators=500, reg_lambda=2) #  0.2259856363283106

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")
# drop ZN R2 = 0.9328
# Average 100 R2: 0.8551

Fold 1: MAPE = 0.1019229068063625 , RMSE = 4.677527125744438, R2 = 0.7500003831118988
Fold 2: MAPE = 0.11567461331346765 , RMSE = 3.4440206640750954, R2 = 0.8831463334233298
Fold 3: MAPE = 0.10363767565796805 , RMSE = 2.9132214705073403, R2 = 0.917462613269439
Fold 4: MAPE = 0.13537200249855588 , RMSE = 3.080703577341813, R2 = 0.8194169297674009
Fold 5: MAPE = 0.129236403390328 , RMSE = 3.456476046980329, R2 = 0.8375885068183184
Average MAPE: 0.11716872033333643
Average RMSE: 3.5143897769298027
Average R2: 0.8415229532780775
Feature Importance: [0.0164643  0.08131067 0.0526744  0.09408196 0.1774059  0.02301562
 0.0499284  0.03114054 0.02130955 0.2061543  0.01867778 0.2278366 ]
Fold 1: MAPE = 0.11897283419410432 , RMSE = 4.076469150846799, R2 = 0.817563992411078
Fold 2: MAPE = 0.10209615525151175 , RMSE = 2.5663761108056704, R2 = 0.9156406562106194
Fold 3: MAPE = 0.1256050963597969 , RMSE = 3.1048422998999348, R2 = 0.878568851330545
Fold 4: MAPE = 0.11196593127157645 , RMSE = 2.98841432

Fold 1: MAPE = 0.09911962960466288 , RMSE = 3.1491726270974385, R2 = 0.8820035826591273
Fold 2: MAPE = 0.11796028507388737 , RMSE = 3.807187950921429, R2 = 0.8710854817045379
Fold 3: MAPE = 0.1264516567995059 , RMSE = 4.131463967991434, R2 = 0.7702558549235617
Fold 4: MAPE = 0.11993724406959413 , RMSE = 3.083557922090469, R2 = 0.8867708962917932
Fold 5: MAPE = 0.11957765804347989 , RMSE = 2.986411965464557, R2 = 0.8667252844876937
Average MAPE: 0.11660929471822604
Average RMSE: 3.4315588867130655
Average R2: 0.8553682200133428
Feature Importance: [0.0164643  0.08131067 0.0526744  0.09408196 0.1774059  0.02301562
 0.0499284  0.03114054 0.02130955 0.2061543  0.01867778 0.2278366 ]
Fold 1: MAPE = 0.10147396675534334 , RMSE = 3.2678319982755117, R2 = 0.86244260724876
Fold 2: MAPE = 0.10422519699950186 , RMSE = 3.0513983000337523, R2 = 0.8967537999334185
Fold 3: MAPE = 0.11110262986706905 , RMSE = 2.704349161973604, R2 = 0.9265905092401359
Fold 4: MAPE = 0.13255839833690936 , RMSE = 3.99178

Fold 1: MAPE = 0.09430720351561589 , RMSE = 3.8107859915738387, R2 = 0.8308887686668308
Fold 2: MAPE = 0.1133950220325541 , RMSE = 3.2061380417248726, R2 = 0.8072252923744142
Fold 3: MAPE = 0.13547727648871552 , RMSE = 3.5376625858016206, R2 = 0.8798325242380868
Fold 4: MAPE = 0.11371445364260767 , RMSE = 3.051286630194439, R2 = 0.8868920529678912
Fold 5: MAPE = 0.10528058852735511 , RMSE = 3.2048519356832372, R2 = 0.8914111622385192
Average MAPE: 0.11243490884136967
Average RMSE: 3.3621450369956016
Average R2: 0.8592499600971484
Feature Importance: [0.0164643  0.08131067 0.0526744  0.09408196 0.1774059  0.02301562
 0.0499284  0.03114054 0.02130955 0.2061543  0.01867778 0.2278366 ]
Fold 1: MAPE = 0.13848667493103273 , RMSE = 3.943363162346849, R2 = 0.8523134904130801
Fold 2: MAPE = 0.09551966536662514 , RMSE = 2.8164543661968864, R2 = 0.907831886686814
Fold 3: MAPE = 0.1358895791515492 , RMSE = 3.602214069049429, R2 = 0.8501600281792637
Fold 4: MAPE = 0.11100037503871382 , RMSE = 3.259

Fold 1: MAPE = 0.13738982446860235 , RMSE = 4.031417864367653, R2 = 0.8044767889869879
Fold 2: MAPE = 0.10359679024610477 , RMSE = 3.514188922296045, R2 = 0.8614090061648789
Fold 3: MAPE = 0.10080808843120503 , RMSE = 3.2853860386404588, R2 = 0.8774432533214802
Fold 4: MAPE = 0.11363186473363604 , RMSE = 3.360839848826054, R2 = 0.8740630107806622
Fold 5: MAPE = 0.14035135711476426 , RMSE = 3.5036516860236135, R2 = 0.82038583406154
Average MAPE: 0.1191555849988625
Average RMSE: 3.5390968720307647
Average R2: 0.8475555786631098
Feature Importance: [0.0164643  0.08131067 0.0526744  0.09408196 0.1774059  0.02301562
 0.0499284  0.03114054 0.02130955 0.2061543  0.01867778 0.2278366 ]
Fold 1: MAPE = 0.11714834653354278 , RMSE = 2.990233553192169, R2 = 0.8617195093337658
Fold 2: MAPE = 0.12404538548577719 , RMSE = 3.8999098645228774, R2 = 0.8235586901295373
Fold 3: MAPE = 0.12488752303674522 , RMSE = 3.4169274737850106, R2 = 0.8699484943576965
Fold 4: MAPE = 0.12347354085263602 , RMSE = 3.7156

Fold 1: MAPE = 0.10731854637708453 , RMSE = 3.5664633250071844, R2 = 0.856427868635697
Fold 2: MAPE = 0.12089121437748938 , RMSE = 3.4272148899245507, R2 = 0.8540861941121469
Fold 3: MAPE = 0.11235637316676643 , RMSE = 4.145470786343809, R2 = 0.8476466044569277
Fold 4: MAPE = 0.1437706297090573 , RMSE = 3.1140843864832313, R2 = 0.8428762715801117
Fold 5: MAPE = 0.10394412648096417 , RMSE = 2.6220932482181687, R2 = 0.9023461916303142
Average MAPE: 0.11765617802227235
Average RMSE: 3.3750653271953888
Average R2: 0.8606766260830394
Feature Importance: [0.0164643  0.08131067 0.0526744  0.09408196 0.1774059  0.02301562
 0.0499284  0.03114054 0.02130955 0.2061543  0.01867778 0.2278366 ]
Fold 1: MAPE = 0.11025568124951299 , RMSE = 3.675442721068056, R2 = 0.8443489382993531
Fold 2: MAPE = 0.1204016617054869 , RMSE = 3.691049175684047, R2 = 0.8567477552283678
Fold 3: MAPE = 0.13136091011655487 , RMSE = 3.7872384197726467, R2 = 0.8501589163066742
Fold 4: MAPE = 0.11106329133709965 , RMSE = 2.958

Fold 1: MAPE = 0.11857140905498433 , RMSE = 3.066131853898881, R2 = 0.8661571346195296
Fold 2: MAPE = 0.11139613120532345 , RMSE = 3.229045430538463, R2 = 0.8876547993668146
Fold 3: MAPE = 0.1223338512313821 , RMSE = 3.4350140123979185, R2 = 0.8675257127633124
Fold 4: MAPE = 0.1221341513587742 , RMSE = 3.931975783251136, R2 = 0.813594242869053
Fold 5: MAPE = 0.11623528523848421 , RMSE = 3.4272480578975846, R2 = 0.8618238918667397
Average MAPE: 0.11813416561778967
Average RMSE: 3.4178830275967966
Average R2: 0.8593511562970898
Feature Importance: [0.0164643  0.08131067 0.0526744  0.09408196 0.1774059  0.02301562
 0.0499284  0.03114054 0.02130955 0.2061543  0.01867778 0.2278366 ]
Fold 1: MAPE = 0.12120204559002683 , RMSE = 5.2501396898028645, R2 = 0.7491011685574587
Fold 2: MAPE = 0.09578482837850795 , RMSE = 2.91062332425093, R2 = 0.8810692673814607
Fold 3: MAPE = 0.12238009092648142 , RMSE = 3.234273652026466, R2 = 0.8217668152359507
Fold 4: MAPE = 0.12983088883998303 , RMSE = 3.462454

Fold 1: MAPE = 0.10465327099140302 , RMSE = 3.2112852315090583, R2 = 0.871052972803827
Fold 2: MAPE = 0.13771078856522387 , RMSE = 3.856997807441517, R2 = 0.8190063937629752
Fold 3: MAPE = 0.13070304500872387 , RMSE = 3.490245959167387, R2 = 0.8753486970993395
Fold 4: MAPE = 0.11246701795204553 , RMSE = 3.5289632255041563, R2 = 0.8104033454220383
Fold 5: MAPE = 0.12271405490440634 , RMSE = 3.277401762293077, R2 = 0.8849571612077014
Average MAPE: 0.12164963548436052
Average RMSE: 3.472978797183039
Average R2: 0.8521537140591763
Feature Importance: [0.0164643  0.08131067 0.0526744  0.09408196 0.1774059  0.02301562
 0.0499284  0.03114054 0.02130955 0.2061543  0.01867778 0.2278366 ]
Fold 1: MAPE = 0.11991284026743551 , RMSE = 3.3266184199801025, R2 = 0.8839301032759754
Fold 2: MAPE = 0.12325675509985143 , RMSE = 2.9380419186667375, R2 = 0.8303300474787214
Fold 3: MAPE = 0.12772283483498698 , RMSE = 3.6307239551805113, R2 = 0.8772787688951585
Fold 4: MAPE = 0.11138497724992398 , RMSE = 3.96

Fold 1: MAPE = 0.13559068918316475 , RMSE = 4.1082943570464945, R2 = 0.8142324814441987
Fold 2: MAPE = 0.07503799098120581 , RMSE = 2.443397786962976, R2 = 0.9167960528494012
Fold 3: MAPE = 0.12605027819445624 , RMSE = 3.388737551406505, R2 = 0.8699667490750037
Fold 4: MAPE = 0.10454843416812032 , RMSE = 3.6205375661262633, R2 = 0.8190546127254859
Fold 5: MAPE = 0.15738389983495915 , RMSE = 4.011525722473778, R2 = 0.8291239738944014
Average MAPE: 0.11972225847238124
Average RMSE: 3.514498596803203
Average R2: 0.8498347739976981
Feature Importance: [0.0164643  0.08131067 0.0526744  0.09408196 0.1774059  0.02301562
 0.0499284  0.03114054 0.02130955 0.2061543  0.01867778 0.2278366 ]
Fold 1: MAPE = 0.10145666666164763 , RMSE = 4.006890366639769, R2 = 0.8423440629896881
Fold 2: MAPE = 0.12039311959686372 , RMSE = 3.04289594096429, R2 = 0.8784516332402162
Fold 3: MAPE = 0.09950364525947385 , RMSE = 2.736058191375939, R2 = 0.8831755055686855
Fold 4: MAPE = 0.13814082296236133 , RMSE = 3.44903

Fold 1: MAPE = 0.13862964205252404 , RMSE = 4.18822912595483, R2 = 0.7977721582143511
Fold 2: MAPE = 0.11114500553689048 , RMSE = 3.4329657573411256, R2 = 0.8147569003453851
Fold 3: MAPE = 0.10548065323954045 , RMSE = 2.7759467312012203, R2 = 0.8995816175934723
Fold 4: MAPE = 0.11536856440194425 , RMSE = 3.3073900602345803, R2 = 0.8745253562573927
Fold 5: MAPE = 0.11827656640108784 , RMSE = 3.704955975053011, R2 = 0.8699341927934774
Average MAPE: 0.1177800863263974
Average RMSE: 3.481897529956954
Average R2: 0.8513140450408156
Feature Importance: [0.0164643  0.08131067 0.0526744  0.09408196 0.1774059  0.02301562
 0.0499284  0.03114054 0.02130955 0.2061543  0.01867778 0.2278366 ]
Fold 1: MAPE = 0.1279296318492421 , RMSE = 3.4348681445309284, R2 = 0.8811963011246153
Fold 2: MAPE = 0.12060467112212492 , RMSE = 3.3138815932351533, R2 = 0.8152512324084711
Fold 3: MAPE = 0.10748186085079577 , RMSE = 3.285109146512983, R2 = 0.8842390787210157
Fold 4: MAPE = 0.10390007945868734 , RMSE = 3.3298

In [11]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
data = data.drop(columns=['CHAS','ZN'])
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = xgb.XGBRegressor()

model = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1, max_depth=10, alpha=10, n_estimators=500, reg_lambda=2) #  0.2259856363283106

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")
# drop CHAS and ZN R2 = 0.9378
# Average 100 R2: 0.8678

Fold 1: MAPE = 0.1255184171328137 , RMSE = 4.741705063255072, R2 = 0.7256920076944541
Fold 2: MAPE = 0.13127979171383203 , RMSE = 3.6635715596119947, R2 = 0.8546642292957778
Fold 3: MAPE = 0.09896674116098211 , RMSE = 2.8622050435740714, R2 = 0.9097825216448784
Fold 4: MAPE = 0.11140377716931285 , RMSE = 3.1957512090196527, R2 = 0.8553634780679793
Fold 5: MAPE = 0.09496025353443786 , RMSE = 2.502481906275448, R2 = 0.9268122265019969
Average MAPE: 0.1124257961422757
Average RMSE: 3.3931429563472477
Average R2: 0.8544628926410173
Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.11301781081890501 , RMSE = 3.547639403044205, R2 = 0.8909114209530987
Fold 2: MAPE = 0.12713163104070163 , RMSE = 3.6677928194423774, R2 = 0.8492512886583629
Fold 3: MAPE = 0.10888965605778089 , RMSE = 3.9174654878440336, R2 = 0.829195341564307
Fold 4: MAPE = 0.11729554016359878 , RMSE = 2.93006323622716

Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.11050621927508035 , RMSE = 3.0010069648009163, R2 = 0.8855318246194933
Fold 2: MAPE = 0.1119235545860065 , RMSE = 2.8570630012535503, R2 = 0.9163253987885178
Fold 3: MAPE = 0.1360698247714047 , RMSE = 4.9987629996044225, R2 = 0.7140580102132004
Fold 4: MAPE = 0.09484305945097433 , RMSE = 2.734385453372776, R2 = 0.9108207776478816
Fold 5: MAPE = 0.09889584167440772 , RMSE = 3.2108047135459032, R2 = 0.8600674890079651
Average MAPE: 0.11044769995157473
Average RMSE: 3.360404626515513
Average R2: 0.8573607000554115
Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.12819497472569907 , RMSE = 4.108848957440877, R2 = 0.8503780041142068
Fold 2: MAPE = 0.10983108446334726 , RMSE = 4.640485739492477, R2 = 0.7307509941574575
Fold

Fold 1: MAPE = 0.11223991210920307 , RMSE = 4.0328739078959, R2 = 0.7892807379293448
Fold 2: MAPE = 0.11775149324144504 , RMSE = 3.2263766000979928, R2 = 0.8795372464474219
Fold 3: MAPE = 0.09327628446289772 , RMSE = 2.51239073389829, R2 = 0.9082173978133476
Fold 4: MAPE = 0.11029148727005511 , RMSE = 2.892061329832691, R2 = 0.9019188570521376
Fold 5: MAPE = 0.11662201574219341 , RMSE = 3.549098165470657, R2 = 0.8777106772171603
Average MAPE: 0.11003623856515887
Average RMSE: 3.2425601474391064
Average R2: 0.8713329832918826
Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.13438132690204277 , RMSE = 3.0801994607190397, R2 = 0.8934981861004633
Fold 2: MAPE = 0.1038374786490373 , RMSE = 2.6050270176614636, R2 = 0.9017250468659068
Fold 3: MAPE = 0.10710167560963814 , RMSE = 2.777678349712852, R2 = 0.9073102996872991
Fold 4: MAPE = 0.10562525825874995 , RMSE = 3.0013692909951977,

Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.09905672762956708 , RMSE = 3.778887923703, R2 = 0.8509381026793199
Fold 2: MAPE = 0.11914561310962148 , RMSE = 2.9006175608774183, R2 = 0.8751112116108918
Fold 3: MAPE = 0.09262273907084713 , RMSE = 2.801951676678769, R2 = 0.8971853912686942
Fold 4: MAPE = 0.10819360925263724 , RMSE = 3.5706840728810296, R2 = 0.8628610085081316
Fold 5: MAPE = 0.12813224766499148 , RMSE = 2.9680371121379223, R2 = 0.8986931539854607
Average MAPE: 0.10943018734553286
Average RMSE: 3.204035669255627
Average R2: 0.8769577736104998
Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.11236718785867474 , RMSE = 2.9195226435548105, R2 = 0.9094369800943513
Fold 2: MAPE = 0.11006882303352815 , RMSE = 2.8905821435518795, R2 = 0.9019413913870711
Fold

Fold 1: MAPE = 0.10548188426519604 , RMSE = 2.703161742673259, R2 = 0.8796115160501216
Fold 2: MAPE = 0.09961483344425201 , RMSE = 3.0372537535362683, R2 = 0.8880591260450117
Fold 3: MAPE = 0.12771848348013118 , RMSE = 4.328073918819523, R2 = 0.828690640660255
Fold 4: MAPE = 0.10174848917894468 , RMSE = 3.0896095160320156, R2 = 0.8938633161997324
Fold 5: MAPE = 0.11372029256020688 , RMSE = 2.8570419642531886, R2 = 0.8948896772905728
Average MAPE: 0.10965679658574615
Average RMSE: 3.2030281790628505
Average R2: 0.8770228552491387
Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.12729699277829237 , RMSE = 3.34999157018877, R2 = 0.8892022571927685
Fold 2: MAPE = 0.08752444317532282 , RMSE = 2.9598152660913737, R2 = 0.8640285441362859
Fold 3: MAPE = 0.11653378425546615 , RMSE = 3.947825198967877, R2 = 0.7997987986960176
Fold 4: MAPE = 0.10624401996682713 , RMSE = 2.96819236561701

Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.1221315567256943 , RMSE = 3.1867185060105996, R2 = 0.8935123773487528
Fold 2: MAPE = 0.1160769137013485 , RMSE = 3.68859961262667, R2 = 0.8651662928257324
Fold 3: MAPE = 0.11133177302202074 , RMSE = 3.1440807893690788, R2 = 0.8607710804070738
Fold 4: MAPE = 0.10908144415766818 , RMSE = 2.9106031505119936, R2 = 0.8748219198958485
Fold 5: MAPE = 0.10642788508105595 , RMSE = 3.5843947543715147, R2 = 0.8480740313551349
Average MAPE: 0.11300991453755753
Average RMSE: 3.302879362577971
Average R2: 0.8684691403665085
Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.12136683950046444 , RMSE = 4.601860420555767, R2 = 0.7273736502600763
Fold 2: MAPE = 0.1043967915908514 , RMSE = 3.3119018232102815, R2 = 0.8526265965599304
Fold 

Fold 1: MAPE = 0.11007426509663741 , RMSE = 3.034262175601162, R2 = 0.8533993145307575
Fold 2: MAPE = 0.1026917314591319 , RMSE = 2.946797088961545, R2 = 0.8871021000341284
Fold 3: MAPE = 0.11825697695982557 , RMSE = 4.270804590817681, R2 = 0.8349249037916714
Fold 4: MAPE = 0.09969155272795271 , RMSE = 2.8220926161729936, R2 = 0.9030673622023528
Fold 5: MAPE = 0.13767621863717813 , RMSE = 3.151245538488554, R2 = 0.8840676157777834
Average MAPE: 0.11367814897614514
Average RMSE: 3.2450404020083874
Average R2: 0.8725122592673387
Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.10545309021300159 , RMSE = 3.0421396183907508, R2 = 0.8755955004980762
Fold 2: MAPE = 0.10750437335777888 , RMSE = 3.2860446360763174, R2 = 0.8950531816683889
Fold 3: MAPE = 0.12038328133909917 , RMSE = 3.2715458727742623, R2 = 0.8979113051149769
Fold 4: MAPE = 0.12139403663818639 , RMSE = 3.5858398835693

Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.13219042655037125 , RMSE = 4.47995033013453, R2 = 0.8329806356460367
Fold 2: MAPE = 0.09744103994197309 , RMSE = 2.9071902333258284, R2 = 0.894826889848047
Fold 3: MAPE = 0.1100805762363598 , RMSE = 2.5711641034641364, R2 = 0.9106298845698111
Fold 4: MAPE = 0.10859560176243233 , RMSE = 3.0782779237304054, R2 = 0.8715456019028222
Fold 5: MAPE = 0.10483418987153445 , RMSE = 3.3369814169321574, R2 = 0.8431469894073857
Average MAPE: 0.11062836687253419
Average RMSE: 3.274712801517411
Average R2: 0.8706260002748205
Feature Importance: [0.04659854 0.11439081 0.08344424 0.21859221 0.00991015 0.03528416
 0.01159328 0.03701961 0.15205887 0.01924347 0.27186468]
Fold 1: MAPE = 0.10617644273682282 , RMSE = 2.9643038338541965, R2 = 0.8986138765197785
Fold 2: MAPE = 0.10052436573040614 , RMSE = 2.9345101581079, R2 = 0.8813531613094064
Fold 3

In [12]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
data = data.drop(columns=['CHAS','CRIM'])
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = xgb.XGBRegressor()

model = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1, max_depth=10, alpha=10, n_estimators=500, reg_lambda=2) #  0.2259856363283106

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")
# drop CHAS and CRIM R2 = 0.9300
# Average 100 R2: 0.8678

Fold 1: MAPE = 0.10916700258206513 , RMSE = 3.13118222596326, R2 = 0.8614063099566334
Fold 2: MAPE = 0.12021210865301092 , RMSE = 3.5194562308482005, R2 = 0.8730151439319418
Fold 3: MAPE = 0.13404799379058926 , RMSE = 4.0401978663707, R2 = 0.8505070541419826
Fold 4: MAPE = 0.09559355860116446 , RMSE = 3.2841482663272132, R2 = 0.8521186334291578
Fold 5: MAPE = 0.11200086035774129 , RMSE = 2.7145524262635523, R2 = 0.8937157832669019
Average MAPE: 0.11420430479691421
Average RMSE: 3.3379074031545857
Average R2: 0.8661525849453235
Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.11636847070035163 , RMSE = 3.02976010809641, R2 = 0.891283439552854
Fold 2: MAPE = 0.10973533224488556 , RMSE = 3.2516227551780545, R2 = 0.8429603644145761
Fold 3: MAPE = 0.12801929179959112 , RMSE = 3.2998122735068365, R2 = 0.8741910899628385
Fold 4: MAPE = 0.12112544967268052 , RMSE = 3.179937960833753,

Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.11505732363201715 , RMSE = 3.824661345330227, R2 = 0.8229648940851022
Fold 2: MAPE = 0.11692774178891774 , RMSE = 3.9913892385456284, R2 = 0.8456240604808148
Fold 3: MAPE = 0.12197448378020966 , RMSE = 3.2935405386303107, R2 = 0.8836703792244418
Fold 4: MAPE = 0.10059182635919303 , RMSE = 2.7911102634703155, R2 = 0.9105915556289187
Fold 5: MAPE = 0.10804896213326841 , RMSE = 3.1053440509891757, R2 = 0.8043501417367799
Average MAPE: 0.1125200675387212
Average RMSE: 3.4012090873931315
Average R2: 0.8534402062312114
Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.12047774380627554 , RMSE = 3.119473111396288, R2 = 0.8874133527317307
Fold 2: MAPE = 0.1103436258677174 , RMSE = 2.6270755153460734, R2 = 0.9048425983909476
Fo

Fold 1: MAPE = 0.10307139158860991 , RMSE = 3.1652804782268666, R2 = 0.8563345024201408
Fold 2: MAPE = 0.11819702350396598 , RMSE = 3.3893163444370864, R2 = 0.8410767090064262
Fold 3: MAPE = 0.1128017260263049 , RMSE = 3.7752443763324197, R2 = 0.8592119044833122
Fold 4: MAPE = 0.11854158497132192 , RMSE = 3.021706564000726, R2 = 0.9052527201544487
Fold 5: MAPE = 0.1068394346952987 , RMSE = 2.848116814261525, R2 = 0.8966165595148343
Average MAPE: 0.11189023215710028
Average RMSE: 3.2399329154517247
Average R2: 0.8716984791158324
Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.11532148967898155 , RMSE = 2.959776830014147, R2 = 0.8633078095968811
Fold 2: MAPE = 0.12363476891911396 , RMSE = 3.026002091856807, R2 = 0.8951215229695111
Fold 3: MAPE = 0.13461201370580209 , RMSE = 3.9898635286783577, R2 = 0.8387652419268056
Fold 4: MAPE = 0.0931757040716232 , RMSE = 3.618832198349542

Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.10907849859325798 , RMSE = 3.265526009874832, R2 = 0.8529861024896863
Fold 2: MAPE = 0.13837737234952718 , RMSE = 3.819999952858565, R2 = 0.8408695965489102
Fold 3: MAPE = 0.11495394302001744 , RMSE = 3.1058799222649345, R2 = 0.8794726981989349
Fold 4: MAPE = 0.12744107775871613 , RMSE = 3.2041455292224277, R2 = 0.8775640759149507
Fold 5: MAPE = 0.08958846715742218 , RMSE = 3.550853952410278, R2 = 0.8591154149237046
Average MAPE: 0.11588787177578819
Average RMSE: 3.3892810733262073
Average R2: 0.8620015776152373
Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.12489932744234174 , RMSE = 4.691010372841143, R2 = 0.7906815759720764
Fold 2: MAPE = 0.11112040999849228 , RMSE = 3.422615483990608, R2 = 0.8051587950023159
Fol

Fold 1: MAPE = 0.11694648882108916 , RMSE = 3.2803067937898414, R2 = 0.8846309508862062
Fold 2: MAPE = 0.11116157138735581 , RMSE = 3.163201369113111, R2 = 0.8570874584764381
Fold 3: MAPE = 0.10927578441769323 , RMSE = 2.73982065260745, R2 = 0.9003448592974315
Fold 4: MAPE = 0.11775801304122491 , RMSE = 2.91975146134993, R2 = 0.9044050801930186
Fold 5: MAPE = 0.10412964031476833 , RMSE = 3.9496828423880403, R2 = 0.8320697502000931
Average MAPE: 0.1118542995964263
Average RMSE: 3.210552623849675
Average R2: 0.8757076198106375
Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.10255945078943186 , RMSE = 3.934123598775506, R2 = 0.8385274237034128
Fold 2: MAPE = 0.11498106567813579 , RMSE = 2.8727635559062836, R2 = 0.8775458025291659
Fold 3: MAPE = 0.11458391057067885 , RMSE = 3.0607548551250345, R2 = 0.8688690751827518
Fold 4: MAPE = 0.11059197171695882 , RMSE = 3.3425922246709643

Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.09521692407741432 , RMSE = 2.9703688427704154, R2 = 0.8889344257928208
Fold 2: MAPE = 0.10172852142880769 , RMSE = 2.7898304141980144, R2 = 0.9123786602711678
Fold 3: MAPE = 0.11274788219391874 , RMSE = 3.0792715634642835, R2 = 0.8779399336991776
Fold 4: MAPE = 0.1190529452729989 , RMSE = 2.9707454716807726, R2 = 0.8908959404585126
Fold 5: MAPE = 0.12437886712641408 , RMSE = 4.16250642585457, R2 = 0.8138800437074382
Average MAPE: 0.11062502801991074
Average RMSE: 3.1945445435936106
Average R2: 0.8768058007858233
Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.10820883986060556 , RMSE = 3.6195442814416707, R2 = 0.8201768808168178
Fold 2: MAPE = 0.12383191861917658 , RMSE = 3.626672324744178, R2 = 0.8580953313007308
Fo

Fold 1: MAPE = 0.1221518651044979 , RMSE = 3.085039999019599, R2 = 0.8826470734785272
Fold 2: MAPE = 0.1171329430426417 , RMSE = 3.3893817950409004, R2 = 0.8466407726534201
Fold 3: MAPE = 0.13349991031456773 , RMSE = 4.287292892018203, R2 = 0.8246477148114157
Fold 4: MAPE = 0.08387383071294603 , RMSE = 2.377803587330771, R2 = 0.9153379834039688
Fold 5: MAPE = 0.11322519548465462 , RMSE = 3.0029948403054383, R2 = 0.9021945054441105
Average MAPE: 0.1139767489318616
Average RMSE: 3.2285026227429823
Average R2: 0.8742936099582884
Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.11438600217124172 , RMSE = 3.4426308790212627, R2 = 0.8700852303511979
Fold 2: MAPE = 0.11414415982152348 , RMSE = 4.357310393596626, R2 = 0.8097765762394842
Fold 3: MAPE = 0.11184199891972028 , RMSE = 2.963014664770706, R2 = 0.9090756355561186
Fold 4: MAPE = 0.10751991450786129 , RMSE = 2.7313504272192493

Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.12281978960897431 , RMSE = 3.320368870368577, R2 = 0.8678949711456012
Fold 2: MAPE = 0.11250971659528208 , RMSE = 2.9552830738317177, R2 = 0.8758454459929825
Fold 3: MAPE = 0.10196034193759593 , RMSE = 3.7239290019580538, R2 = 0.882723734332818
Fold 4: MAPE = 0.11314139302931935 , RMSE = 3.436126427094591, R2 = 0.8709545207950984
Fold 5: MAPE = 0.1201902312335703 , RMSE = 3.1704930351646357, R2 = 0.7984229485794851
Average MAPE: 0.11412429448094841
Average RMSE: 3.321240081683515
Average R2: 0.8591683241691971
Feature Importance: [0.00809261 0.10909961 0.09735322 0.20609891 0.01103021 0.03866891
 0.02214415 0.04932115 0.15224355 0.02714925 0.27879837]
Fold 1: MAPE = 0.11347860016178203 , RMSE = 3.05314418403131, R2 = 0.8214206546140211
Fold 2: MAPE = 0.13180473772742313 , RMSE = 4.186755904838182, R2 = 0.8385830559708771
Fold 3

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
data = data.drop(columns=['CHAS','CRIM'])
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = xgb.XGBRegressor()

model = xgb.XGBRegressor(objective='reg:linear', colsample_bytree=0.3, learning_rate=0.1, max_depth=10, alpha=10, n_estimators=500, reg_lambda=2) #  0.2259856363283106

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")
# drop CHAS and CRIM R2 = 0.9378
# Average 100 R2: 0.8678