In [18]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = XGBRegressor()

# parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
#               'objective':['reg:linear'],
#               'learning_rate': [.03, 0.05, .07], #so called `eta` value
#               'max_depth': [5, 6, 7],
#               'min_child_weight': [4],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [500]}

# model = GridSearchCV(model,
#                         parameters,
#                         cv = 2,
#                         n_jobs = 5,
#                         verbose=True)

# model.fit(X_train,y_train)

# print(model.best_score_)
# print(model.best_params_)

# {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
model = xgb.XGBRegressor(**params) 

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")


# Average 100 R2: 0.8792

Fold 1: MAPE = 0.0905196841954225 , RMSE = 2.4093693911426075, R2 = 0.9220904429803096
Fold 2: MAPE = 0.09790130593329792 , RMSE = 2.7222719334927077, R2 = 0.919779165897826
Fold 3: MAPE = 0.12190034226629222 , RMSE = 4.415353953733743, R2 = 0.8023729008484062
Fold 4: MAPE = 0.10228603203404325 , RMSE = 2.4967736074507094, R2 = 0.8839089489993601
Fold 5: MAPE = 0.10051664590809928 , RMSE = 3.8191558176393103, R2 = 0.8478911264248795
Average MAPE: 0.10262480206743103
Average RMSE: 3.1725849406918156
Average R2: 0.8752085170301562
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.09320372904284686 , RMSE = 3.149729833023455, R2 = 0.8582637910038731
Fold 2: MAPE = 0.09360883851648055 , RMSE = 2.3522121804769913, R2 = 0.9354381033148687
Fold 3: MAPE = 0.12443150966546102 , RMSE = 3.6122042985410454, R2 = 0.8519513902170126
Fold 4: MAPE = 0.12161494337152624 

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.09329725002803488 , RMSE = 2.6020351768662975, R2 = 0.9044462304925838
Fold 2: MAPE = 0.09367177566616725 , RMSE = 2.5927817204821824, R2 = 0.903332826768103
Fold 3: MAPE = 0.09577522390756482 , RMSE = 2.7763081552495175, R2 = 0.8950403961649858
Fold 4: MAPE = 0.12378467257777843 , RMSE = 3.998721129502024, R2 = 0.8522542180172015
Fold 5: MAPE = 0.11483062848549958 , RMSE = 3.1802653113961386, R2 = 0.8973216620726476
Average MAPE: 0.10427191013300899
Average RMSE: 3.030022298699232
Average R2: 0.8904790667031044
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.08229646027641285 , RMSE = 2.7069814179231084, R2 = 0.930402699403494
Fold 2: MAPE = 0.1402047970397308 , RMSE = 3

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.10663099533154127 , RMSE = 3.294542936477248, R2 = 0.8884231638833219
Fold 2: MAPE = 0.11561611521675891 , RMSE = 3.3011725015329607, R2 = 0.8824030303205349
Fold 3: MAPE = 0.09537212509661719 , RMSE = 3.578262198955731, R2 = 0.8243289550947678
Fold 4: MAPE = 0.09555376302727692 , RMSE = 2.5167207291924774, R2 = 0.9190669107298801
Fold 5: MAPE = 0.10715536025553138 , RMSE = 2.7170279887832134, R2 = 0.905323847102499
Average MAPE: 0.10406567178554513
Average RMSE: 3.081545270988326
Average R2: 0.8839091814262009
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.11598525713967582 , RMSE = 3.352524838536883, R2 = 0.8874866489380007
Fold 2: MAPE = 0.08980811572660072 , RMSE = 3

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.12471098062366086 , RMSE = 3.9417830263314446, R2 = 0.8265849539912885
Fold 2: MAPE = 0.10531735304552568 , RMSE = 3.2132235479142692, R2 = 0.9053518968020716
Fold 3: MAPE = 0.09102255545149149 , RMSE = 2.564457759636557, R2 = 0.9223011465371902
Fold 4: MAPE = 0.10926407106714583 , RMSE = 2.515983318258804, R2 = 0.9140320904531615
Fold 5: MAPE = 0.09154850761244067 , RMSE = 2.8437465702510356, R2 = 0.8677399566320175
Average MAPE: 0.10437269356005292
Average RMSE: 3.0158388444784223
Average R2: 0.8872020088831458
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.10897072585013874 , RMSE = 3.5531264246384997, R2 = 0.8818343364834496
Fold 2: MAPE = 0.09687138237294228 , RMSE 

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.09950264473282783 , RMSE = 3.393126030633141, R2 = 0.8912105967483165
Fold 2: MAPE = 0.12092530535141668 , RMSE = 3.7086459391055944, R2 = 0.8422987034149587
Fold 3: MAPE = 0.09107576094368222 , RMSE = 2.7356438511146486, R2 = 0.89728421993828
Fold 4: MAPE = 0.10491403799196447 , RMSE = 3.002665240920507, R2 = 0.8655709238859981
Fold 5: MAPE = 0.09671067129586068 , RMSE = 2.717068256056987, R2 = 0.9146630140326173
Average MAPE: 0.10262568406315038
Average RMSE: 3.1114298635661752
Average R2: 0.882205491604034
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.12989263322139483 , RMSE = 3.565376317491304, R2 = 0.8635807877976309
Fold 2: MAPE = 0.0978657832011524 , RMSE = 3.35

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.09748383612070499 , RMSE = 2.518941701507674, R2 = 0.9101716083742647
Fold 2: MAPE = 0.11096060253394036 , RMSE = 3.5018545852735614, R2 = 0.8623581993473375
Fold 3: MAPE = 0.09975416556689749 , RMSE = 2.6760878285552088, R2 = 0.9205890654913843
Fold 4: MAPE = 0.09946572367277529 , RMSE = 3.5394809885654, R2 = 0.8401572742913467
Fold 5: MAPE = 0.13962114217659294 , RMSE = 3.1627744573306105, R2 = 0.8920059166154232
Average MAPE: 0.10945709401418222
Average RMSE: 3.079827912246491
Average R2: 0.8850564128239512
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.12032741149554348 , RMSE = 3.3377028253948398, R2 = 0.8782609747843265
Fold 2: MAPE = 0.11459124685985389 , RMSE = 3

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.08153563175225272 , RMSE = 4.146272808069097, R2 = 0.7955796259059812
Fold 2: MAPE = 0.11498679121783979 , RMSE = 2.9408775992412832, R2 = 0.8940897035952747
Fold 3: MAPE = 0.10523365967997854 , RMSE = 2.862544235000701, R2 = 0.8807577815606834
Fold 4: MAPE = 0.09181362547651543 , RMSE = 2.898127563437027, R2 = 0.9231870672709137
Fold 5: MAPE = 0.12081512669984745 , RMSE = 3.7194638921755474, R2 = 0.798928170556841
Average MAPE: 0.10287696696528678
Average RMSE: 3.3134572195847314
Average R2: 0.8585084697779388
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.11326783420591299 , RMSE = 2.6636682112222188, R2 = 0.9121937154091846
Fold 2: MAPE = 0.08663638157831015 , RMSE = 

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.09954873256406045 , RMSE = 3.167475169486289, R2 = 0.9008889419451156
Fold 2: MAPE = 0.09605656949584558 , RMSE = 2.4827783381997857, R2 = 0.9168522109966795
Fold 3: MAPE = 0.10207668271161577 , RMSE = 2.9951174251852817, R2 = 0.902013545571709
Fold 4: MAPE = 0.11688881463235692 , RMSE = 3.2503504900954723, R2 = 0.8415925424301964
Fold 5: MAPE = 0.107039266502869 , RMSE = 3.7080041413363465, R2 = 0.8410544919302965
Average MAPE: 0.10432201318134955
Average RMSE: 3.120745112860635
Average R2: 0.8804803465747992
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.10411047063431138 , RMSE = 2.5106959346749513, R2 = 0.9190587380144328
Fold 2: MAPE = 0.11762379375973676 , RMSE = 4

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.0983767513840425 , RMSE = 3.330895667846088, R2 = 0.8142984480408999
Fold 2: MAPE = 0.1154263245429055 , RMSE = 3.3543294467622, R2 = 0.8901555943430642
Fold 3: MAPE = 0.11376067264339461 , RMSE = 3.8298185451033144, R2 = 0.8602200425766082
Fold 4: MAPE = 0.08317184518483765 , RMSE = 2.317645553153128, R2 = 0.9294905869022758
Fold 5: MAPE = 0.11409570892142926 , RMSE = 2.3900008999458837, R2 = 0.9248865522338445
Average MAPE: 0.1049662605353219
Average RMSE: 3.044538022562123
Average R2: 0.8838102448193383
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.10386183286674759 , RMSE = 2.6993995958107195, R2 = 0.9199915967091961
Fold 2: MAPE = 0.09906034286370125 , RMSE = 3.357

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
data = data.drop(columns=['CHAS'])
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = XGBRegressor()

# parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
#               'objective':['reg:linear'],
#               'learning_rate': [.03, 0.05, .07], #so called `eta` value
#               'max_depth': [5, 6, 7],
#               'min_child_weight': [4],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [500]}

# model = GridSearchCV(model,
#                         parameters,
#                         cv = 2,
#                         n_jobs = 5,
#                         verbose=True)

# model.fit(X_train,y_train)

# print(model.best_score_)
# print(model.best_params_)

# {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
model = xgb.XGBRegressor(**params) 

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")

# drop chas R2 = 0.95524
# Average 100 R2: 0.8810

Fold 1: MAPE = 0.1047243101481121 , RMSE = 3.354255229810871, R2 = 0.8558848862300616
Fold 2: MAPE = 0.09063061965429506 , RMSE = 2.6435734719125548, R2 = 0.9121213865687592
Fold 3: MAPE = 0.10392325258611575 , RMSE = 3.170130037676745, R2 = 0.8835499608050605
Fold 4: MAPE = 0.09370351718754993 , RMSE = 2.8120619557494027, R2 = 0.9222199760750638
Fold 5: MAPE = 0.11992039222283207 , RMSE = 3.0503917678095696, R2 = 0.8750418048155592
Average MAPE: 0.10258041835978098
Average RMSE: 3.006082492591829
Average R2: 0.8897636028989009
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.1203890634607282 , RMSE = 3.1800414390870055, R2 = 0.9071675601691888
Fold 2: MAPE = 0.10454897247775406 , RMSE = 2.4622718923048605, R2 = 0.9234343415875925
Fold 3: MAPE = 0.08690508267799772 , RMSE = 2.706857957572746, R2 = 0.9281496464587564
Fold 4: MAPE = 0.11175011346470759 , RMSE = 4.028

Fold 1: MAPE = 0.10064476144921557 , RMSE = 2.8624204903433186, R2 = 0.9141603643104631
Fold 2: MAPE = 0.10820638861388011 , RMSE = 2.615224997509031, R2 = 0.901307986871323
Fold 3: MAPE = 0.09223000212298693 , RMSE = 3.4973495487878257, R2 = 0.8519061750165523
Fold 4: MAPE = 0.10792264716668792 , RMSE = 3.351655314632776, R2 = 0.8430228355549746
Fold 5: MAPE = 0.10897512795287664 , RMSE = 3.085397887606512, R2 = 0.9053267554367155
Average MAPE: 0.10359578546112944
Average RMSE: 3.0824096477758927
Average R2: 0.8831448234380057
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.09411662645038021 , RMSE = 3.472429603045653, R2 = 0.854194899278143
Fold 2: MAPE = 0.10151283541276039 , RMSE = 2.526215286201172, R2 = 0.904293223865864
Fold 3: MAPE = 0.07709211851421395 , RMSE = 2.058142013586174, R2 = 0.9502867475650036
Fold 4: MAPE = 0.11806498174548952 , RMSE = 2.989860

Fold 1: MAPE = 0.11183266141662156 , RMSE = 3.708330390140217, R2 = 0.8480635394303835
Fold 2: MAPE = 0.11362801810113785 , RMSE = 3.2352554515087704, R2 = 0.8465653044250817
Fold 3: MAPE = 0.1019632654325425 , RMSE = 2.671728974630636, R2 = 0.9159394121802927
Fold 4: MAPE = 0.11560701067458069 , RMSE = 3.8406766690946483, R2 = 0.8467966939571147
Fold 5: MAPE = 0.1102455017842111 , RMSE = 2.73317337907495, R2 = 0.9063152304963334
Average MAPE: 0.11065529148181874
Average RMSE: 3.2378329728898443
Average R2: 0.8727360360978412
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.10503818863481965 , RMSE = 3.231742811185211, R2 = 0.8851642084459197
Fold 2: MAPE = 0.09242420944880114 , RMSE = 2.717678313818363, R2 = 0.9155826451621295
Fold 3: MAPE = 0.11424653262947386 , RMSE = 3.768871324496607, R2 = 0.7853278154307272
Fold 4: MAPE = 0.08901935871165405 , RMSE = 2.361776

Fold 1: MAPE = 0.08223662413674371 , RMSE = 1.9956718244875538, R2 = 0.9183456475513454
Fold 2: MAPE = 0.10529247913366503 , RMSE = 3.572347180789607, R2 = 0.8679068634046608
Fold 3: MAPE = 0.1263294947432672 , RMSE = 3.4748254761499444, R2 = 0.8595081803608586
Fold 4: MAPE = 0.10638168611611865 , RMSE = 2.729926534479521, R2 = 0.9045054270747661
Fold 5: MAPE = 0.10248334682684948 , RMSE = 4.144176828660544, R2 = 0.8420494244632178
Average MAPE: 0.10454472619132882
Average RMSE: 3.1833895689134337
Average R2: 0.8784631085709698
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.1123997470829303 , RMSE = 3.6354758583778137, R2 = 0.8479645983890196
Fold 2: MAPE = 0.12033618123530823 , RMSE = 3.5072546090156873, R2 = 0.8647111852329202
Fold 3: MAPE = 0.10076641423461495 , RMSE = 3.0004367547129434, R2 = 0.9007070561694954
Fold 4: MAPE = 0.09517725978651875 , RMSE = 2.47

Fold 1: MAPE = 0.1328733616972028 , RMSE = 3.566145073729316, R2 = 0.8362028600005766
Fold 2: MAPE = 0.097300494818348 , RMSE = 2.5234594062092945, R2 = 0.9150306202416851
Fold 3: MAPE = 0.09743320549702404 , RMSE = 2.3159035052444144, R2 = 0.9362416676330846
Fold 4: MAPE = 0.10943730666710039 , RMSE = 4.671386943566666, R2 = 0.8074286682342301
Fold 5: MAPE = 0.0967446258120274 , RMSE = 2.5550497374721957, R2 = 0.9048884053719202
Average MAPE: 0.10675779889834051
Average RMSE: 3.1263889332443773
Average R2: 0.8799584442962992
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.12365138639897222 , RMSE = 3.7445794610743777, R2 = 0.8545033674385794
Fold 2: MAPE = 0.10066987793926938 , RMSE = 4.10962505076745, R2 = 0.8222544346423131
Fold 3: MAPE = 0.10488676504790553 , RMSE = 2.5717974848346103, R2 = 0.9200055353598864
Fold 4: MAPE = 0.10069751768440242 , RMSE = 2.94474

Fold 1: MAPE = 0.09504036314492098 , RMSE = 3.307377587269813, R2 = 0.9029208267336678
Fold 2: MAPE = 0.09441531441043541 , RMSE = 2.5403128072306247, R2 = 0.8953513592170839
Fold 3: MAPE = 0.0984066137999177 , RMSE = 2.8665212612679025, R2 = 0.9009226616291941
Fold 4: MAPE = 0.09186922944320543 , RMSE = 2.4256507566412737, R2 = 0.9308010618599266
Fold 5: MAPE = 0.12247661956003196 , RMSE = 3.2062477105751115, R2 = 0.8635418551213685
Average MAPE: 0.10044162807170229
Average RMSE: 2.8692220245969446
Average R2: 0.8987075529122481
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.10230069981315348 , RMSE = 2.800234207385616, R2 = 0.9138957554370335
Fold 2: MAPE = 0.10098962173388354 , RMSE = 2.67854081186377, R2 = 0.900871336613676
Fold 3: MAPE = 0.12164522660734861 , RMSE = 4.569868712373516, R2 = 0.7353736211659812
Fold 4: MAPE = 0.1130461840870342 , RMSE = 2.77452

Fold 1: MAPE = 0.1083059634752058 , RMSE = 3.8649025383988387, R2 = 0.8454626565556019
Fold 2: MAPE = 0.08994975180536947 , RMSE = 2.95717758732495, R2 = 0.8920569903122533
Fold 3: MAPE = 0.10130873573434053 , RMSE = 2.5241267360381663, R2 = 0.9317219086639648
Fold 4: MAPE = 0.09766788465141164 , RMSE = 2.3657291652163797, R2 = 0.8804121368111424
Fold 5: MAPE = 0.11557104638289326 , RMSE = 3.267293374294374, R2 = 0.8927989018703495
Average MAPE: 0.10256067640984415
Average RMSE: 2.9958458802545413
Average R2: 0.8884905188426625
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.10176672803939046 , RMSE = 2.556606872879033, R2 = 0.9258183608157193
Fold 2: MAPE = 0.11291000392988106 , RMSE = 4.520816036959869, R2 = 0.7712833255743736
Fold 3: MAPE = 0.10503644611847163 , RMSE = 3.596450508468048, R2 = 0.8455363101960969
Fold 4: MAPE = 0.08925501440934616 , RMSE = 2.6076

Fold 1: MAPE = 0.11904989750456133 , RMSE = 2.971533472628774, R2 = 0.909129716033438
Fold 2: MAPE = 0.10029679083088144 , RMSE = 2.5286307839400166, R2 = 0.9366008043441807
Fold 3: MAPE = 0.1010348589445203 , RMSE = 3.702160795858636, R2 = 0.8034932677308131
Fold 4: MAPE = 0.10206302114628037 , RMSE = 3.1393482167144224, R2 = 0.8746877237701928
Fold 5: MAPE = 0.09636645352420796 , RMSE = 2.978369656571051, R2 = 0.8798674932639544
Average MAPE: 0.10376220439009028
Average RMSE: 3.06400858514258
Average R2: 0.8807558010285158
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.12409834399284601 , RMSE = 3.4881111246482597, R2 = 0.8828720056163688
Fold 2: MAPE = 0.1028139902418405 , RMSE = 3.2989601840676084, R2 = 0.8888188172930838
Fold 3: MAPE = 0.09637607989414541 , RMSE = 3.1169367519117523, R2 = 0.8486555826969153
Fold 4: MAPE = 0.094749942149687 , RMSE = 2.5226246

Fold 1: MAPE = 0.09353979817641637 , RMSE = 2.84831666626661, R2 = 0.9016601891495732
Fold 2: MAPE = 0.09778176277438752 , RMSE = 2.886839517337642, R2 = 0.9125765429355454
Fold 3: MAPE = 0.10381232887658169 , RMSE = 3.4350804082259003, R2 = 0.8686388729909176
Fold 4: MAPE = 0.14142964455214554 , RMSE = 3.7021859908161914, R2 = 0.8324293952401767
Fold 5: MAPE = 0.10639459969947179 , RMSE = 2.7244019363107848, R2 = 0.8941620001691315
Average MAPE: 0.10859162681580056
Average RMSE: 3.1193649037914257
Average R2: 0.8818934000970688
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.11132418204837094 , RMSE = 3.3809389598796074, R2 = 0.8629841831853401
Fold 2: MAPE = 0.12312847572081272 , RMSE = 3.704212112650527, R2 = 0.8700993104187645
Fold 3: MAPE = 0.1042129368151684 , RMSE = 2.6836147439840725, R2 = 0.893404214211589
Fold 4: MAPE = 0.1167101757876815 , RMSE = 2.6467

In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
data = data.drop(columns=['CHAS','ZN'])
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = XGBRegressor()

# parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
#               'objective':['reg:linear'],
#               'learning_rate': [.03, 0.05, .07], #so called `eta` value
#               'max_depth': [5, 6, 7],
#               'min_child_weight': [4],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [500]}

# model = GridSearchCV(model,
#                         parameters,
#                         cv = 2,
#                         n_jobs = 5,
#                         verbose=True)

# model.fit(X_train,y_train)

# print(model.best_score_)
# print(model.best_params_)

# {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
model = xgb.XGBRegressor(**params) 

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")

# drop CHAS and ZN R2 = 0.9547
# Average 100 R2: 0.8797

Fold 1: MAPE = 0.10941617994049695 , RMSE = 3.301840758985487, R2 = 0.8837050483750992
Fold 2: MAPE = 0.11990467466186047 , RMSE = 3.5949762692544787, R2 = 0.8587330905952991
Fold 3: MAPE = 0.09636950683857468 , RMSE = 2.61854538567524, R2 = 0.9223568979514596
Fold 4: MAPE = 0.09662993102513447 , RMSE = 4.181058492899588, R2 = 0.7327353193086701
Fold 5: MAPE = 0.10870441821481377 , RMSE = 2.851355552729418, R2 = 0.8984820188769406
Average MAPE: 0.10620494213617607
Average RMSE: 3.3095552919088425
Average R2: 0.8592024750214937
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.09463341740375404 , RMSE = 2.734722449153064, R2 = 0.9167224291157294
Fold 2: MAPE = 0.09333459037484604 , RMSE = 3.008201579018512, R2 = 0.872173891503734
Fold 3: MAPE = 0.09040588959288802 , RMSE = 3.0446217629941685, R2 = 0.8625307467503462
Fold 4: MAPE = 0.1420636849330305 , RMSE = 2.9190828724698994,

Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.09285607776916463 , RMSE = 2.837722135974653, R2 = 0.866566596360974
Fold 2: MAPE = 0.1424588655132706 , RMSE = 3.576591709236165, R2 = 0.8672379051649224
Fold 3: MAPE = 0.10100577043311812 , RMSE = 2.7303555144310856, R2 = 0.9017221433811284
Fold 4: MAPE = 0.08954016601002589 , RMSE = 3.036902082144434, R2 = 0.9051667116759048
Fold 5: MAPE = 0.09788647060784234 , RMSE = 3.2738017836194713, R2 = 0.8779164178037009
Average MAPE: 0.10474947006668431
Average RMSE: 3.091074645081162
Average R2: 0.8837219548773263
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.11836738162562313 , RMSE = 3.1605519982399883, R2 = 0.8905692816352394
Fold 2: MAPE = 0.11646085449980577 , RMSE = 3.311124041365274, R2 = 0.8692423235259259
Fold 

Fold 1: MAPE = 0.08880872618399643 , RMSE = 2.850769330206219, R2 = 0.9062210980301821
Fold 2: MAPE = 0.1510613302677618 , RMSE = 3.2698536656464467, R2 = 0.8832570633552157
Fold 3: MAPE = 0.09526391863636909 , RMSE = 4.219093058975981, R2 = 0.7991769437525374
Fold 4: MAPE = 0.09791258633784901 , RMSE = 3.2846435503551934, R2 = 0.8294299971462549
Fold 5: MAPE = 0.1038157494493616 , RMSE = 2.5308420638133806, R2 = 0.927024201967126
Average MAPE: 0.10737246217506757
Average RMSE: 3.2310403337994438
Average R2: 0.8690218608502633
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.0941805434625164 , RMSE = 3.0122824241644803, R2 = 0.8997896021444836
Fold 2: MAPE = 0.10363161571892628 , RMSE = 2.714534189151328, R2 = 0.9116054873067849
Fold 3: MAPE = 0.11526443006359748 , RMSE = 3.3648640970363175, R2 = 0.8704680677454527
Fold 4: MAPE = 0.08880496105557699 , RMSE = 2.765189308872715

Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.1128761872364365 , RMSE = 2.75949782749309, R2 = 0.9164593362321215
Fold 2: MAPE = 0.1001736481907614 , RMSE = 2.8306661991296567, R2 = 0.9074278575543268
Fold 3: MAPE = 0.14031917779330993 , RMSE = 4.0929230152776634, R2 = 0.7871302763643506
Fold 4: MAPE = 0.10535181010754578 , RMSE = 4.251784465082742, R2 = 0.7720990628508546
Fold 5: MAPE = 0.09993307486618838 , RMSE = 2.876839072692835, R2 = 0.9032243951637772
Average MAPE: 0.11173077963884841
Average RMSE: 3.362342115935198
Average R2: 0.8572681856330862
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.10748331136045662 , RMSE = 2.6549279087208757, R2 = 0.8938751298842365
Fold 2: MAPE = 0.11379128825852844 , RMSE = 4.786195462819639, R2 = 0.7240122718623223
Fold 3

Fold 1: MAPE = 0.09791067660191137 , RMSE = 2.649751473703108, R2 = 0.9049328689490576
Fold 2: MAPE = 0.12350456858068085 , RMSE = 3.6146460152110054, R2 = 0.8285515351850818
Fold 3: MAPE = 0.09895378326778193 , RMSE = 2.7789034804606536, R2 = 0.9241283825978182
Fold 4: MAPE = 0.11107187397939423 , RMSE = 2.88215677996241, R2 = 0.8965449077004245
Fold 5: MAPE = 0.10709019437446457 , RMSE = 4.564651894885455, R2 = 0.7630865742145755
Average MAPE: 0.1077062193608466
Average RMSE: 3.298021928844526
Average R2: 0.8634488537293915
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.10861533860032647 , RMSE = 3.1501265365224067, R2 = 0.8676690199335384
Fold 2: MAPE = 0.09306567723780292 , RMSE = 2.619618446835551, R2 = 0.9185767257450882
Fold 3: MAPE = 0.09828910423493672 , RMSE = 2.702048525041866, R2 = 0.9296851023636664
Fold 4: MAPE = 0.10467139108552082 , RMSE = 4.5101598767438915

Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.10377987218661328 , RMSE = 2.602269196067316, R2 = 0.91361770132802
Fold 2: MAPE = 0.08652942945890729 , RMSE = 2.792254526885744, R2 = 0.9136454488513802
Fold 3: MAPE = 0.10719903062177633 , RMSE = 3.558766995974068, R2 = 0.8405180395204216
Fold 4: MAPE = 0.09517698337974188 , RMSE = 2.883935248420446, R2 = 0.9026951989020895
Fold 5: MAPE = 0.13098671771617043 , RMSE = 3.4447389789100438, R2 = 0.865617538718286
Average MAPE: 0.10473440667264185
Average RMSE: 3.0563929892515236
Average R2: 0.8872187854640394
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.1041711506314928 , RMSE = 2.7597481062768336, R2 = 0.9250404080648587
Fold 2: MAPE = 0.10220543473510386 , RMSE = 2.7183489671173904, R2 = 0.8878187759423912
Fold 3

Fold 1: MAPE = 0.11113917409195247 , RMSE = 3.7122565309588893, R2 = 0.8832085406022836
Fold 2: MAPE = 0.10476164196168151 , RMSE = 2.816776813317424, R2 = 0.9031275313804314
Fold 3: MAPE = 0.09593623453509288 , RMSE = 2.7565186618631543, R2 = 0.8971349112926141
Fold 4: MAPE = 0.11563289405906446 , RMSE = 3.5827776516266474, R2 = 0.801598838436531
Fold 5: MAPE = 0.10336267805394725 , RMSE = 2.91617427829106, R2 = 0.8942366568052302
Average MAPE: 0.10616652454034772
Average RMSE: 3.156900787211435
Average R2: 0.8758612957034181
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.09748505295089069 , RMSE = 3.2751256000321534, R2 = 0.8880914034076268
Fold 2: MAPE = 0.11163631830813878 , RMSE = 3.7713154990862705, R2 = 0.8465212202580632
Fold 3: MAPE = 0.12460131363328189 , RMSE = 2.813391879644671, R2 = 0.878984206462939
Fold 4: MAPE = 0.10728906879741838 , RMSE = 3.135438572549545

Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.11473267565890158 , RMSE = 3.5468961764068534, R2 = 0.7961335904929921
Fold 2: MAPE = 0.13654013266993476 , RMSE = 3.633356463081505, R2 = 0.8613950701556732
Fold 3: MAPE = 0.09728071174115945 , RMSE = 2.6984023717934638, R2 = 0.9235735192281851
Fold 4: MAPE = 0.08647432168719102 , RMSE = 2.5952973756859485, R2 = 0.919930126827367
Fold 5: MAPE = 0.10133043305335342 , RMSE = 3.045870137180753, R2 = 0.888567260788898
Average MAPE: 0.10727165496210804
Average RMSE: 3.1039645048297047
Average R2: 0.8779199134986231
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.11834939278691975 , RMSE = 2.945513434954236, R2 = 0.8688544165397569
Fold 2: MAPE = 0.11037522614370947 , RMSE = 2.857266669202697, R2 = 0.8560204634485113
Fold