In [18]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = XGBRegressor()

# parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
#               'objective':['reg:linear'],
#               'learning_rate': [.03, 0.05, .07], #so called `eta` value
#               'max_depth': [5, 6, 7],
#               'min_child_weight': [4],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [500]}

# model = GridSearchCV(model,
#                         parameters,
#                         cv = 2,
#                         n_jobs = 5,
#                         verbose=True)

# model.fit(X_train,y_train)

# print(model.best_score_)
# print(model.best_params_)

# {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
model = xgb.XGBRegressor(**params) 

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")


# Average 100 R2: 0.8792

Fold 1: MAPE = 0.0905196841954225 , RMSE = 2.4093693911426075, R2 = 0.9220904429803096
Fold 2: MAPE = 0.09790130593329792 , RMSE = 2.7222719334927077, R2 = 0.919779165897826
Fold 3: MAPE = 0.12190034226629222 , RMSE = 4.415353953733743, R2 = 0.8023729008484062
Fold 4: MAPE = 0.10228603203404325 , RMSE = 2.4967736074507094, R2 = 0.8839089489993601
Fold 5: MAPE = 0.10051664590809928 , RMSE = 3.8191558176393103, R2 = 0.8478911264248795
Average MAPE: 0.10262480206743103
Average RMSE: 3.1725849406918156
Average R2: 0.8752085170301562
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.09320372904284686 , RMSE = 3.149729833023455, R2 = 0.8582637910038731
Fold 2: MAPE = 0.09360883851648055 , RMSE = 2.3522121804769913, R2 = 0.9354381033148687
Fold 3: MAPE = 0.12443150966546102 , RMSE = 3.6122042985410454, R2 = 0.8519513902170126
Fold 4: MAPE = 0.12161494337152624 

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.09329725002803488 , RMSE = 2.6020351768662975, R2 = 0.9044462304925838
Fold 2: MAPE = 0.09367177566616725 , RMSE = 2.5927817204821824, R2 = 0.903332826768103
Fold 3: MAPE = 0.09577522390756482 , RMSE = 2.7763081552495175, R2 = 0.8950403961649858
Fold 4: MAPE = 0.12378467257777843 , RMSE = 3.998721129502024, R2 = 0.8522542180172015
Fold 5: MAPE = 0.11483062848549958 , RMSE = 3.1802653113961386, R2 = 0.8973216620726476
Average MAPE: 0.10427191013300899
Average RMSE: 3.030022298699232
Average R2: 0.8904790667031044
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.08229646027641285 , RMSE = 2.7069814179231084, R2 = 0.930402699403494
Fold 2: MAPE = 0.1402047970397308 , RMSE = 3

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.10663099533154127 , RMSE = 3.294542936477248, R2 = 0.8884231638833219
Fold 2: MAPE = 0.11561611521675891 , RMSE = 3.3011725015329607, R2 = 0.8824030303205349
Fold 3: MAPE = 0.09537212509661719 , RMSE = 3.578262198955731, R2 = 0.8243289550947678
Fold 4: MAPE = 0.09555376302727692 , RMSE = 2.5167207291924774, R2 = 0.9190669107298801
Fold 5: MAPE = 0.10715536025553138 , RMSE = 2.7170279887832134, R2 = 0.905323847102499
Average MAPE: 0.10406567178554513
Average RMSE: 3.081545270988326
Average R2: 0.8839091814262009
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.11598525713967582 , RMSE = 3.352524838536883, R2 = 0.8874866489380007
Fold 2: MAPE = 0.08980811572660072 , RMSE = 3

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.12471098062366086 , RMSE = 3.9417830263314446, R2 = 0.8265849539912885
Fold 2: MAPE = 0.10531735304552568 , RMSE = 3.2132235479142692, R2 = 0.9053518968020716
Fold 3: MAPE = 0.09102255545149149 , RMSE = 2.564457759636557, R2 = 0.9223011465371902
Fold 4: MAPE = 0.10926407106714583 , RMSE = 2.515983318258804, R2 = 0.9140320904531615
Fold 5: MAPE = 0.09154850761244067 , RMSE = 2.8437465702510356, R2 = 0.8677399566320175
Average MAPE: 0.10437269356005292
Average RMSE: 3.0158388444784223
Average R2: 0.8872020088831458
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.10897072585013874 , RMSE = 3.5531264246384997, R2 = 0.8818343364834496
Fold 2: MAPE = 0.09687138237294228 , RMSE 

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.09950264473282783 , RMSE = 3.393126030633141, R2 = 0.8912105967483165
Fold 2: MAPE = 0.12092530535141668 , RMSE = 3.7086459391055944, R2 = 0.8422987034149587
Fold 3: MAPE = 0.09107576094368222 , RMSE = 2.7356438511146486, R2 = 0.89728421993828
Fold 4: MAPE = 0.10491403799196447 , RMSE = 3.002665240920507, R2 = 0.8655709238859981
Fold 5: MAPE = 0.09671067129586068 , RMSE = 2.717068256056987, R2 = 0.9146630140326173
Average MAPE: 0.10262568406315038
Average RMSE: 3.1114298635661752
Average R2: 0.882205491604034
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.12989263322139483 , RMSE = 3.565376317491304, R2 = 0.8635807877976309
Fold 2: MAPE = 0.0978657832011524 , RMSE = 3.35

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.09748383612070499 , RMSE = 2.518941701507674, R2 = 0.9101716083742647
Fold 2: MAPE = 0.11096060253394036 , RMSE = 3.5018545852735614, R2 = 0.8623581993473375
Fold 3: MAPE = 0.09975416556689749 , RMSE = 2.6760878285552088, R2 = 0.9205890654913843
Fold 4: MAPE = 0.09946572367277529 , RMSE = 3.5394809885654, R2 = 0.8401572742913467
Fold 5: MAPE = 0.13962114217659294 , RMSE = 3.1627744573306105, R2 = 0.8920059166154232
Average MAPE: 0.10945709401418222
Average RMSE: 3.079827912246491
Average R2: 0.8850564128239512
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.12032741149554348 , RMSE = 3.3377028253948398, R2 = 0.8782609747843265
Fold 2: MAPE = 0.11459124685985389 , RMSE = 3

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.08153563175225272 , RMSE = 4.146272808069097, R2 = 0.7955796259059812
Fold 2: MAPE = 0.11498679121783979 , RMSE = 2.9408775992412832, R2 = 0.8940897035952747
Fold 3: MAPE = 0.10523365967997854 , RMSE = 2.862544235000701, R2 = 0.8807577815606834
Fold 4: MAPE = 0.09181362547651543 , RMSE = 2.898127563437027, R2 = 0.9231870672709137
Fold 5: MAPE = 0.12081512669984745 , RMSE = 3.7194638921755474, R2 = 0.798928170556841
Average MAPE: 0.10287696696528678
Average RMSE: 3.3134572195847314
Average R2: 0.8585084697779388
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.11326783420591299 , RMSE = 2.6636682112222188, R2 = 0.9121937154091846
Fold 2: MAPE = 0.08663638157831015 , RMSE = 

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.09954873256406045 , RMSE = 3.167475169486289, R2 = 0.9008889419451156
Fold 2: MAPE = 0.09605656949584558 , RMSE = 2.4827783381997857, R2 = 0.9168522109966795
Fold 3: MAPE = 0.10207668271161577 , RMSE = 2.9951174251852817, R2 = 0.902013545571709
Fold 4: MAPE = 0.11688881463235692 , RMSE = 3.2503504900954723, R2 = 0.8415925424301964
Fold 5: MAPE = 0.107039266502869 , RMSE = 3.7080041413363465, R2 = 0.8410544919302965
Average MAPE: 0.10432201318134955
Average RMSE: 3.120745112860635
Average R2: 0.8804803465747992
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.10411047063431138 , RMSE = 2.5106959346749513, R2 = 0.9190587380144328
Fold 2: MAPE = 0.11762379375973676 , RMSE = 4

Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.0983767513840425 , RMSE = 3.330895667846088, R2 = 0.8142984480408999
Fold 2: MAPE = 0.1154263245429055 , RMSE = 3.3543294467622, R2 = 0.8901555943430642
Fold 3: MAPE = 0.11376067264339461 , RMSE = 3.8298185451033144, R2 = 0.8602200425766082
Fold 4: MAPE = 0.08317184518483765 , RMSE = 2.317645553153128, R2 = 0.9294905869022758
Fold 5: MAPE = 0.11409570892142926 , RMSE = 2.3900008999458837, R2 = 0.9248865522338445
Average MAPE: 0.1049662605353219
Average RMSE: 3.044538022562123
Average R2: 0.8838102448193383
Feature Importance: [0.05144093 0.01407929 0.03652145 0.03595443 0.07972205 0.25435647
 0.02149454 0.04902536 0.02008983 0.04088116 0.10879766 0.01747625
 0.27016062]
Fold 1: MAPE = 0.10386183286674759 , RMSE = 2.6993995958107195, R2 = 0.9199915967091961
Fold 2: MAPE = 0.09906034286370125 , RMSE = 3.357

In [3]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
data = data.drop(columns=['CHAS'])
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = XGBRegressor()

# parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
#               'objective':['reg:linear'],
#               'learning_rate': [.03, 0.05, .07], #so called `eta` value
#               'max_depth': [5, 6, 7],
#               'min_child_weight': [4],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [500]}

# model = GridSearchCV(model,
#                         parameters,
#                         cv = 2,
#                         n_jobs = 5,
#                         verbose=True)

# model.fit(X_train,y_train)

# print(model.best_score_)
# print(model.best_params_)

# {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
model = xgb.XGBRegressor(**params) 

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")

# drop chas R2 = 0.95524
# Average 100 R2: 0.8810

Fold 1: MAPE = 0.09810214543987758 , RMSE = 2.999953369719047, R2 = 0.9156777933389851
Fold 2: MAPE = 0.09297794680148976 , RMSE = 3.6525241158548676, R2 = 0.8703561701304972
Fold 3: MAPE = 0.09415286120571968 , RMSE = 2.6525152775751395, R2 = 0.9195158841990907
Fold 4: MAPE = 0.10812589384927859 , RMSE = 2.845123883944282, R2 = 0.8830773921525775
Fold 5: MAPE = 0.11098757447080836 , RMSE = 2.848262879297012, R2 = 0.8491872452596134
Average MAPE: 0.10086928435343478
Average RMSE: 2.9996759052780697
Average R2: 0.8875628970161529
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.11281837057857023 , RMSE = 3.3663206356120816, R2 = 0.8955273137484612
Fold 2: MAPE = 0.09187560147186483 , RMSE = 2.5412137134051083, R2 = 0.9322787049152181
Fold 3: MAPE = 0.11387547298364532 , RMSE = 2.8940189143605197, R2 = 0.8618783480316394
Fold 4: MAPE = 0.09046182852226065 , RMSE = 3.

Fold 1: MAPE = 0.08623998508175404 , RMSE = 2.380213510418715, R2 = 0.9133858328643458
Fold 2: MAPE = 0.10304606606400611 , RMSE = 3.2332367229478716, R2 = 0.8435492972836446
Fold 3: MAPE = 0.12499932235261646 , RMSE = 3.714104485549449, R2 = 0.8551342039332587
Fold 4: MAPE = 0.12150844930899332 , RMSE = 3.666209735240694, R2 = 0.8411257951802645
Fold 5: MAPE = 0.113610004396474 , RMSE = 2.9481286811634213, R2 = 0.9204003332478838
Average MAPE: 0.10988076544076877
Average RMSE: 3.18837862706403
Average R2: 0.8747190925018795
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.11629130853001228 , RMSE = 3.879621261315946, R2 = 0.8656800070754453
Fold 2: MAPE = 0.12836251429483211 , RMSE = 3.65216488195402, R2 = 0.8335370525869337
Fold 3: MAPE = 0.0828577784945328 , RMSE = 2.300573759704407, R2 = 0.93327947500167
Fold 4: MAPE = 0.10719348236524893 , RMSE = 2.77760248872

Fold 1: MAPE = 0.11743784545881486 , RMSE = 3.087274539805779, R2 = 0.8951878171808182
Fold 2: MAPE = 0.09971168519159986 , RMSE = 3.455243196935107, R2 = 0.8381363792980214
Fold 3: MAPE = 0.08674367255281103 , RMSE = 2.34889554765708, R2 = 0.926005004565108
Fold 4: MAPE = 0.12028109430429912 , RMSE = 3.3946414736478205, R2 = 0.8827832525729976
Fold 5: MAPE = 0.09735797378716984 , RMSE = 2.4947988377907193, R2 = 0.9231542924903874
Average MAPE: 0.10430645425893895
Average RMSE: 2.956170719167301
Average R2: 0.8930533492214664
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.1012620495476327 , RMSE = 2.503364590405743, R2 = 0.9223956147488499
Fold 2: MAPE = 0.09601529160363348 , RMSE = 2.7536165561514965, R2 = 0.9097717115696673
Fold 3: MAPE = 0.10196340240419874 , RMSE = 4.901673318712089, R2 = 0.7215052128580399
Fold 4: MAPE = 0.11555394313812627 , RMSE = 2.747444

Fold 1: MAPE = 0.08962730878355671 , RMSE = 2.1688732184960697, R2 = 0.9550538166955366
Fold 2: MAPE = 0.10369854458488734 , RMSE = 3.7619476657794837, R2 = 0.8221940461893268
Fold 3: MAPE = 0.09518263749736187 , RMSE = 3.3024257818370564, R2 = 0.8392900917604175
Fold 4: MAPE = 0.10239813282865462 , RMSE = 2.8563566404304375, R2 = 0.9095105090071457
Fold 5: MAPE = 0.11678169350101414 , RMSE = 2.712472128041345, R2 = 0.9063694932589957
Average MAPE: 0.10153766343909494
Average RMSE: 2.9604150869168784
Average R2: 0.8864835913822844
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.12232711081380145 , RMSE = 3.5753027621609967, R2 = 0.8725660591340563
Fold 2: MAPE = 0.1025704152629818 , RMSE = 2.7891645401954146, R2 = 0.8946543533029591
Fold 3: MAPE = 0.09696275207653664 , RMSE = 4.197062741307891, R2 = 0.7349487975313658
Fold 4: MAPE = 0.10921084955304468 , RMSE = 2.

Fold 1: MAPE = 0.0965131875148408 , RMSE = 2.72452876202076, R2 = 0.9086298820768475
Fold 2: MAPE = 0.10646081276417783 , RMSE = 3.863529568496472, R2 = 0.8563662025081853
Fold 3: MAPE = 0.12592460121354782 , RMSE = 3.529348777004328, R2 = 0.8641032759017186
Fold 4: MAPE = 0.09613844103567888 , RMSE = 2.4377543538001576, R2 = 0.9135186858117251
Fold 5: MAPE = 0.10484217251291067 , RMSE = 2.9485211543223744, R2 = 0.8834735280767987
Average MAPE: 0.1059758430082312
Average RMSE: 3.1007365231288184
Average R2: 0.8852183148750552
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.09196640982835248 , RMSE = 3.465302645324363, R2 = 0.861210125847576
Fold 2: MAPE = 0.09532219250297748 , RMSE = 3.146545582642703, R2 = 0.8672914366410162
Fold 3: MAPE = 0.10389125976165262 , RMSE = 3.5632093160957203, R2 = 0.8225668818107013
Fold 4: MAPE = 0.11765317428725369 , RMSE = 2.793076

Fold 1: MAPE = 0.10157060573046213 , RMSE = 3.4054830568328107, R2 = 0.8567460728847114
Fold 2: MAPE = 0.0990150415782791 , RMSE = 2.7051276959313246, R2 = 0.9111677961697757
Fold 3: MAPE = 0.09980017430156364 , RMSE = 2.6375286295307827, R2 = 0.9210705256884119
Fold 4: MAPE = 0.1165140622464136 , RMSE = 4.530246923991567, R2 = 0.7648513798082044
Fold 5: MAPE = 0.11758748410672942 , RMSE = 2.9792543977488966, R2 = 0.8923016506861502
Average MAPE: 0.10689747359268957
Average RMSE: 3.251528140807076
Average R2: 0.8692274850474506
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.11677498980676197 , RMSE = 2.9644403554674676, R2 = 0.8790708492809097
Fold 2: MAPE = 0.11266583253695137 , RMSE = 3.0785031600831765, R2 = 0.8585748624357683
Fold 3: MAPE = 0.09403716779916324 , RMSE = 2.4970301079531496, R2 = 0.916405487997179
Fold 4: MAPE = 0.1281844867019102 , RMSE = 3.682

Fold 1: MAPE = 0.09270144231729273 , RMSE = 2.604001753664416, R2 = 0.9123655045482779
Fold 2: MAPE = 0.09789638336196976 , RMSE = 2.3844124673532296, R2 = 0.9066914236832714
Fold 3: MAPE = 0.1139042593235551 , RMSE = 5.156623763087617, R2 = 0.7674886199571297
Fold 4: MAPE = 0.10739043975381193 , RMSE = 3.0099212062127436, R2 = 0.8878633735736325
Fold 5: MAPE = 0.10378015536078863 , RMSE = 2.584373081195746, R2 = 0.9214904244197835
Average MAPE: 0.10313453602348363
Average RMSE: 3.14786645430275
Average R2: 0.8791798692364189
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.09577096323598995 , RMSE = 4.13802076121706, R2 = 0.822178252201933
Fold 2: MAPE = 0.10109867037651414 , RMSE = 2.423693181136903, R2 = 0.9061569677708279
Fold 3: MAPE = 0.10716205747010252 , RMSE = 3.7510573728238334, R2 = 0.8447404996121902
Fold 4: MAPE = 0.10327723750755859 , RMSE = 2.5516717

Fold 1: MAPE = 0.1034905215331819 , RMSE = 3.041031307714836, R2 = 0.8785169372236009
Fold 2: MAPE = 0.1136891453982005 , RMSE = 2.8415521680787084, R2 = 0.9067445538691035
Fold 3: MAPE = 0.11767063104025925 , RMSE = 3.4017253276728683, R2 = 0.8474385904668847
Fold 4: MAPE = 0.10299307916742562 , RMSE = 3.2475424503735915, R2 = 0.8788268127409364
Fold 5: MAPE = 0.09398635789472859 , RMSE = 3.051884358227632, R2 = 0.900224391995722
Average MAPE: 0.10636594700675919
Average RMSE: 3.1167471224135275
Average R2: 0.8823502572592495
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.0994131742341517 , RMSE = 2.6879650288314685, R2 = 0.9144332319200791
Fold 2: MAPE = 0.11263565237736939 , RMSE = 4.380397991741744, R2 = 0.8004290546256568
Fold 3: MAPE = 0.11117407876072645 , RMSE = 3.151881801351616, R2 = 0.905641454199614
Fold 4: MAPE = 0.1244422834086498 , RMSE = 3.6477465

Fold 1: MAPE = 0.09456529020660358 , RMSE = 3.1691292036986933, R2 = 0.8946577714612411
Fold 2: MAPE = 0.1330788556826197 , RMSE = 3.730713026828328, R2 = 0.7783054320463091
Fold 3: MAPE = 0.10511751157872512 , RMSE = 3.4021080082919686, R2 = 0.8560849307582371
Fold 4: MAPE = 0.08134687840450033 , RMSE = 2.4091064989354174, R2 = 0.9369995327775812
Fold 5: MAPE = 0.10928395963778406 , RMSE = 3.046518414805165, R2 = 0.8963182081736565
Average MAPE: 0.10467849910204656
Average RMSE: 3.151515030511914
Average R2: 0.8724731750434049
Feature Importance: [0.03700862 0.02312171 0.06894361 0.08441484 0.26315826 0.02063493
 0.0509517  0.03300228 0.03883713 0.09991075 0.02060098 0.25941518]
Fold 1: MAPE = 0.08479877270715815 , RMSE = 3.0053762574102056, R2 = 0.8950946740888837
Fold 2: MAPE = 0.09855100969744504 , RMSE = 2.446079980476026, R2 = 0.9273440295525041
Fold 3: MAPE = 0.09595412182321629 , RMSE = 2.6655576743544107, R2 = 0.9149441334881585
Fold 4: MAPE = 0.11351237346333783 , RMSE = 3.54

In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from pandas import read_csv
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Load Boston Housing dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
data = read_csv('housing.xls', header=None, delimiter=r"\s+", names=column_names)
data = data.drop(columns=['CHAS','ZN'])
X = data.drop(columns=['MEDV'])
y = data['MEDV']

# Initialize XGBoost model
# model = XGBRegressor()

# parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
#               'objective':['reg:linear'],
#               'learning_rate': [.03, 0.05, .07], #so called `eta` value
#               'max_depth': [5, 6, 7],
#               'min_child_weight': [4],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [500]}

# model = GridSearchCV(model,
#                         parameters,
#                         cv = 2,
#                         n_jobs = 5,
#                         verbose=True)

# model.fit(X_train,y_train)

# print(model.best_score_)
# print(model.best_params_)

# {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
model = xgb.XGBRegressor(**params) 

avg_r2 = []

for idx in range(100):

    # Define K-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)

    mape_scores, rmse_scores, r2_scores = [], [], []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict
        predictions = model.predict(X_test)

        # Calculate MAPE
        mape = np.mean(np.abs((y_test-predictions)/y_test))
        mape_scores.append(mape)

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rmse_scores.append(rmse)

        # Calculate R2
        r2 = r2_score(y_test, predictions)
        r2_scores.append(r2)

    # Output each fold's performance 
    for i in range(5):
        print(f"Fold {i+1}: MAPE = {mape_scores[i]} , RMSE = {rmse_scores[i]}, R2 = {r2_scores[i]}")
    # Output the average performance for 5 folds
    print(f"Average MAPE: {np.mean(mape_scores)}")
    print(f"Average RMSE: {np.mean(rmse_scores)}")
    print(f"Average R2: {np.mean(r2_scores)}")
    avg_r2.append(np.mean(r2_scores)) 

    # Calculate feature importance
    model.fit(X, y)
    feature_importance = model.feature_importances_
    print("Feature Importance:", feature_importance)

print(f"Average 100 R2: {np.mean(avg_r2)}")

# drop CHAS and ZN R2 = 0.9547
# Average 100 R2: 0.8797

Fold 1: MAPE = 0.10941617994049695 , RMSE = 3.301840758985487, R2 = 0.8837050483750992
Fold 2: MAPE = 0.11990467466186047 , RMSE = 3.5949762692544787, R2 = 0.8587330905952991
Fold 3: MAPE = 0.09636950683857468 , RMSE = 2.61854538567524, R2 = 0.9223568979514596
Fold 4: MAPE = 0.09662993102513447 , RMSE = 4.181058492899588, R2 = 0.7327353193086701
Fold 5: MAPE = 0.10870441821481377 , RMSE = 2.851355552729418, R2 = 0.8984820188769406
Average MAPE: 0.10620494213617607
Average RMSE: 3.3095552919088425
Average R2: 0.8592024750214937
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.09463341740375404 , RMSE = 2.734722449153064, R2 = 0.9167224291157294
Fold 2: MAPE = 0.09333459037484604 , RMSE = 3.008201579018512, R2 = 0.872173891503734
Fold 3: MAPE = 0.09040588959288802 , RMSE = 3.0446217629941685, R2 = 0.8625307467503462
Fold 4: MAPE = 0.1420636849330305 , RMSE = 2.9190828724698994,

Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.09285607776916463 , RMSE = 2.837722135974653, R2 = 0.866566596360974
Fold 2: MAPE = 0.1424588655132706 , RMSE = 3.576591709236165, R2 = 0.8672379051649224
Fold 3: MAPE = 0.10100577043311812 , RMSE = 2.7303555144310856, R2 = 0.9017221433811284
Fold 4: MAPE = 0.08954016601002589 , RMSE = 3.036902082144434, R2 = 0.9051667116759048
Fold 5: MAPE = 0.09788647060784234 , RMSE = 3.2738017836194713, R2 = 0.8779164178037009
Average MAPE: 0.10474947006668431
Average RMSE: 3.091074645081162
Average R2: 0.8837219548773263
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.11836738162562313 , RMSE = 3.1605519982399883, R2 = 0.8905692816352394
Fold 2: MAPE = 0.11646085449980577 , RMSE = 3.311124041365274, R2 = 0.8692423235259259
Fold 

Fold 1: MAPE = 0.08880872618399643 , RMSE = 2.850769330206219, R2 = 0.9062210980301821
Fold 2: MAPE = 0.1510613302677618 , RMSE = 3.2698536656464467, R2 = 0.8832570633552157
Fold 3: MAPE = 0.09526391863636909 , RMSE = 4.219093058975981, R2 = 0.7991769437525374
Fold 4: MAPE = 0.09791258633784901 , RMSE = 3.2846435503551934, R2 = 0.8294299971462549
Fold 5: MAPE = 0.1038157494493616 , RMSE = 2.5308420638133806, R2 = 0.927024201967126
Average MAPE: 0.10737246217506757
Average RMSE: 3.2310403337994438
Average R2: 0.8690218608502633
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.0941805434625164 , RMSE = 3.0122824241644803, R2 = 0.8997896021444836
Fold 2: MAPE = 0.10363161571892628 , RMSE = 2.714534189151328, R2 = 0.9116054873067849
Fold 3: MAPE = 0.11526443006359748 , RMSE = 3.3648640970363175, R2 = 0.8704680677454527
Fold 4: MAPE = 0.08880496105557699 , RMSE = 2.765189308872715

Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.1128761872364365 , RMSE = 2.75949782749309, R2 = 0.9164593362321215
Fold 2: MAPE = 0.1001736481907614 , RMSE = 2.8306661991296567, R2 = 0.9074278575543268
Fold 3: MAPE = 0.14031917779330993 , RMSE = 4.0929230152776634, R2 = 0.7871302763643506
Fold 4: MAPE = 0.10535181010754578 , RMSE = 4.251784465082742, R2 = 0.7720990628508546
Fold 5: MAPE = 0.09993307486618838 , RMSE = 2.876839072692835, R2 = 0.9032243951637772
Average MAPE: 0.11173077963884841
Average RMSE: 3.362342115935198
Average R2: 0.8572681856330862
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.10748331136045662 , RMSE = 2.6549279087208757, R2 = 0.8938751298842365
Fold 2: MAPE = 0.11379128825852844 , RMSE = 4.786195462819639, R2 = 0.7240122718623223
Fold 3

Fold 1: MAPE = 0.09791067660191137 , RMSE = 2.649751473703108, R2 = 0.9049328689490576
Fold 2: MAPE = 0.12350456858068085 , RMSE = 3.6146460152110054, R2 = 0.8285515351850818
Fold 3: MAPE = 0.09895378326778193 , RMSE = 2.7789034804606536, R2 = 0.9241283825978182
Fold 4: MAPE = 0.11107187397939423 , RMSE = 2.88215677996241, R2 = 0.8965449077004245
Fold 5: MAPE = 0.10709019437446457 , RMSE = 4.564651894885455, R2 = 0.7630865742145755
Average MAPE: 0.1077062193608466
Average RMSE: 3.298021928844526
Average R2: 0.8634488537293915
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.10861533860032647 , RMSE = 3.1501265365224067, R2 = 0.8676690199335384
Fold 2: MAPE = 0.09306567723780292 , RMSE = 2.619618446835551, R2 = 0.9185767257450882
Fold 3: MAPE = 0.09828910423493672 , RMSE = 2.702048525041866, R2 = 0.9296851023636664
Fold 4: MAPE = 0.10467139108552082 , RMSE = 4.5101598767438915

Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.10377987218661328 , RMSE = 2.602269196067316, R2 = 0.91361770132802
Fold 2: MAPE = 0.08652942945890729 , RMSE = 2.792254526885744, R2 = 0.9136454488513802
Fold 3: MAPE = 0.10719903062177633 , RMSE = 3.558766995974068, R2 = 0.8405180395204216
Fold 4: MAPE = 0.09517698337974188 , RMSE = 2.883935248420446, R2 = 0.9026951989020895
Fold 5: MAPE = 0.13098671771617043 , RMSE = 3.4447389789100438, R2 = 0.865617538718286
Average MAPE: 0.10473440667264185
Average RMSE: 3.0563929892515236
Average R2: 0.8872187854640394
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.1041711506314928 , RMSE = 2.7597481062768336, R2 = 0.9250404080648587
Fold 2: MAPE = 0.10220543473510386 , RMSE = 2.7183489671173904, R2 = 0.8878187759423912
Fold 3

Fold 1: MAPE = 0.11113917409195247 , RMSE = 3.7122565309588893, R2 = 0.8832085406022836
Fold 2: MAPE = 0.10476164196168151 , RMSE = 2.816776813317424, R2 = 0.9031275313804314
Fold 3: MAPE = 0.09593623453509288 , RMSE = 2.7565186618631543, R2 = 0.8971349112926141
Fold 4: MAPE = 0.11563289405906446 , RMSE = 3.5827776516266474, R2 = 0.801598838436531
Fold 5: MAPE = 0.10336267805394725 , RMSE = 2.91617427829106, R2 = 0.8942366568052302
Average MAPE: 0.10616652454034772
Average RMSE: 3.156900787211435
Average R2: 0.8758612957034181
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.09748505295089069 , RMSE = 3.2751256000321534, R2 = 0.8880914034076268
Fold 2: MAPE = 0.11163631830813878 , RMSE = 3.7713154990862705, R2 = 0.8465212202580632
Fold 3: MAPE = 0.12460131363328189 , RMSE = 2.813391879644671, R2 = 0.878984206462939
Fold 4: MAPE = 0.10728906879741838 , RMSE = 3.135438572549545

Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.11473267565890158 , RMSE = 3.5468961764068534, R2 = 0.7961335904929921
Fold 2: MAPE = 0.13654013266993476 , RMSE = 3.633356463081505, R2 = 0.8613950701556732
Fold 3: MAPE = 0.09728071174115945 , RMSE = 2.6984023717934638, R2 = 0.9235735192281851
Fold 4: MAPE = 0.08647432168719102 , RMSE = 2.5952973756859485, R2 = 0.919930126827367
Fold 5: MAPE = 0.10133043305335342 , RMSE = 3.045870137180753, R2 = 0.888567260788898
Average MAPE: 0.10727165496210804
Average RMSE: 3.1039645048297047
Average R2: 0.8779199134986231
Feature Importance: [0.04689124 0.11248615 0.06841633 0.24031155 0.02076505 0.04842381
 0.02982849 0.04719615 0.0657042  0.02164141 0.2983356 ]
Fold 1: MAPE = 0.11834939278691975 , RMSE = 2.945513434954236, R2 = 0.8688544165397569
Fold 2: MAPE = 0.11037522614370947 , RMSE = 2.857266669202697, R2 = 0.8560204634485113
Fold