# XGB 10-fold CV

In [1]:
# Show the kernel's current working directory.
%pwd

'/mnt/g/D-storage/11_Article/03_RStudio/02_PyCode'

In [2]:
# Step up one directory: later cells use paths relative to that folder
# (e.g. "04_Data/98_DatasetWithNoah.csv" and "12_Results0618/importance.csv").
%cd ..

/mnt/g/D-storage/11_Article/03_RStudio


## Import Packages

In [12]:
import numpy as np
import os
import pandas as pd
import pyreadr
import xgboost as xgb

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import r2_score

In [7]:
def getXandStanYnoah(csv_path="04_Data/98_DatasetWithNoah.csv"):
    """Load the dataset and z-score selected variables within each grid cell.

    The CSV is indexed by ``(GridID, time)`` and every row containing a
    missing value is dropped. Each column named in ``aim_variable_list`` is
    then standardized separately for every ``GridID`` as
    ``(x - x.mean()) / x.std()``; all other columns are left unchanged.

    NOTE: a grid whose values for a variable are constant, or that has only
    one remaining row, has a zero/NaN standard deviation, so its standardized
    values come out as NaN.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the input CSV. The default is the path this notebook has
        always read, resolved relative to the current working directory.

    Returns
    -------
    df_output : pandas.DataFrame
        The cleaned frame with the standardized columns.
    X : pandas.DataFrame
        Every column of ``df_output`` except the first one.
    y : pandas.DataFrame
        The first column of ``df_output``, kept as a one-column frame.
    """
    # Chain instead of inplace=True so no intermediate frame is mutated.
    df = pd.read_csv(csv_path).set_index(['GridID', 'time']).dropna()
    df_output = df.copy()
    aim_variable_list = ['lowSpeedDensity',
                         'tair', 'psurf', 'qair', 'wind', 'rainf',
                         'NTL', 'NDVI', 'PBLH']

    def _zscore(values):
        # Standardize one grid's series with that grid's own mean and std.
        return (values - values.mean()) / values.std()

    # Group the untouched frame once; results are written into the copy, so
    # each transform always reads the original (non-standardized) values.
    by_grid = df.groupby('GridID')
    for variable_name in aim_variable_list:
        df_output[variable_name] = by_grid[variable_name].transform(_zscore)

    # First column is the target; everything after it is a feature.
    X = df_output.iloc[:, 1:].copy()
    y = df_output.iloc[:, 0:1].copy()

    return df_output, X, y

## Run 10-Fold Cross-Validation

In [9]:
# Load the cleaned dataset (selected variables standardized per GridID) and
# split it into the feature frame X and the one-column target frame y.
df_output, X, y = getXandStanYnoah()

In [20]:
# Hyperparameters of the regressor, gathered in one mapping so they can be
# read (and compared with other runs) at a glance.
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 3000,
    "learning_rate": 0.3,
    "max_depth": 17,
    "min_child_weight": 2,
    "gamma": 0,
    "subsample": 1,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.2,
    "reg_lambda": 0.5,
    "device": "cuda",
}
xgb_reg = xgb.XGBRegressor(**xgb_params)

# Ten shuffled folds; the fixed seed keeps the fold assignment repeatable.
kf = KFold(shuffle=True, n_splits=10, random_state=42)

In [21]:
# Per-fold R² accumulators: one list for the training rows, one for the
# held-out validation rows.
train_r2_scores, val_r2_scores = [], []

In [22]:
# Start from empty accumulators so that re-running this cell does not append
# a second set of fold scores to lists filled by a previous run.
train_r2_scores = []
val_r2_scores = []

for train_index, val_index in kf.split(X):
    # KFold yields row positions, hence positional slicing with iloc.
    X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit model on this fold's training rows
    xgb_reg.fit(X_train, y_train)

    # Predict on both training and validation data
    y_train_pred = xgb_reg.predict(X_train)
    y_val_pred = xgb_reg.predict(X_val)

    # Compute R² scores; the gap between the two shows how much the model
    # overfits the training rows.
    train_r2 = r2_score(y_train, y_train_pred)
    val_r2 = r2_score(y_val, y_val_pred)

    # Store scores
    train_r2_scores.append(train_r2)
    val_r2_scores.append(val_r2)

In [23]:
# Report the per-fold scores first, then their summary statistics.
# A single print with a newline separator emits one line per entry.
print(
    f"Training R² Scores per Fold: {train_r2_scores}",
    f"Validation R² Scores per Fold: {val_r2_scores}",
    f"Mean Training R²: {np.mean(train_r2_scores):.4f}",
    f"Mean Validation R²: {np.mean(val_r2_scores):.4f}",
    f"Standard Deviation of Validation R²: {np.std(val_r2_scores):.4f}",
    sep="\n",
)

Training R² Scores per Fold: [0.9994272750883809, 0.9993651428207633, 0.9993987598584928, 0.9993743801626253, 0.9994437249773318, 0.9994100251114929, 0.9994021840348112, 0.9993986901215401, 0.9994609266005192, 0.9994509934314462]
Validation R² Scores per Fold: [0.8807305887760921, 0.8821775571172079, 0.8827847970647351, 0.8850222283984392, 0.8849706701055531, 0.8840197392149112, 0.8837678079980943, 0.8863258605204244, 0.8850162673642769, 0.8817167255078056]
Mean Training R²: 0.9994
Mean Validation R²: 0.8837
Standard Deviation of Validation R²: 0.0017


## Check Feature Importance

In [9]:
# Rebuild df_output, X and y with the same loader call that was used before
# the cross-validation.
df_output, X, y = getXandStanYnoah()

In [13]:
# Hold out 10% of the rows (fixed seed) and keep the other 90% for fitting.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=42)

In [14]:
# Hyperparameters of the regressor as a mapping, followed by a single fit on
# the training part of the split.
importance_params = dict(
    objective="reg:squarederror",
    n_estimators=3000,
    learning_rate=0.3,
    max_depth=17,
    min_child_weight=2,
    gamma=0,
    subsample=1,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.5,
    device="cuda",
)
xgb_reg = xgb.XGBRegressor(**importance_params)
xgb_reg.fit(X_train, y_train)

In [17]:
# importance_type='weight': per the XGBoost documentation, the number of
# times each feature is used to split the data across all trees.
feature_importance = xgb_reg.get_booster().get_score(importance_type='weight')

In [20]:
# Turn the feature -> score mapping into a one-column frame ('Importance')
# whose index holds the feature names.
df_importance = pd.DataFrame.from_dict(feature_importance, orient='index', columns=['Importance'])

In [22]:
# Inspect the feature names and the order in which they appear.
df_importance.index

Index(['NTL', 'NDVI', 'PBLH', 'prevalance', 'mortality', 'emergence', 'x', 'y',
       'tair', 'psurf', 'qair', 'wind', 'rainf'],
      dtype='object')

In [24]:
# Relabel the features by NAME rather than by position. Assigning a bare list
# to .index only works while get_score() returns every feature in exactly the
# expected order: a different key order would silently mislabel rows, and a
# feature missing from the result (XGBoost leaves out zero-importance
# features) would raise a length-mismatch error.
feature_labels = {
    'NTL': 'NTL',
    'NDVI': 'NDVI',
    'PBLH': 'PBLH',
    'prevalance': 'Prevalence',
    'mortality': 'Mortality',
    'emergence': 'Emergence',
    'x': 'Longitude',
    'y': 'Latitude',
    'tair': 'Temperature',
    'psurf': 'Air Pressure',
    'qair': 'Humidity',
    'wind': 'Wind Speed',
    'rainf': 'Precipitation',
}
# Index labels not present in the mapping are left unchanged by rename().
df_importance = df_importance.rename(index=feature_labels)

In [27]:
# Row order of the exported table: the five weather variables first, then the
# remaining predictors.
export_order = [
    'Temperature', 'Air Pressure', 'Humidity', 'Wind Speed', 'Precipitation',
    'NTL', 'NDVI', 'PBLH',
    'Prevalence', 'Mortality', 'Emergence',
    'Longitude', 'Latitude',
]
df_importance.loc[export_order, :].to_csv('12_Results0618/importance.csv')