In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import os
import tensorflow as tf
# Optional: install mafese and mealpy if not present
# !pip install mealpy mafese
from mafese.wrapper.mha import MultiMhaSelector
# Process-wide environment flags, applied in a single update.
# NOTE(review): XGB_USE_CPP_API does not look like a documented XGBoost
# setting - confirm it is actually needed.
# CUDA_VISIBLE_DEVICES='0' selects the first GPU device.
os.environ.update({
    'XGB_USE_CPP_API': '0',
    'CUDA_VISIBLE_DEVICES': '0',
})
def rrmse(y_true, y_pred):
    """Relative RMSE: root-mean-squared error divided by the mean of ``y_true``.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, compared against ``y_true`` via
            ``mean_squared_error``.

    Returns:
        ``sqrt(MSE(y_true, y_pred)) / mean(y_true)``. No guard is applied for
        a zero mean.
    """
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    return root_mse / np.mean(y_true)
def arrmse(y_true, y_pred):
    """Average the per-target ``rrmse`` over every column of the targets.

    Args:
        y_true: Observed targets, indexed as ``y_true[:, j]`` (one column per
            target), so it must support NumPy-style 2-D indexing.
        y_pred: Predictions indexed the same way.

    Returns:
        Mean of the column-wise ``rrmse`` scores.
    """
    per_target = [
        rrmse(y_true[:, col], y_pred[:, col])
        for col in range(y_true.shape[1])
    ]
    return np.mean(per_target)


In [None]:
# Load the AQ-Bench table and integer-encode every object-dtype column except
# 'dataset', which is kept as text because it is used later to select the
# train/val/test rows.
df = pd.read_csv('/kaggle/input/aq-bench/AQbench_dataset.csv')
str_columns = [
    col for col in df.select_dtypes(include=['object']).columns
    if col != 'dataset'
]
label_encoders = {}  # column name -> fitted LabelEncoder, kept for inverse transforms
for col in str_columns:
    le = LabelEncoder()
    # Fill missing values BEFORE casting to str. Casting first stringifies
    # missing entries (as 'nan' on pandas < 3), so a fillna placed after the
    # cast never fires. With this order missing values are always encoded
    # under the explicit 'NaN' label, whatever the pandas version.
    df[col] = le.fit_transform(df[col].fillna('NaN').astype(str))
    label_encoders[col] = le
def sine_cosine_encode(values, period=None):
    """Encode a cyclic quantity as a (sin, cos) pair.

    Each value ``v`` is mapped to the angle ``2 * pi * v / period`` and the
    sine and cosine of that angle are returned.

    Args:
        values: Array-like of numeric values.
        period: Length of one full cycle, in the same units as ``values``.
            When ``None``, the maximum of ``values`` is used. Note that with
            this default, 0 and the maximum map to the same point, so pass
            the true period explicitly whenever it is known.

    Returns:
        Tuple ``(sin_values, cos_values)`` of NumPy arrays.

    Raises:
        ValueError: If the period (given or derived) is zero or not finite.
            Such a period would otherwise silently produce NaN features.
    """
    values_array = np.asarray(values)
    if period is None:
        period = values_array.max()

    # Reject periods that would make the division below yield inf/NaN.
    period_check = np.asarray(period, dtype=float)
    if not np.all(np.isfinite(period_check)) or np.any(period_check == 0):
        raise ValueError(
            f'period must be finite and non-zero, got {period!r}'
        )

    angle = 2 * np.pi * values_array / period
    return np.sin(angle), np.cos(angle)
# Swap the raw longitude column for its sin/cos encoding over a period of 360.
df['lonx'], df['lony'] = sine_cosine_encode(df['lon'], period=360)
df = df.drop(columns='lon')

# The variables table labels each column as a model 'input' or a 'target'.
var_df = pd.read_csv('/kaggle/input/aq-bench/AQbench_variables.csv')
role = var_df['input_target']
input_cols = [
    name
    for name in var_df.loc[role == 'input', 'column_name'].tolist()
    if name != 'lon'  # raw longitude was dropped above
]
input_cols += ['lonx', 'lony']
target_cols = var_df.loc[role == 'target', 'column_name'].tolist()

# Select features/targets for each split recorded in the 'dataset' column.
split = df['dataset']
x_train = df.loc[split == 'train', input_cols]
y_train = df.loc[split == 'train', target_cols]
x_test = df.loc[split == 'test', input_cols]
y_test = df.loc[split == 'test', target_cols]
x_val = df.loc[split == 'val', input_cols]
y_val = df.loc[split == 'val', target_cols]

# The split marker is no longer needed once the subsets exist.
df = df.drop(columns='dataset')
x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_val.shape, y_val.shape


In [None]:
def train_xgboost_multi_target_gpu(x_train, y_train, x_val, y_val):
    """Fit one GPU XGBoost regressor per target column, with early stopping.

    Args:
        x_train: Training features.
        y_train: Training targets; must expose ``.columns`` and column access
            by name (one model is trained per column).
        x_val: Validation features, used for early stopping and prediction.
        y_val: Validation targets with the same columns as ``y_train``; must
            expose ``.shape`` and ``.values``.

    Returns:
        Tuple ``(models, predictions_val)``: a dict mapping each target name
        to its fitted ``XGBRegressor``, and an array shaped like ``y_val``
        holding the validation predictions (column order follows
        ``y_train.columns``).

    Notes:
        * ``early_stopping_rounds`` is passed to the constructor: passing it
          to ``fit()`` was deprecated in XGBoost 1.6 and is rejected by
          recent releases.
        * ``tree_method='hist'`` with ``device='cuda'`` replaces the
          deprecated ``gpu_hist`` / ``gpu_predictor`` settings and requires
          XGBoost >= 2.0.
        * The validation set drives early stopping AND the printed score, so
          that score is optimistic; use the test split for a final estimate.
    """
    models = {}
    predictions_val = np.zeros(y_val.shape)
    for i, target in enumerate(y_train.columns):
        print(f'Training {target}...')
        model = XGBRegressor(
            tree_method='hist',
            device='cuda',
            n_estimators=100,
            early_stopping_rounds=10,
            verbosity=1,
        )
        model.fit(
            x_train,
            y_train[target],
            eval_set=[(x_val, y_val[target])],
            verbose=False,
        )
        models[target] = model
        predictions_val[:, i] = model.predict(x_val)
    average_val_rrmse = arrmse(y_val.values, predictions_val)
    print(f'Average validation RRMSE: {average_val_rrmse}')
    return models, predictions_val


In [None]:
# MAFESE multi-MHA feature selection with XGBoost.
# For each optimizer: select a feature subset on the training split, train one
# XGBoost regressor per target on that subset, and record the validation score.
# NOTE(review): `MultiMhaSelector.SUPPORT['optimizer']` and the `verbose`
# constructor argument are taken as-is - confirm them against the installed
# mafese version.
optimizers = MultiMhaSelector.SUPPORT['optimizer']
results = {}  # optimizer name -> validation score from arrmse
for opt in optimizers:
    print(f'Running {opt}...')
    selector = MultiMhaSelector(problem='regression', obj_name='RMSE', estimator='xgb',
                                list_optimizers=[opt], verbose=False)
    # NOTE(review): y_train.values holds all target columns at once - confirm
    # the selector's estimator accepts multi-output targets.
    selector.fit(x_train.values, y_train.values)
    selected_indices = selector.selected_feature_indexes
    if hasattr(selected_indices, 'tolist'):
        selected_indices = selected_indices.tolist()
    # Indices refer to the columns of x_train.values, so map them through
    # x_train.columns (the frame actually passed to fit).
    feat_subset = [x_train.columns[i] for i in selected_indices]
    if not feat_subset:
        # A model cannot be trained on zero features; skip this optimizer
        # instead of aborting the whole comparison.
        print(f'{opt} selected no features; skipping.')
        continue
    xgb_x_train = x_train[feat_subset]
    xgb_x_val   = x_val[feat_subset]
    preds = []
    for tgt in y_train.columns:
        # tree_method='hist' + device='cuda' replaces the deprecated
        # gpu_hist / gpu_predictor settings (requires XGBoost >= 2.0).
        model = XGBRegressor(tree_method='hist', device='cuda', n_estimators=100)
        model.fit(xgb_x_train, y_train[tgt])
        pred = model.predict(xgb_x_val)
        preds.append(pred)
    preds = np.column_stack(preds)
    val_score = arrmse(y_val.values, preds)
    print(f'{opt} validation ARRMSE: {val_score}')
    results[opt] = val_score
results
