In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import os
import tensorflow as tf
!pip install mafese mealpy plotly==6.3.0

# Process-wide environment flags, applied in one call.
# - CUDA_VISIBLE_DEVICES='0' restricts CUDA-aware libraries to the first GPU.
#   NOTE(review): this runs after `import tensorflow` above; presumably still
#   effective as long as no GPU has been initialised yet -- confirm.
# - NOTE(review): XGB_USE_CPP_API is not referenced anywhere else in this
#   notebook; confirm that the installed xgboost actually reads it.
os.environ.update({
    'XGB_USE_CPP_API': '0',
    'CUDA_VISIBLE_DEVICES': '0',
})

def rrmse(y_true, y_pred):
    """Relative RMSE: root-mean-squared error scaled by the mean of ``y_true``.

    Args:
        y_true: Observed values for a single target, in any form accepted by
            ``mean_squared_error`` and ``np.mean``.
        y_pred: Predicted values for the same samples.

    Returns:
        ``sqrt(MSE(y_true, y_pred)) / mean(y_true)``. The denominator is not
        guarded, so the score is only meaningful when the mean of ``y_true``
        is non-zero.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred)) / np.mean(y_true)

def arrmse(y_true, y_pred):
    """Average relative RMSE over all target columns.

    Args:
        y_true: Observed targets, shape ``(n_samples, n_targets)``. May be a
            NumPy array, a pandas DataFrame (such as the ``y_val`` split built
            in this notebook) or any other 2-D array-like. A 1-D vector is
            treated as a single target column.
        y_pred: Predicted targets with the same layout as ``y_true``.

    Returns:
        The mean of ``rrmse`` computed column by column.
    """
    # Convert up front: positional slicing such as ``obj[:, i]`` is valid on
    # ndarrays but raises on DataFrames, which is what the splits are here.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)

    # Promote a single target vector to one column so ``shape[1]`` exists.
    if true_arr.ndim == 1:
        true_arr = true_arr.reshape(-1, 1)
    if pred_arr.ndim == 1:
        pred_arr = pred_arr.reshape(-1, 1)

    per_target = [
        rrmse(true_arr[:, i], pred_arr[:, i])
        for i in range(true_arr.shape[1])
    ]
    return np.mean(per_target)


In [None]:
df = pd.read_csv('/kaggle/input/aq-bench/AQbench_dataset.csv')

# Integer-encode every string column except 'dataset', which is kept as text
# because it is compared against 'train' / 'test' / 'val' to build the splits.
str_columns = [
    col for col in df.select_dtypes(include=['object']).columns
    if col != 'dataset'
]

# One fitted encoder per column, kept so the codes can be mapped back later.
label_encoders = {}
for col in str_columns:
    le = LabelEncoder()
    # Fill missing values BEFORE casting: astype(str) turns NaN into the
    # literal string 'nan', after which fillna has nothing left to replace.
    df[col] = le.fit_transform(df[col].fillna('NaN').astype(str))
    label_encoders[col] = le
def sine_cosine_encode(values, period=None):
    """Map values onto the unit circle as a (sine, cosine) pair.

    Args:
        values: Array-like of numeric values to encode.
        period: Length of one full cycle. When ``None``, the maximum of
            ``values`` is used as the period.

    Returns:
        Tuple ``(sin_values, cos_values)`` of NumPy arrays holding
        ``sin(2*pi*values/period)`` and ``cos(2*pi*values/period)``.
    """
    arr = np.array(values)
    cycle = arr.max() if period is None else period
    angle = 2 * np.pi * arr / cycle
    return np.sin(angle), np.cos(angle)
# Replace raw longitude by its cyclical encoding with a 360-degree period:
# 'lonx' holds the sine component, 'lony' the cosine component.
df['lonx'], df['lony'] = sine_cosine_encode(df['lon'], period=360)
df = df.drop(columns='lon')

# The variables table labels each column as model input or prediction target.
var_df = pd.read_csv('/kaggle/input/aq-bench/AQbench_variables.csv')
is_input = var_df['input_target'] == 'input'
is_target = var_df['input_target'] == 'target'
not_lon = var_df['column_name'] != 'lon'

# Raw 'lon' is excluded by the filter; its two encoded columns take its place.
input_cols = var_df.loc[is_input & not_lon, 'column_name'].tolist()
input_cols.extend(['lonx', 'lony'])
target_cols = var_df.loc[is_target, 'column_name'].tolist()

# Row masks for the split labels stored in the 'dataset' column.
in_train = df['dataset'] == 'train'
in_test = df['dataset'] == 'test'
in_val = df['dataset'] == 'val'

x_train, y_train = df.loc[in_train, input_cols], df.loc[in_train, target_cols]
x_test, y_test = df.loc[in_test, input_cols], df.loc[in_test, target_cols]
x_val, y_val = df.loc[in_val, input_cols], df.loc[in_val, target_cols]

# The split marker is no longer needed once the partitions exist.
df = df.drop(columns='dataset')
x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_val.shape, y_val.shape


In [None]:
from mafese.wrapper.mha import MultiMhaSelector
# Limiting to a selected subset of significant MHAs
selected_mhas = [
    'OriginalGA',
    'OriginalPSO',
    'OriginalACO',
    'OriginalABC',
    'OriginalCS',
    'OriginalWOA',
    'OriginalGWO',
]

# Maps each optimizer name to its validation RRMSE averaged over all targets.
mha_scores = {}
for mha in selected_mhas:
    print(f'Evaluating MHA: {mha}')
    scores = []
    for target in y_train.columns:
        # Wrapper feature selection for this single target, driven by one MHA.
        # NOTE(review): confirm that the installed mafese version accepts
        # `verbose` in the constructor and exposes `selected_feature_indexes`
        # on a fitted MultiMhaSelector.
        selector = MultiMhaSelector(
            problem='regression',
            obj_name='RMSE',
            estimator='xgb',
            list_optimizers=[mha],
            verbose=False,
        )
        selector.fit(x_train.values, y_train[target].values)
        selected_indices = selector.selected_feature_indexes

        # Positions refer to the column order of x_train, i.e. input_cols.
        feature_subset = [input_cols[i] for i in selected_indices]
        xgb_x_train = x_train.loc[:, feature_subset]
        xgb_x_val = x_val.loc[:, feature_subset]

        # Refit a fresh XGBoost model on the chosen features only, then score
        # it on the validation split.
        model = XGBRegressor(
            tree_method='hist',
            device='cuda',
            n_estimators=100,
            verbosity=1,
        )
        model.fit(xgb_x_train, y_train[target])
        preds = model.predict(xgb_x_val)
        score = rrmse(y_val[target].values, preds)
        scores.append(score)

    avg_score = np.mean(scores)
    mha_scores[mha] = avg_score
    print(f'{mha} average validation RRMSE: {avg_score}')

print('MHA evaluation results:', mha_scores)
