# Getting Started
- `conda create -n [env_name]`
- `conda activate [env_name]`
- `pip install -r ./requirements.txt`
- `conda update --all`
- `conda install -c conda-forge xgboost`

In [None]:
import ast
import numpy as np
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, cross_validate
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
# Load the raw table from disk. The cells below rely on it having an 'ID'
# column and a 'configuration' column; all remaining columns are treated as features.
dataset = pd.read_csv('data/dummy.csv')

# Feature side: every column except the raw configuration string ('ID' is kept).
df_features_with_id = dataset.drop(columns=['configuration']).copy()

# Label side: only the ID and its configuration string. Judging by the name,
# an ID may occur on several rows -- TODO confirm against the data.
df_configurations_with_id_with_duplicates = dataset.loc[:, ['ID', 'configuration']].copy()

In [None]:
# TODO: possibly drop the "or" merging of labels. My assumption is that this makes the vectors too dense.
# TODO: Analyze the passes for mutual exclusivity or other dependencies to include this information in the loss function.

# Turn each stringified configuration ("[1,0,1,...]") into an integer vector.
df_configurations_with_id_with_duplicates['config_arr'] = (
    df_configurations_with_id_with_duplicates['configuration']
    .apply(lambda text: np.array(ast.literal_eval(text), dtype=int))
)

# Collapse all rows that share an ID into one label vector by OR-ing their
# configuration vectors element-wise.
vectors_by_id = df_configurations_with_id_with_duplicates.groupby('ID')['config_arr']
merged_vectors = vectors_by_id.agg(lambda group: np.bitwise_or.reduce(group.tolist()))
df_ored_configuration = merged_vectors.rename('label_vec').reset_index()

df_ored_configuration = df_ored_configuration.loc[:, ['ID', 'label_vec']].copy()

# One feature row per ID (drop_duplicates keeps the first occurrence by default).
df_features_unique = df_features_with_id.drop_duplicates(subset='ID')

In [None]:
# Columns that hold one single value across all unique-ID rows carry no
# information for the model (dropna=False: NaN counts as a value too).
single_value_cols = df_features_unique.columns[df_features_unique.nunique(dropna=False) == 1]

# One feature row per ID, indexed by ID, in order of first appearance in the dataset.
df_feat = df_features_with_id.drop_duplicates('ID').set_index('ID')
# 'ID' is the index at this point, so it must not appear in the drop list
# (it would if the data contained only a single ID).
df_feat = df_feat.drop(columns=[col for col in single_value_cols if col != 'ID'])

# The label frame was produced by groupby('ID'), which sorts its result by ID,
# whereas df_feat keeps first-appearance order. Select the labels in df_feat's
# order so that row i of X and row i of y always describe the same ID.
# .loc raises a KeyError if a feature ID has no label row instead of silently
# misaligning the two matrices.
df_lbl = df_ored_configuration.set_index('ID').loc[df_feat.index]
y = np.vstack(df_lbl['label_vec'].values)

X = df_feat.values

In [None]:
# Configuration

# Hyper-parameter optimisation switches. Neither value is read by the cells
# shown in this notebook yet -- presumably reserved for a later HPO step.
DO_HPO = True
HPO_NUM_TRIALS = 40

# Number of folds for the outer (evaluation) and inner splitters.
N_OUTER_FOLDS = 10
N_INNER_FOLDS = 5

# Shared seed for the classifier and both CV splitters.
RANDOM_STATE = 43

# Keyword arguments forwarded to XGBClassifier for the baseline model.
BASELINE_PARAMS = dict(n_estimators=100)

In [None]:

from sklearn.metrics import make_scorer, hamming_loss

# Base learner that MultiOutputClassifier clones for each label column.
base_clf = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    # NOTE(review): multi_strategy='multi_output_tree' is meant to let one tree
    # serve all labels and thereby learn label dependencies. Wrapped in
    # MultiOutputClassifier, however, every clone is fitted on a single label
    # column, so no cross-label information can reach it. To actually use
    # multi-output trees the XGBClassifier would have to be fitted on the full
    # label matrix directly; otherwise this parameter can go -- TODO decide.
    multi_strategy='multi_output_tree',
    random_state=RANDOM_STATE,
    **BASELINE_PARAMS,
)
# One independent classifier per label, fitted in parallel on all available cores.
multiLabelClassifier = MultiOutputClassifier(estimator=base_clf, n_jobs=-1)

# Shuffled, multilabel-stratified K-fold splitters that differ only in their
# fold count; both use the same seed. outer_cv drives the evaluation below,
# inner_cv is not used by the cells shown here (presumably reserved for HPO).
outer_cv, inner_cv = (
    MultilabelStratifiedKFold(n_splits=fold_count, shuffle=True, random_state=RANDOM_STATE)
    for fold_count in (N_OUTER_FOLDS, N_INNER_FOLDS)
)

# Metrics evaluated on every outer fold.
scoring = {
    # For multilabel targets scikit-learn's accuracy is subset accuracy:
    # a sample only counts as correct if all of its labels match.
    "accuracy": "accuracy",
    "f1_micro": "f1_micro",
    "f1_macro": "f1_macro",
    # Hamming loss is a loss (lower is better). With greater_is_better=False
    # the scorer sign-flips it, so cv_results stores the NEGATIVE loss.
    "hamming": make_scorer(hamming_loss, greater_is_better=False)
}
# Metrics whose scorer negates the raw value; flipped back for reporting only.
negated_metrics = {"hamming"}

cv_results = cross_validate(
    multiLabelClassifier, X, y,
    cv=outer_cv,
    scoring=scoring,
    return_train_score=False,
    n_jobs=-1
)
# TODO: feature importance study
print("Outer-fold scores:")
for name in scoring:
    fold_scores = cv_results[f"test_{name}"]
    if name in negated_metrics:
        # Undo the scorer's sign flip so the report shows the actual loss
        # instead of a negative number. cv_results itself is left untouched.
        fold_scores = -fold_scores
    mean = fold_scores.mean()
    std  = fold_scores.std()
    print(f"  {name:>10s}: {mean:7.3f} ± {std:.3f}")

In [None]:
# Fit the model on the complete data set. cross_validate above scores clones of
# the estimator and leaves multiLabelClassifier itself unfitted, so this is the
# first time the instance is trained.
multiLabelClassifier.fit(X, y)

In [None]:
# TODO: Add evaluation code against default optimization routine