In [None]:
import json, pathlib, logging, gc, os, numpy as np, pandas as pd, random
import optuna, torch
import seaborn as sns
import matplotlib.pyplot as plt
import datetime


from sklearn.metrics import f1_score,classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.linear_model import LogisticRegression
from huggingface_hub import create_repo, upload_folder, login, notebook_login
from utils_dl import *

In [None]:
# --- Directories (relative to the notebook's working directory) ---
SPLITS_PATH   = "new_data/splits"
CONFIG_PATH   = "config"
SUBTASK1_PATH = "new_data/subtask1"

# --- Input files ---
CONFIG_JSON  = os.path.join(CONFIG_PATH, "ensemble_config.json")
SUBTASK1_CSV = os.path.join(SUBTASK1_PATH, "subtask1_train_rev002.csv")
# Ids of the held-out split; per the author's note this is training-set data
# that the base models have never seen.
VAL_CSV      = os.path.join(SPLITS_PATH, "test_ids.csv")

# --- Ensemble behaviour ---
MODE           = "stack"   # "soft" or "stack"
SEARCH_WEIGHTS = True      # only for soft mode
WEIGHT_TRIALS  = 30        # Optuna trials for the soft-vote weight search
STACK_FOLDS    = 5         # folds of the StratifiedGroupKFold used when stacking
STACK_TRIALS   = 150       # Optuna trials for the meta-learner search
SEED           = 42        # other seeds noted by the author: 123 550 101

# --- Output files ---
VAL_OUT_NPY  = "val_preds.npy"
TEST_OUT_NPY = "test_preds.npy"

TASK1_OUT_PATH = "new_data/subtask1/submisions"
TASK1_OUT_CSV  = "track_1_predictions.csv"

# Human-readable name of whatever produces the final predictions.
if MODE == "stack":
    MODEL_CHECKPOINT = "Meta-Learner (Logistic Regression)"
else:
    MODEL_CHECKPOINT = MODE

In [None]:
# Seed first so everything below runs with SEED
# (set_global_seed is not defined in this notebook — presumably it comes from utils_dl).
set_global_seed(SEED)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Interactive Hugging Face Hub login (huggingface_hub.notebook_login).
notebook_login()

In [None]:
# Ensemble configuration: the model list plus the text/label/group column names.
with open(CONFIG_JSON) as cfg_file:
    cfg = json.load(cfg_file)

# One ModelBundle per configured model; optional keys fall back to the defaults below.
bundles = [
    ModelBundle(
        name=spec["name"],
        path=spec["path"],
        device=DEVICE,
        batch_size=spec.get("batch_size", 32),
        pad_multiple_of=spec.get("pad_multiple_of", 8),
        fixed_max_length=spec.get("fixed_max_length"),
        preprocessing_config=spec.get("preprocessing_config"),
    )
    for spec in cfg["models"]
]

subtask1_df = pd.read_csv(SUBTASK1_CSV, encoding="utf-8")
unique_val_ids_df = pd.read_csv(VAL_CSV, encoding="utf-8")

# Keep only the rows whose id is listed in the held-out split file.
is_held_out = subtask1_df["id"].isin(unique_val_ids_df["id"])
val_df = subtask1_df.loc[is_held_out].reset_index(drop=True)

text_col = cfg["text_col"]
label_col = cfg["label_col"]
group_col = cfg["group_col"]

# labels: ids are the positions of the sorted label values found in val_df
unique_labels = sorted(val_df[label_col].unique())
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

# Replace the label values in val_df by their integer ids.
val_df[label_col] = val_df[label_col].map(label2id)


In [None]:
# Competition data: the public test file for track 1.
test_df = pd.read_csv(
    os.path.join(SUBTASK1_PATH, "track_1_public_test_rev002.csv"),
    encoding="utf-8",
)

### Preprocessing

In [None]:
# Per-model cache of preprocessed DataFrames: built once here, then reused
# in CV and in the final prediction.
preprocessed_val = {}
preprocessed_test = {}


def _with_clean_lyrics(frame, conf):
    """Return a copy of ``frame`` with a ``lyrics_clean`` column added.

    ``lyrics_clean`` is ``text_preprocess`` applied to every value of the
    ``lyrics`` column, using the ``lang``, ``remove_duplicates`` and ``cased``
    entries of ``conf``. The input frame is not modified.
    """
    cleaned = frame.copy()
    cleaned["lyrics_clean"] = cleaned["lyrics"].apply(
        lambda text: text_preprocess(
            text,
            lang=conf["lang"],
            remove_duplicates=conf["remove_duplicates"],
            cased=conf["cased"],
        )
    )
    return cleaned


for b in bundles:
    # Each model carries its own preprocessing settings, hence one cache entry per model name.
    preprocessed_val[b.name] = _with_clean_lyrics(val_df, b.preprocessing_config)
    preprocessed_test[b.name] = _with_clean_lyrics(test_df, b.preprocessing_config)

### Model Learning - Ensemble (Soft Voting and Stack)

In [None]:
# weighted soft-vote
def soft_vote_predict(df, weights=None):
    """Predict class ids from a (weighted) average of the base models' softmax outputs.

    Args:
        df: Selects which cached inputs are scored. Only object identity is
            checked: the module-level ``val_df`` selects ``preprocessed_val``
            and passes ``label_col`` to ``predict_logits``; any other object
            selects ``preprocessed_test`` and passes ``None`` as label column.
        weights: Optional sequence with one weight per entry of ``bundles``.
            ``None`` means a uniform average. Weights are used as given (no
            normalisation); a common positive scale does not change the argmax.

    Returns:
        1-D NumPy array with the predicted class index of every row.

    Raises:
        ValueError: if ``weights`` does not contain exactly one value per bundle.
    """
    is_val = df is val_df
    source = preprocessed_val if is_val else preprocessed_test
    # Deliberately NOT named ``label_col``: assigning to that name inside the
    # function makes it local, and reading the module-level value on the
    # right-hand side then raises UnboundLocalError.
    labels_arg = label_col if is_val else None

    probs = []
    for b in bundles:
        lg, _ = b.predict_logits(source[b.name], text_col, labels_arg)
        probs.append(torch.softmax(torch.tensor(lg), dim=1).numpy())
    probs = np.stack(probs)  # (num_models, num_samples, num_classes)

    if weights is None:
        weights = np.ones(len(bundles)) / len(bundles)  # uniform average
    else:
        # Accept lists/tuples as well as arrays.
        weights = np.asarray(weights, dtype=float)
        if weights.shape != (len(bundles),):
            raise ValueError(
                f"expected {len(bundles)} weights (one per model), got shape {weights.shape}"
            )

    # Broadcast one weight per model over (samples, classes), sum over the models,
    # then take the most probable class per sample.
    weighted = probs * weights[:, None, None]
    return weighted.sum(axis=0).argmax(axis=1)

def optimise_soft_weights():
    """Search soft-vote weights that maximise macro-F1 on the validation split.

    The base models are run once on their cached validation frames; each Optuna
    trial then only re-weights those fixed probabilities. Running inference
    inside the objective would repeat identical work in every trial.

    Returns:
        1-D NumPy array with one weight per entry of ``bundles``, normalised to
        sum to 1 (the same weights the objective scored).
    """
    n_models = len(bundles)
    y_true = val_df[label_col]

    # Loop-invariant part of the objective: per-model class probabilities,
    # stacked to (num_models, num_samples, num_classes).
    val_probs = np.stack([
        torch.softmax(
            torch.tensor(
                b.predict_logits(preprocessed_val[b.name], text_col, label_col)[0]
            ),
            dim=1,
        ).numpy()
        for b in bundles
    ])

    def _normalise(raw):
        # All-zero samples cannot be normalised (0/0 -> NaN); fall back to uniform.
        total = raw.sum()
        if total <= 0:
            return np.full(len(raw), 1.0 / len(raw))
        return raw / total

    def objective(trial):
        # sample raw, unnormalized weights
        raw = np.array([
            trial.suggest_float(f"w{i}", 0.0, 1.0)
            for i in range(n_models)
        ])
        weights = _normalise(raw)

        # soft-vote with normalized weights
        preds = (val_probs * weights[:, None, None]).sum(axis=0).argmax(axis=1)
        return f1_score(y_true, preds, average="macro")

    st = optuna.create_study(direction="maximize",
                             sampler=optuna.samplers.TPESampler(seed=SEED))
    st.optimize(objective, n_trials=WEIGHT_TRIALS, show_progress_bar=False)

    # Return the weights in the normalised form the objective used.
    best_w = _normalise(np.array([st.best_params[f"w{i}"] for i in range(n_models)]))
    print("best soft weights:", np.round(best_w, 4))
    return best_w

def _softmax_probs(bundle, frame, labels_arg):
    """Run ``bundle`` on ``frame`` and return its softmax probabilities as a NumPy array."""
    logits, _ = bundle.predict_logits(frame, text_col, labels_arg)
    return torch.softmax(torch.tensor(logits), dim=1).numpy()


def stacked_predict(val_df):
    """Fit a logistic-regression meta-learner on the base models' probabilities.

    Steps:
      1. Build the meta-feature matrix: for every row of ``val_df``, the
         softmax probabilities of each model in ``bundles``, side by side.
      2. Tune the ``LogisticRegression`` hyper-parameters with Optuna, scoring
         macro-F1 under ``StratifiedGroupKFold`` cross-validation.
      3. Refit the best configuration on the full matrix.

    Args:
        val_df: Labelled frame providing ``label_col`` and ``group_col``. Its
            rows must be positionally aligned with the cached frames in
            ``preprocessed_val``, because rows are selected with ``.iloc``.

    Returns:
        ``(predict_fn, best_params)``: ``predict_fn(df)`` returns the
        meta-learner's class predictions (the ``val_df`` object itself selects
        the validation cache, anything else selects ``preprocessed_test``);
        ``best_params`` is the Optuna best-parameter dict.

    Raises:
        ValueError: if ``bundles`` is empty.
    """
    if not bundles:
        raise ValueError("stacked_predict needs at least one model bundle.")

    # Meta-learner targets and CV grouping
    y      = val_df[label_col].values
    groups = val_df[group_col].values
    sgkf   = StratifiedGroupKFold(n_splits=STACK_FOLDS,
                                  shuffle=True, random_state=SEED)

    # Meta-feature matrix. The base models are only *applied* here (never refit),
    # so the folds merely partition the rows; each row is filled exactly once.
    # The width is taken from the first probability block instead of assuming
    # two classes.
    oof = None
    n_classes = None
    for _, va in sgkf.split(val_df, y, groups):
        for m_ix, b in enumerate(bundles):
            probs = _softmax_probs(b, preprocessed_val[b.name].iloc[va], label_col)
            if oof is None:
                n_classes = probs.shape[1]
                oof = np.zeros((len(val_df), n_classes * len(bundles)),
                               dtype=np.float32)
            oof[va, n_classes * m_ix : n_classes * (m_ix + 1)] = probs

    # HPO
    def objective(trial):
        solver = trial.suggest_categorical("solver",
                                           ["liblinear", "saga", "lbfgs"])
        # The penalty choices depend on the solver, hence one parameter name per solver.
        if solver == "liblinear":
            penalty = trial.suggest_categorical("pen_lib", ["l2", "l1"])
            l1_ratio = None
        elif solver == "saga":
            penalty = trial.suggest_categorical("pen_saga",
                                                ["l2", "l1", "elasticnet"])
            l1_ratio = (trial.suggest_float("l1_ratio", 0.1, 0.9)
                        if penalty == "elasticnet" else None)
        else:
            penalty, l1_ratio = "l2", None

        C = trial.suggest_float("C", 1e-3, 10.0, log=True)

        if trial.suggest_categorical("cw_type", ["balanced", "custom"]) == "balanced":
            cw = "balanced"
        else:
            # Custom weighting only re-weights class 0 relative to class 1.
            w0 = trial.suggest_float("w0", 1.0, 6.0)
            cw = {0: w0, 1: 1.0}

        meta = LogisticRegression(
            solver=solver,
            penalty=penalty,
            C=C,
            l1_ratio=l1_ratio,
            class_weight=cw,
            max_iter=2000,
            random_state=SEED,
        )

        # Cross-validated predictions of the meta-learner over the whole matrix.
        preds = np.zeros_like(y)
        for tr, va in sgkf.split(oof, y, groups):
            meta.fit(oof[tr], y[tr])
            preds[va] = meta.predict(oof[va])
        return f1_score(y, preds, average="macro")

    study = optuna.create_study(direction="maximize",
                                sampler=optuna.samplers.TPESampler(seed=SEED))
    study.optimize(objective, n_trials=STACK_TRIALS, show_progress_bar=False)
    bp = study.best_params

    # train final meta on the full matrix
    cw_final = ("balanced" if bp["cw_type"] == "balanced"
                else {0: bp["w0"], 1: 1.0})
    final_meta = LogisticRegression(
        solver=bp["solver"],
        # Only the penalty parameter of the chosen solver is present; lbfgs has none -> "l2".
        penalty=bp.get("pen_lib", bp.get("pen_saga", "l2")),
        C=bp["C"],
        l1_ratio=bp.get("l1_ratio"),
        class_weight=cw_final,
        max_iter=2000,
        random_state=SEED,
    ).fit(oof, y)

    def _meta_predict(df_raw):
        """Predict with the fitted meta-learner for the validation or the test cache."""
        is_val = df_raw is val_df
        cache = preprocessed_val if is_val else preprocessed_test
        # Use the configured label column (not a hard-coded 'label'); the local
        # gets its own name so the outer ``label_col`` is not shadowed.
        labels_arg = label_col if is_val else None

        feats = [_softmax_probs(b, cache[b.name], labels_arg) for b in bundles]
        return final_meta.predict(np.concatenate(feats, axis=1))

    return _meta_predict, bp


# Run ensemble
# Reject an unknown MODE before doing any work.
if MODE not in ("soft", "stack"):
    raise ValueError("MODE must be 'soft' or 'stack'.")

if MODE == "stack":
    meta_predict, best_params_meta = stacked_predict(val_df)
    test_preds = meta_predict(test_df)  # Competition data
else:
    # Soft voting: searched weights when requested, otherwise a uniform average (None).
    w_best = None
    if SEARCH_WEIGHTS:
        w_best = optimise_soft_weights()
    test_preds = soft_vote_predict(test_df, w_best)  # Competition data

### Save Predictions

In [None]:
# Map the predicted class ids back to the original label values.
pred_labels = [id2label[i] for i in test_preds]

submission = pd.DataFrame({
    "id": test_df["id"],
    "label": pred_labels
})

# One sub-folder per ensemble mode. to_csv does not create missing directories,
# so create it up front (no-op when it already exists).
out_dir = os.path.join(TASK1_OUT_PATH, MODE)
os.makedirs(out_dir, exist_ok=True)

submission.to_csv(os.path.join(out_dir, TASK1_OUT_CSV), index=False, encoding="utf-8")
print(f"Saved {TASK1_OUT_CSV} with {len(submission)} rows.")

Saved track_1_predictions.csv with 527 rows.
