In [21]:
import os
import time
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

from lightgbm import LGBMClassifier

In [23]:
# 1. Load data
# NOTE(review): the recorded cell output echoes these two read_csv lines, which
# is how Python warnings are displayed — presumably a pandas DtypeWarning about
# mixed-type columns. Confirm, and pin `dtype=` for the affected columns.
train = pd.read_csv("../gen/data_def_train_folds.csv")
test  = pd.read_csv("../gen/data_def_test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)


# 2. Create combined column for Phase 2 (intrinsic + extrinsic)
def _combine_label_columns(df, columns=("labels_1_intrinsic", "labels_2_extrinsic"), sep="|"):
    """Concatenate several label-string columns into one `sep`-separated Series.

    Missing values are treated as empty strings, the columns are joined in the
    given order with `sep`, and leading/trailing separators (left behind when
    a column is empty for a row) are stripped.

    Parameters
    ----------
    df : DataFrame containing every column named in `columns`.
    columns : column names to join, in order.
    sep : separator placed between the columns' values.

    Returns
    -------
    Series of combined label strings, indexed like `df`.
    """
    filled = df[list(columns)].fillna("")
    combined = filled[columns[0]]
    # Element-wise string concatenation gives the same text as a row-wise
    # `sep.join(...)` without calling Python once per row.
    for column in columns[1:]:
        combined = combined + sep + filled[column]
    return combined.str.strip(sep)


# The same helper is applied to both splits so they cannot drift apart.
train["labels_1_2_intr_extr"] = _combine_label_columns(train)
test["labels_1_2_intr_extr"] = _combine_label_columns(test)

# 3. Target – NOVA group
target = "nova_group"

# The encoder learns its classes from the training targets only; the test
# targets are then mapped with those same fitted classes.
le = LabelEncoder()
y_train = le.fit_transform(train[target])
y_test = le.transform(test[target])

# Show which original class value corresponds to which encoded integer.
print(
    "Class mapping:",
    {label: code for label, code in zip(le.classes_, le.transform(le.classes_))},
)

# 4. Phases and best hyperparameters (per phase)
# Phase name -> DataFrame column holding that phase's '|'-separated labels.
# Insertion order is the order in which the phases are processed.
phases = dict(
    [
        ("phase1_intrinsic", "labels_1_intrinsic"),
        ("phase2_intr+extr", "labels_1_2_intr_extr"),
        ("phase3_all_labels", "labels_string"),
    ]
)

# Best settings per phase. All three phases share learning rate, depth and
# number of trees; only `num_leaves` differs (30, 30, 50).
best_params_by_phase = {
    phase: {
        "learning_rate": 0.1,
        "max_depth": 5,
        "n_estimators": 300,
        "num_leaves": num_leaves,
    }
    for phase, num_leaves in [
        ("phase1_intrinsic", 30),
        ("phase2_intr+extr", 30),
        ("phase3_all_labels", 50),
    ]
}

# Settings common to every phase; merged with the per-phase settings when the
# classifier is constructed.
base_lgb_params = dict(
    objective="multiclass",
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
    force_row_wise=True,
)

# 5. Containers for results
# One independent list per kind of result collected inside the phase loop.
all_metrics, coverage_stats, feature_importances = [], [], []

# 6. Loop over phases – fit/evaluate only

# Loop-invariant setup, done once up front so the run fails fast (before any
# model is trained) if the output directory cannot be created or the
# `core_slice` column is missing.
os.makedirs("../results/feature_importance", exist_ok=True)

train_total = len(y_train)
test_total  = len(y_test)
# NOTE(review): astype(bool) turns missing values into True — confirm that
# `core_slice` contains no NaN.
core_mask   = test["core_slice"].astype(bool).values
core_total  = int(core_mask.sum())


def _split_labels(cell):
    """Split a '|'-separated label string into a list; '' gives an empty list."""
    return cell.split("|") if cell != "" else []


def _coverage_fraction(n_with_labels, n_total):
    """Return n_with_labels / n_total, or NaN when n_total is 0."""
    return n_with_labels / n_total if n_total > 0 else np.nan


for phase_name, label_col in phases.items():
    print("\n" + "="*90)
    print(f"=== {phase_name} – Using column '{label_col}' ===")
    print("="*90)

    # 6.1 Encode label features
    # Missing label strings are stored back as "" so the split below is safe.
    train[label_col] = train[label_col].fillna("")
    test[label_col]  = test[label_col].fillna("")

    train_labels = train[label_col].apply(_split_labels)
    test_labels  = test[label_col].apply(_split_labels)

    # The label vocabulary is learned from the training rows only.
    mlb = MultiLabelBinarizer(sparse_output=True)
    X_train_labels = mlb.fit_transform(train_labels).astype(np.float32)
    X_test_labels  = mlb.transform(test_labels).astype(np.float32)

    print(f"Encoded label matrix shape (train): {X_train_labels.shape}")
    print(f"Encoded label matrix shape (test):  {X_test_labels.shape}")

    # ---- PhaseCoverage masks (at least 1 label in this phase)
    # Row sums of the indicator matrix, flattened to a 1-D boolean mask.
    train_nonempty_mask = np.array(X_train_labels.sum(axis=1)).ravel() > 0
    test_nonempty_mask  = np.array(X_test_labels.sum(axis=1)).ravel() > 0

    train_cov_n = int(train_nonempty_mask.sum())
    test_cov_n  = int(test_nonempty_mask.sum())
    core_cov_n  = int((test_nonempty_mask & core_mask).sum())

    # Fractions are computed once and shared by the stored stats and the
    # printed summary, so an empty slice gives NaN in both places instead of
    # raising ZeroDivisionError in the print.
    train_frac = _coverage_fraction(train_cov_n, train_total)
    test_frac  = _coverage_fraction(test_cov_n, test_total)
    core_frac  = _coverage_fraction(core_cov_n, core_total)

    # Store coverage stats
    for cov_dataset, cov_total, cov_n, cov_frac in (
        ("Train", train_total, train_cov_n, train_frac),
        ("Test", test_total, test_cov_n, test_frac),
        ("Test-Core", core_total, core_cov_n, core_frac),
    ):
        coverage_stats.append({
            "Phase": phase_name,
            "Dataset": cov_dataset,
            "n_total": cov_total,
            "n_with_labels": cov_n,
            "frac_with_labels": cov_frac,
        })

    print(
        f"Coverage – {phase_name}:\n"
        f"  Train:     {train_cov_n}/{train_total} "
        f"({train_frac:.3%} with ≥1 label in this phase)\n"
        f"  Test:      {test_cov_n}/{test_total} "
        f"({test_frac:.3%} with ≥1 label in this phase)\n"
        f"  Test-Core: {core_cov_n}/{core_total} "
        f"({core_frac:.3%} with ≥1 label in this phase)"
    )

    # 6.2 Fit LightGBM with best params for this phase
    # Per-phase settings are merged last, so they win on any key clash.
    lgb_params = {**base_lgb_params, **best_params_by_phase[phase_name]}
    lgb = LGBMClassifier(**lgb_params)

    fit_start = time.perf_counter()
    lgb.fit(X_train_labels, y_train)  # train on ALL rows
    fit_end = time.perf_counter()
    fit_time = fit_end - fit_start

    print(f"Fitted LightGBM for {phase_name} in {fit_time:.2f} s")

    # 6.3 Feature importance for this phase
    # One feature per label class learned by the binarizer.
    feature_names = mlb.classes_

    # Split-based importance (default)
    imp_split = lgb.feature_importances_.astype(float)

    # Gain-based importance via booster
    booster = lgb.booster_
    imp_gain = booster.feature_importance(importance_type="gain").astype(float)

    fi_df = pd.DataFrame({
        "feature": feature_names,
        "importance_split": imp_split,
        "importance_gain": imp_gain,
        "Phase": phase_name,
    })

    # Normalized importance (each column divided by its own total)
    fi_df["importance_split_norm"] = fi_df["importance_split"] / fi_df["importance_split"].sum()
    fi_df["importance_gain_norm"]  = fi_df["importance_gain"]  / fi_df["importance_gain"].sum()

    # Save per-phase feature importance (directory was created before the loop)
    fi_path = f"../results/feature_importance/lgb_nova_feature_importance_{phase_name}.csv"
    fi_df.to_csv(fi_path, sep=";", index=False)
    print(f"Saved feature importance for {phase_name} → {fi_path}")

    feature_importances.append(fi_df)

    # 6.4 Predictions: Train–Overall, Test–Overall, Test–Core
    # Train–Overall
    y_pred_train = lgb.predict(X_train_labels)

    # Test–Overall
    y_pred_test = lgb.predict(X_test_labels)

    # Test–Core subset
    y_test_core = y_test[core_mask]
    y_pred_test_core = y_pred_test[core_mask]

    # PhaseCoverage subsets (at least 1 label in this phase)
    y_train_cov = y_train[train_nonempty_mask]
    y_pred_train_cov = y_pred_train[train_nonempty_mask]

    y_test_cov = y_test[test_nonempty_mask]
    y_pred_test_cov = y_pred_test[test_nonempty_mask]

    # 6.5 Attach predictions back to DataFrames (model-specific cols)
    # `_enc` columns hold the encoded integers; the plain columns hold the
    # original class values recovered through the label encoder.
    train[f"{phase_name}_lgb_pred_enc"] = y_pred_train
    test[f"{phase_name}_lgb_pred_enc"] = y_pred_test

    train[f"{phase_name}_lgb_pred"] = le.inverse_transform(y_pred_train)
    test[f"{phase_name}_lgb_pred"] = le.inverse_transform(y_pred_test)
    # 6.6 Compute metrics
    def compute_metrics(y_true, y_pred, phase, dataset_name):
        """Build one metrics row for a (phase, dataset) slice.

        Parameters
        ----------
        y_true, y_pred : true and predicted class labels; passed unchanged to
            the sklearn metric functions.
        phase, dataset_name : identifiers copied into the "Phase" and
            "Dataset" fields of the row.

        Returns
        -------
        dict with the keys "Phase", "Dataset", "n_samples", "Accuracy",
        "Balanced Accuracy", "Precision (Macro)", "Recall (Macro)",
        "F1 (Macro)" and "F1 (Micro)", in that order. For an empty slice
        `n_samples` is 0 and every score is NaN.
        """
        # NOTE(review): this def sits inside the phase loop, so it is re-created
        # on every iteration; it uses only its arguments and the imported
        # metric functions, so it could live at module level.
        n_samples = len(y_true)
        row = {"Phase": phase, "Dataset": dataset_name, "n_samples": n_samples}

        # Nothing to score: fill every metric with NaN and skip the sklearn calls.
        if n_samples == 0:
            for score_name in (
                "Accuracy",
                "Balanced Accuracy",
                "Precision (Macro)",
                "Recall (Macro)",
                "F1 (Macro)",
                "F1 (Micro)",
            ):
                row[score_name] = np.nan
            return row

        # zero_division=0 scores a class with no predicted/true samples as 0.
        macro = dict(average="macro", zero_division=0)
        row["Accuracy"] = accuracy_score(y_true, y_pred)
        row["Balanced Accuracy"] = balanced_accuracy_score(y_true, y_pred)
        row["Precision (Macro)"] = precision_score(y_true, y_pred, **macro)
        row["Recall (Macro)"] = recall_score(y_true, y_pred, **macro)
        row["F1 (Macro)"] = f1_score(y_true, y_pred, **macro)
        row["F1 (Micro)"] = f1_score(y_true, y_pred, average="micro", zero_division=0)
        return row

    # Score every evaluation slice in a fixed order: train slices first, then test.
    evaluation_slices = (
        ("Train-Overall", y_train, y_pred_train),
        ("Train-PhaseCoverage", y_train_cov, y_pred_train_cov),
        ("Test-Overall", y_test, y_pred_test),
        ("Test-PhaseCoverage", y_test_cov, y_pred_test_cov),
        ("Test-Core", y_test_core, y_pred_test_core),
    )
    for slice_name, slice_true, slice_pred in evaluation_slices:
        all_metrics.append(compute_metrics(slice_true, slice_pred, phase_name, slice_name))

    # Print quick summary for this phase (the entries appended just above)
    print("\nMetrics –", phase_name)
    for row in all_metrics[-len(evaluation_slices):]:
        summary = ", ".join([
            f"n={row['n_samples']}",
            f"Acc={row['Accuracy']:.4f}",
            f"BalAcc={row['Balanced Accuracy']:.4f}",
            f"F1_macro={row['F1 (Macro)']:.4f}",
        ])
        print(f"{row['Dataset']}: {summary}")

# 7. Combine and inspect metrics across phases & datasets
# One row per (phase, dataset) in both tables.
metrics_df = pd.DataFrame(all_metrics)
coverage_df = pd.DataFrame(coverage_stats)

print("\n=== All Metrics (Train-Overall, Test-Overall, Test-Core, PhaseCoverage) ===")
print(metrics_df)

print("\n=== Coverage stats per phase & dataset ===")
print(coverage_df)

# 8. Save outputs
results_dir = "../results"
# Create the directory here so this step does not depend on the phase loop
# having created it as a side effect.
os.makedirs(results_dir, exist_ok=True)

# File name -> frame to write; the same mapping drives the summary printed
# below, so the listing cannot drift from what was actually written.
outputs = {
    "lgb_nova_eval_metrics_train_test_core.csv": metrics_df,
    "lgb_nova_phase_coverage_stats.csv": coverage_df,
    "data_def_train_with_lgb_nova_preds.csv": train,
    "data_def_test_with_lgb_nova_preds.csv": test,
}
for filename, frame in outputs.items():
    frame.to_csv(f"{results_dir}/{filename}", sep=";", index=False)

# Report the directory that was really written to ("../results", not "./results").
print(f"\nFiles written to {results_dir}/:")
for filename in outputs:
    print(f" - {filename}")

# 9. Combined feature-importance file across phases
# Skipped when no phase produced an importance table (pd.concat needs at
# least one frame).
if feature_importances:
    fi_all_path = "../results/feature_importance/lgb_nova_feature_importance_all_phases.csv"
    os.makedirs(os.path.dirname(fi_all_path), exist_ok=True)
    fi_all = pd.concat(feature_importances, ignore_index=True)
    fi_all.to_csv(fi_all_path, sep=";", index=False)
    print(f"Saved combined feature importance for all phases → {fi_all_path}")

  train = pd.read_csv("../gen/data_def_train_folds.csv")
  test  = pd.read_csv("../gen/data_def_test.csv")


Train shape: (206068, 24)
Test shape: (88315, 23)
Class mapping: {1: 0, 2: 1, 3: 2, 4: 3}

=== phase1_intrinsic – Using column 'labels_1_intrinsic' ===




Encoded label matrix shape (train): (206068, 359)
Encoded label matrix shape (test):  (88315, 359)
Coverage – phase1_intrinsic:
  Train:     80574/206068 (39.101% with ≥1 label in this phase)
  Test:      34289/88315 (38.826% with ≥1 label in this phase)
  Test-Core: 4875/4876 (99.979% with ≥1 label in this phase)
Fitted LightGBM for phase1_intrinsic in 2.53 s
Saved feature importance for phase1_intrinsic → ../results/feature_importance/lgb_nova_feature_importance_phase1_intrinsic.csv





Metrics – phase1_intrinsic
Train-Overall: n=206068, Acc=0.5679, BalAcc=0.2835, F1_macro=0.2488
Train-PhaseCoverage: n=80574, Acc=0.6631, BalAcc=0.3725, F1_macro=0.3827
Test-Overall: n=88315, Acc=0.5676, BalAcc=0.2842, F1_macro=0.2502
Test-PhaseCoverage: n=34289, Acc=0.6631, BalAcc=0.3786, F1_macro=0.3920
Test-Core: n=4876, Acc=0.6852, BalAcc=0.4113, F1_macro=0.4299

=== phase2_intr+extr – Using column 'labels_1_2_intr_extr' ===




Encoded label matrix shape (train): (206068, 808)
Encoded label matrix shape (test):  (88315, 808)
Coverage – phase2_intr+extr:
  Train:     185867/206068 (90.197% with ≥1 label in this phase)
  Test:      79571/88315 (90.099% with ≥1 label in this phase)
  Test-Core: 4876/4876 (100.000% with ≥1 label in this phase)
Fitted LightGBM for phase2_intr+extr in 2.98 s
Saved feature importance for phase2_intr+extr → ../results/feature_importance/lgb_nova_feature_importance_phase2_intr+extr.csv





Metrics – phase2_intr+extr
Train-Overall: n=206068, Acc=0.6223, BalAcc=0.3818, F1_macro=0.3934
Train-PhaseCoverage: n=185867, Acc=0.6237, BalAcc=0.3931, F1_macro=0.4058
Test-Overall: n=88315, Acc=0.6178, BalAcc=0.3779, F1_macro=0.3887
Test-PhaseCoverage: n=79571, Acc=0.6191, BalAcc=0.3890, F1_macro=0.4012
Test-Core: n=4876, Acc=0.7262, BalAcc=0.4526, F1_macro=0.4734

=== phase3_all_labels – Using column 'labels_string' ===




Encoded label matrix shape (train): (206068, 870)
Encoded label matrix shape (test):  (88315, 870)
Coverage – phase3_all_labels:
  Train:     206068/206068 (100.000% with ≥1 label in this phase)
  Test:      88302/88315 (99.985% with ≥1 label in this phase)
  Test-Core: 4876/4876 (100.000% with ≥1 label in this phase)
Fitted LightGBM for phase3_all_labels in 3.15 s
Saved feature importance for phase3_all_labels → ../results/feature_importance/lgb_nova_feature_importance_phase3_all_labels.csv





Metrics – phase3_all_labels
Train-Overall: n=206068, Acc=0.6237, BalAcc=0.3841, F1_macro=0.3968
Train-PhaseCoverage: n=206068, Acc=0.6237, BalAcc=0.3841, F1_macro=0.3968
Test-Overall: n=88315, Acc=0.6187, BalAcc=0.3796, F1_macro=0.3913
Test-PhaseCoverage: n=88302, Acc=0.6187, BalAcc=0.3797, F1_macro=0.3913
Test-Core: n=4876, Acc=0.7313, BalAcc=0.4724, F1_macro=0.5019

=== All Metrics (Train-Overall, Test-Overall, Test-Core, PhaseCoverage) ===
                Phase              Dataset  n_samples  Accuracy  \
0    phase1_intrinsic        Train-Overall     206068  0.567919   
1    phase1_intrinsic  Train-PhaseCoverage      80574  0.663092   
2    phase1_intrinsic         Test-Overall      88315  0.567650   
3    phase1_intrinsic   Test-PhaseCoverage      34289  0.663070   
4    phase1_intrinsic            Test-Core       4876  0.685193   
5    phase2_intr+extr        Train-Overall     206068  0.622280   
6    phase2_intr+extr  Train-PhaseCoverage     185867  0.623666   
7    phase2_intr