# Model K: RandomForestClassifier with All Features

This notebook trains a **RandomForestClassifier** on all available features (regular + all embeddings) with comprehensive preprocessing:

- ✅ All regular features
- ✅ All embedding families (PCA-compressed)
- ✅ Feature scaling (StandardScaler)
- ✅ 5-fold Cross-Validation
- ✅ Comprehensive Hyperparameter Tuning (Optuna)
- ✅ Threshold Fine-tuning
- ✅ Model Saving
- ✅ Submission.csv Generation
- ✅ OOM Safe with aggressive memory management
- ✅ SMOTETomek for class imbalance

# 📑 Model K - Code Navigation Index

## Quick Navigation

- **[Setup](#1-setup)** - Imports, paths, device configuration, robustness utilities
- **[Data Loading](#2-data-loading--feature-extraction)** - Load and split features
- **[PCA Preprocessing](#3-feature-preprocessing-pca)** - Embedding compression (if applicable)
- **[SMOTETomek](#4-class-imbalance-handling-smotetomek)** - Class imbalance resampling
- **[Feature Scaling](#5-feature-scaling)** - StandardScaler normalization
- **[Cross-Validation](#6-cross-validation--hyperparameter-tuning)** - Hyperparameter optimization
- **[Threshold Tuning](#7-threshold-tuning--final-evaluation)** - Optimal threshold finding
- **[Model Saving](#8-save-model)** - Save model weights and metadata
- **[Submission](#9-generate-submission)** - Generate test predictions

## Model Type: RandomForestClassifier (all features)

## Key Features

- ✅ GPU-friendly with CPU fallback
- ✅ Aggressive garbage collection
- ✅ OOM resistant with chunked processing
- ✅ Kernel panic resistant (signal handlers, checkpoints)
- ✅ Polars-only (no pandas)
- ✅ GPU-friendly PCA (IncrementalTorchPCA option)
- ✅ SMOTETomek for class imbalance
- ✅ Feature scaling & embedding normalization
- ✅ Hyperparameter tuning (Optuna/GridSearchCV)
- ✅ Fine-grained threshold optimization (120+ thresholds)
- ✅ Model weights saved
- ✅ Chunked/batched data processing

## Memory Management

- `cleanup_memory()`: Aggressive GC + GPU cache clearing
- `check_memory_safe()`: Pre-operation memory checks
- `chunked_operation()`: Process large data in chunks
- `safe_operation()`: Retry decorator with OOM handling
- Signal handlers: SIGINT/SIGTERM for graceful shutdown
- Checkpoints: Resume from failures

## Device Handling

- Automatic GPU detection with CPU fallback
- `device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')`
- All tensors moved to device explicitly
- GPU cache cleared aggressively after operations

## 1. Setup

In [None]:
import os
from pathlib import Path
import random
import gc
import numpy as np
import polars as pl
import torch
from typing import Dict, Optional
import sys
import time
import json
import pickle
import signal
import atexit
from functools import wraps
# Reproducibility
# Seeds Python's `random` and NumPy's global RNG. torch's RNG is not seeded here.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# Paths
# Starting at the current working directory, check up to five directory levels
# (cwd and four ancestors) for one that contains a `data/` folder. If none is
# found, the for-else falls back to two levels above the working directory.
current = Path(os.getcwd())
PROJECT_ROOT = current
for _ in range(5):
    if (PROJECT_ROOT / "data").exists():
        break
    PROJECT_ROOT = PROJECT_ROOT.parent
else:
    PROJECT_ROOT = current.parent.parent
MODEL_READY_DIR = PROJECT_ROOT / "data" / "model_ready"
MODEL_SAVE_DIR = PROJECT_ROOT / "models" / "saved_models"
SUBMISSION_DIR = PROJECT_ROOT / "data" / "submission_files"
CHECKPOINT_DIR = PROJECT_ROOT / "data" / "checkpoints" / "modelK"
# Output directories are created eagerly; MODEL_READY_DIR is only read from.
MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)
SUBMISSION_DIR.mkdir(parents=True, exist_ok=True)
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
utils_path = PROJECT_ROOT / "src" / "utils"
print("PROJECT_ROOT:", PROJECT_ROOT)
print("MODEL_READY_DIR:", MODEL_READY_DIR)
# Import PCA utilities
# With USE_TORCH_PCA hard-coded to False the sklearn branch below is always taken.
USE_TORCH_PCA = False
if utils_path.exists():
    # Makes the project's shared helper modules importable by bare module name.
    sys.path.insert(0, str(utils_path))
if USE_TORCH_PCA:
    try:
        from pca_utils import IncrementalTorchPCA
        # Alias so later code can construct either implementation under one name.
        IncrementalPCA = IncrementalTorchPCA
        IS_TORCH_PCA = True
        print("✅ Using PyTorch PCA (GPU-friendly)")
    except ImportError:
        from sklearn.decomposition import IncrementalPCA
        IS_TORCH_PCA = False
        print("⚠️ Using sklearn IncrementalPCA (CPU only)")
else:
    from sklearn.decomposition import IncrementalPCA
    IS_TORCH_PCA = False
    print("✅ Using sklearn IncrementalPCA (memory-efficient)")
# ML libraries
import warnings
# Suppresses every warning category for the rest of the process.
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
# NOTE(review): GridSearchCV / RandomizedSearchCV do not appear to be used by
# the cells visible in this notebook — confirm before removing.
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import f1_score, roc_auc_score, classification_report, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
# Import memory utilities from shared module
try:
    from model_training_utils import cleanup_memory, memory_usage, check_memory_safe
    print("✅ Memory utilities imported from shared module")
except ImportError:
    # Fallback definitions if utils not available
    def cleanup_memory():
        """Run garbage collection and, when CUDA is available, release cached GPU memory."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
            torch.cuda.ipc_collect()
        # Second pass picks up objects freed by the CUDA cleanup above.
        gc.collect()

    def memory_usage():
        """Print the process RSS (and CUDA allocated/reserved memory) in GB.

        Purely informational and best-effort: if `psutil` is missing or a
        query fails, nothing is printed and no exception escapes.
        """
        try:
            import psutil

            rss_gb = psutil.Process(os.getpid()).memory_info().rss / 1024**3
            # Build the whole line first so a failing GPU query cannot leave
            # a half-printed line without its trailing newline.
            line = f"💾 Memory: {rss_gb:.2f} GB (RAM)"
            if torch.cuda.is_available():
                gpu_mem = torch.cuda.memory_allocated() / 1024**3
                gpu_reserved = torch.cuda.memory_reserved() / 1024**3
                line += f" | {gpu_mem:.2f}/{gpu_reserved:.2f} GB (GPU used/reserved)"
            print(line)
        except Exception:
            # Deliberately silent; `Exception` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass

    def check_memory_safe(ram_threshold_gb=0.85, gpu_threshold=0.80):
        """Report whether RAM and GPU usage are below the given thresholds.

        Args:
            ram_threshold_gb: Despite the name, this is compared against a
                ratio: process RSS divided by total system RAM.
            gpu_threshold: Maximum allowed ratio of CUDA-allocated memory to
                total memory of device 0.

        Returns:
            `(is_safe, info)` where `info` has keys `ram_gb`, `ram_ratio`
            and `gpu_ratio`. If the measurement itself fails (for example
            `psutil` is not installed) the result is `(True, {})`.
        """
        try:
            import psutil

            ram_gb = psutil.Process(os.getpid()).memory_info().rss / 1024**3
            total_ram = psutil.virtual_memory().total / 1024**3
            ram_ratio = ram_gb / total_ram if total_ram > 0 else 0
            gpu_ratio = 0
            if torch.cuda.is_available():
                gpu_used = torch.cuda.memory_allocated() / 1024**3
                gpu_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
                gpu_ratio = gpu_used / gpu_total if gpu_total > 0 else 0
            is_safe = ram_ratio < ram_threshold_gb and gpu_ratio < gpu_threshold
            return is_safe, {"ram_gb": ram_gb, "ram_ratio": ram_ratio, "gpu_ratio": gpu_ratio}
        except Exception:
            # An unmeasurable state is reported as safe so callers proceed.
            return True, {}

    print("⚠️ Using fallback memory utilities")
print("PROJECT_ROOT:", PROJECT_ROOT)
memory_usage()
# ============================================================================
# ENHANCED ROBUSTNESS UTILITIES
# ============================================================================
# Module-wide progress record: four stage flags that start out False, plus the
# path of the most recently written checkpoint (None until one is saved).
_checkpoint_state = dict.fromkeys(
    ("pca_complete", "scaling_complete", "cv_complete", "final_model_trained"),
    False,
)
_checkpoint_state["last_saved_checkpoint"] = None
def save_checkpoint(state_name: str, data: dict, checkpoint_dir: Path = None):
    """Pickle `data` to `modelK_checkpoint_<state_name>.pkl`.

    The payload is written to a temporary sibling file and then moved into
    place with `os.replace`, so an interrupted write cannot leave a truncated
    checkpoint under the final name.

    Args:
        state_name: Suffix identifying the checkpoint.
        data: Picklable payload.
        checkpoint_dir: Target directory; defaults to
            `PROJECT_ROOT / "data" / "checkpoints"`. This is not
            `CHECKPOINT_DIR`; `load_checkpoint` uses the same default.

    Failures while writing are reported with a printed warning and are not
    raised.
    """
    if checkpoint_dir is None:
        checkpoint_dir = PROJECT_ROOT / "data" / "checkpoints"
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_path = checkpoint_dir / f"modelK_checkpoint_{state_name}.pkl"
    tmp_path = checkpoint_path.with_name(checkpoint_path.name + ".tmp")
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(data, f)
        # Atomic rename: readers see either the old file or the complete new one.
        os.replace(tmp_path, checkpoint_path)
        _checkpoint_state["last_saved_checkpoint"] = checkpoint_path
        print(f"✅ Checkpoint saved: {checkpoint_path}")
    except Exception as e:
        print(f"⚠️ Failed to save checkpoint: {e}")
        # Remove a partially written temp file, if one was left behind.
        try:
            tmp_path.unlink()
        except OSError:
            pass
def load_checkpoint(state_name: str, checkpoint_dir: Path = None):
    """Return the unpickled contents of a named checkpoint, or None.

    Looks for `modelK_checkpoint_<state_name>.pkl` in `checkpoint_dir`
    (default `PROJECT_ROOT / "data" / "checkpoints"`). None is returned when
    the file is absent or cannot be read; a read failure is printed.
    """
    base_dir = checkpoint_dir if checkpoint_dir is not None else PROJECT_ROOT / "data" / "checkpoints"
    target = base_dir / f"modelK_checkpoint_{state_name}.pkl"
    if not target.exists():
        return None
    try:
        # NOTE: pickle executes code on load; only open checkpoints written by
        # this project.
        with open(target, "rb") as fh:
            payload = pickle.load(fh)
        print(f"✅ Checkpoint loaded: {target}")
        return payload
    except Exception as e:
        print(f"⚠️ Failed to load checkpoint: {e}")
    return None
def safe_operation(operation_name: str, max_retries: int = 3, checkpoint_on_success: bool = False):
    """Decorator factory: run a function with memory checks, retries and an optional checkpoint.

    Args:
        operation_name: Checkpoint name used when `checkpoint_on_success` is set.
        max_retries: Total number of attempts. Values below 1 are treated as 1
            so the wrapped function is always called at least once.
        checkpoint_on_success: If true, the return value is passed to
            `save_checkpoint` as `{"status": "complete", "result": result}`.

    Retry policy:
        * `MemoryError`, or a `RuntimeError` whose message contains
          "out of memory" / "oom": clean up, wait 2 s, retry.
        * Any other `RuntimeError`: re-raised immediately.
        * Any other `Exception`: clean up, wait 1 s, retry.
        The exception from the final attempt is always re-raised.
    """
    attempts = max(1, max_retries)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                is_last = attempt == attempts - 1
                try:
                    is_safe, _ = check_memory_safe(ram_threshold_gb=0.80, gpu_threshold=0.75)
                    if not is_safe:
                        cleanup_memory()
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                        time.sleep(1)
                    result = func(*args, **kwargs)
                except (MemoryError, RuntimeError) as e:
                    error_msg = str(e).lower()
                    # A plain MemoryError usually has an empty message, so the
                    # exception type must count as OOM on its own.
                    is_oom = (
                        isinstance(e, MemoryError)
                        or "out of memory" in error_msg
                        or "oom" in error_msg
                    )
                    if not is_oom or is_last:
                        raise
                    cleanup_memory()
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    time.sleep(2)
                except Exception:
                    if is_last:
                        raise
                    cleanup_memory()
                    time.sleep(1)
                else:
                    # Post-processing is outside the `try`, so a failure here
                    # cannot trigger a re-run of an already successful call.
                    cleanup_memory()
                    if checkpoint_on_success:
                        save_checkpoint(operation_name, {"status": "complete", "result": result})
                    return result

        return wrapper

    return decorator
def chunked_operation(
    data,
    operation_func,
    chunk_size: int = 10000,
    progress_every: int = 10,
    operation_name: str = "operation",
):
    """Apply `operation_func` to consecutive slices of `data` and collect the results.

    Args:
        data: Any object supporting `len()` and slice indexing.
        operation_func: Called once per slice; its return value is appended
            to the result list.
        chunk_size: Initial slice length (must be >= 1).
        progress_every: Print a progress line every this many chunks.
        operation_name: Accepted for interface compatibility; not used.

    Returns:
        List of per-chunk results covering all of `data`, in order.

    Raises:
        ValueError: If `chunk_size` is less than 1.
        MemoryError / RuntimeError: Non-OOM errors are re-raised immediately.
            On OOM the chunk size is halved (floor of 1000) and the same
            position is retried; the error is re-raised once the size can no
            longer shrink.

    Results gathered before an OOM retry are kept. The earlier recursive
    implementation returned only the results for the remaining tail.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    total = len(data)
    results = []
    start = 0
    completed = 0
    while start < total:
        chunk = data[start : start + chunk_size]
        try:
            is_safe, _ = check_memory_safe(ram_threshold_gb=0.85, gpu_threshold=0.80)
            if not is_safe:
                cleanup_memory()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                time.sleep(0.5)
            results.append(operation_func(chunk))
        except (MemoryError, RuntimeError) as e:
            error_msg = str(e).lower()
            is_oom = (
                isinstance(e, MemoryError)
                or "out of memory" in error_msg
                or "oom" in error_msg
            )
            smaller_chunk_size = max(1000, chunk_size // 2)
            if not is_oom or smaller_chunk_size >= chunk_size:
                raise
            cleanup_memory()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # Retry the same position with a smaller slice.
            chunk_size = smaller_chunk_size
            continue
        start += chunk_size
        completed += 1
        remaining = max(0, total - start)
        # Recomputed each time because chunk_size may have shrunk.
        total_chunks = completed + (remaining + chunk_size - 1) // chunk_size
        if completed % progress_every == 0 or remaining == 0:
            print(f"  Progress: {completed}/{total_chunks} chunks ({completed*100//total_chunks}%)")
        del chunk
        if completed % 5 == 0:
            cleanup_memory()
    return results
def emergency_cleanup():
    """Best-effort memory cleanup, registered below to run at interpreter exit.

    Errors are swallowed on purpose because this runs during shutdown, but
    only `Exception` subclasses: KeyboardInterrupt and SystemExit propagate.
    """
    try:
        cleanup_memory()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print("✅ Emergency cleanup completed")
    except Exception:
        pass


atexit.register(emergency_cleanup)
def signal_handler(signum, frame):
    """Signal handler: write an "emergency" checkpoint, clean up, then raise KeyboardInterrupt.

    Args:
        signum: Number of the received signal (stored in the checkpoint).
        frame: Current stack frame supplied by the `signal` module; unused.
    """
    print(f"⚠️ Received signal {signum}, saving checkpoint...")
    save_checkpoint("emergency", {"status": "signal_received", "signal": signum})
    emergency_cleanup()
    raise KeyboardInterrupt


try:
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
except Exception as e:
    # signal.signal raises ValueError when called outside the main thread.
    # The notebook still works without the handlers, so report and continue.
    print(f"⚠️ Could not install signal handlers: {e}")
print("✅ Enhanced robustness utilities loaded")
def safe_prediction(predict_func, *args, **kwargs):
    """Call `predict_func`, chunking over the `X` keyword argument when needed.

    Args:
        predict_func: Prediction callable, e.g. `model.predict_proba`.
        *args: Forwarded unchanged.
        **kwargs: Forwarded unchanged. Chunking only applies when the input
            is passed as the keyword `X`.

    Returns:
        The prediction result. On a chunked path the per-chunk outputs are
        joined with `np.concatenate`.

    Behaviour:
        * `X` with more than 50000 rows is predicted in chunks of 10000.
        * On OOM (any `MemoryError`, or a `RuntimeError` mentioning
          "out of memory"/"oom") the whole input is re-predicted in chunks of
          5000. Other errors, and OOM without an `X` keyword, are re-raised.

    The caller's kwargs are never modified. The earlier implementation
    overwrote `kwargs["X"]` with each chunk, so the OOM fallback re-predicted
    only the last chunk.
    """
    full_input = kwargs.get("X")

    def _predict_in_chunks(chunk_size, cleanup_every):
        # Work on a copy so the full input stays available to the fallback.
        call_kwargs = dict(kwargs)
        parts = []
        for n_done, start in enumerate(range(0, len(full_input), chunk_size)):
            call_kwargs["X"] = full_input[start : start + chunk_size]
            parts.append(predict_func(*args, **call_kwargs))
            if n_done % cleanup_every == 0:
                cleanup_memory()
        return np.concatenate(parts)

    try:
        is_safe, _ = check_memory_safe(ram_threshold_gb=0.85, gpu_threshold=0.80)
        if not is_safe:
            cleanup_memory()
        if full_input is not None and len(full_input) > 50000:
            return _predict_in_chunks(chunk_size=10000, cleanup_every=5)
        return predict_func(*args, **kwargs)
    except (MemoryError, RuntimeError) as e:
        error_msg = str(e).lower()
        is_oom = (
            isinstance(e, MemoryError)
            or "out of memory" in error_msg
            or "oom" in error_msg
        )
        if not is_oom or full_input is None:
            raise
        cleanup_memory()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return _predict_in_chunks(chunk_size=5000, cleanup_every=1)


print("✅ Training robustness wrappers loaded")





## 2. Data Loading & Feature Extraction

In [None]:
def load_parquet_split(split: str) -> pl.DataFrame:
    """Read the model-ready parquet file for `split` from MODEL_READY_DIR.

    `<split>_model_ready.parquet` is preferred; if it does not exist,
    `<split>_model_ready_reduced.parquet` is used. Any failure, including a
    FileNotFoundError when neither file exists, is printed and re-raised.
    """
    try:
        primary = MODEL_READY_DIR / f"{split}_model_ready.parquet"
        reduced = MODEL_READY_DIR / f"{split}_model_ready_reduced.parquet"
        if primary.exists():
            path = primary
        elif reduced.exists():
            path = reduced
        else:
            raise FileNotFoundError(f"Could not find {split} data")
        print(f"Loading {split} from {path}")
        return pl.read_parquet(path)
    except Exception as e:
        print(f"❌ Error loading {split}: {e}")
        raise
def split_features_reg_and_all_emb(df: pl.DataFrame):
    """Separate a frame into regular features, per-family embeddings and labels.

    Columns `id` and `label` are never features. A column whose name starts
    with one of the embedding prefixes joins that family (first matching
    prefix wins, any dtype). Any other column is kept as a regular feature
    only if its dtype is an integer or float type.

    Returns:
        `(X_reg, X_emb_families, label, reg_cols, emb_family_to_cols)`:
        * `X_reg`: array of the regular columns, or None if there are none.
        * `X_emb_families`: prefix -> array, for non-empty families only.
        * `label`: the `label` column as an array, or None if absent.
        * `reg_cols`: names of the regular columns.
        * `emb_family_to_cols`: prefix -> column names (all prefixes present).
    """
    family_prefixes = ["sent_transformer_", "scibert_", "specter_", "specter2_", "ner_"]
    numeric_kinds = {
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Float32,
        pl.Float64,
    }
    column_names = df.columns
    emb_family_to_cols = {prefix: [] for prefix in family_prefixes}
    reg_cols = []
    for name, dtype in zip(column_names, df.dtypes):
        if name in ("id", "label"):
            continue
        family = next((p for p in family_prefixes if name.startswith(p)), None)
        if family is not None:
            emb_family_to_cols[family].append(name)
        elif dtype in numeric_kinds:
            reg_cols.append(name)

    label = df["label"].to_numpy() if "label" in column_names else None
    X_reg = df.select(reg_cols).to_numpy() if reg_cols else None
    X_emb_families = {
        prefix: df.select(members).to_numpy()
        for prefix, members in emb_family_to_cols.items()
        if members
    }
    return X_reg, X_emb_families, label, reg_cols, emb_family_to_cols
# Load the train/val splits, break them into feature groups and print a summary.
try:
    train_df = load_parquet_split("train")
    val_df = load_parquet_split("val")
    (
        X_reg_train,
        X_emb_train_fams,
        y_train,
        reg_cols,
        emb_family_to_cols,
    ) = split_features_reg_and_all_emb(train_df)
    X_reg_val, X_emb_val_fams, y_val, _, _ = split_features_reg_and_all_emb(val_df)
    print(f"\n📊 Data Summary:")
    print(f"  Regular features: {len(reg_cols)}")
    for fam, arr in X_emb_train_fams.items():
        print(f"  Embedding {fam}: {arr.shape[1]} dims")
    # Same report line for both splits.
    for split_name, labels in (("Train", y_train), ("Val", y_val)):
        print(
            f"  {split_name} samples: {len(labels)}, Positive: {labels.sum()}, Negative: {(labels==0).sum()}"
        )
    # The frames are no longer needed once the arrays are extracted.
    del train_df, val_df
    cleanup_memory()
    memory_usage()
except Exception as e:
    print(f"❌ Error loading data: {e}")
    raise


## 3. Feature Preprocessing: PCA

In [None]:
# PCA compression per embedding family
PCA_COMPONENTS_PER_FAMILY = {
    "sent_transformer_": 32,
    "scibert_": 32,
    "specter_": 32,
    "specter2_": 32,
    "ner_": 16,
}


@safe_operation("pca", max_retries=3, checkpoint_on_success=True)
def apply_pca_to_embeddings(
    X_emb_fams: Dict[str, np.ndarray], fit_on_train: bool = True, pca_models: Optional[Dict] = None
):
    """Reduce every embedding family with IncrementalPCA and concatenate the results.

    Args:
        X_emb_fams: Family prefix -> 2-D array (rows x embedding dims).
        fit_on_train: If true, fit a new PCA per family before transforming.
        pca_models: Previously fitted models keyed by family. Used when
            `fit_on_train` is false; if None, models are fitted on the given
            data instead.

    Returns:
        `(X_emb_combined, models)`. `X_emb_combined` is the horizontal stack
        of the reduced families (None if `X_emb_fams` is empty). `models` is
        the dict of models fitted by this call, or `pca_models` unchanged
        when nothing was fitted.

    Raises:
        KeyError: A family in `X_emb_fams` has no entry in `pca_models`.
        RuntimeError: Re-raised from fitting/transforming (OOM is reported first).

    Notes:
        * Fitting uses a random 30% row sample drawn from NumPy's global RNG,
          never fewer than `n_components` rows when that many exist. A bare
          30% sample of a small family can hold fewer rows than components,
          which would make the fit fail.
        * Models fitted because `pca_models` was None are returned rather
          than discarded.
        * Because of the `safe_operation` decorator above, every successful
          call pickles its full return value to the "pca" checkpoint, so
          successive calls overwrite each other.
    """
    fit_fraction = 0.3
    transform_chunk_rows = 5000
    must_fit = fit_on_train or pca_models is None

    reduced_blocks = []
    fitted_models = {}
    cleanup_memory()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    is_safe, mem_info = check_memory_safe(ram_threshold_gb=0.75, gpu_threshold=0.70)
    if not is_safe:
        print(f"⚠️ Memory usage high before PCA: {mem_info}")
        cleanup_memory()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    for fam, X_emb in X_emb_fams.items():
        n_rows, n_dims = X_emb.shape
        n_components = min(PCA_COMPONENTS_PER_FAMILY.get(fam, 32), n_dims)
        try:
            cleanup_memory()
            if must_fit:
                pca_kwargs = {"n_components": n_components, "batch_size": 2000}
                if IS_TORCH_PCA:
                    pca_kwargs["device"] = device
                ipca = IncrementalPCA(**pca_kwargs)

                n_fit_rows = min(n_rows, max(int(n_rows * fit_fraction), n_components))
                if n_fit_rows < n_rows:
                    print(
                        f"  Fitting PCA on subset ({n_fit_rows}/{n_rows} samples) for {fam}"
                    )
                    idx = np.random.choice(n_rows, size=n_fit_rows, replace=False)
                    # Fancy indexing already yields an independent copy.
                    X_emb_subset = X_emb[idx]
                    del idx
                    ipca.fit(X_emb_subset)
                    del X_emb_subset
                else:
                    ipca.fit(X_emb)
                cleanup_memory()
                fitted_models[fam] = ipca
            else:
                if fam not in pca_models:
                    raise KeyError(f"No fitted PCA model for embedding family {fam!r}")
                ipca = pca_models[fam]

            if n_rows > transform_chunk_rows:
                # Transform in slices to bound peak memory.
                chunks_out = []
                for i in range(0, n_rows, transform_chunk_rows):
                    chunk = X_emb[i : i + transform_chunk_rows].copy()
                    chunks_out.append(ipca.transform(chunk))
                    del chunk
                    cleanup_memory()
                    if i % (transform_chunk_rows * 5) == 0 and torch.cuda.is_available():
                        torch.cuda.empty_cache()
                X_emb_pca = np.vstack(chunks_out)
                del chunks_out
            else:
                X_emb_pca = ipca.transform(X_emb)
            reduced_blocks.append(X_emb_pca)
            del X_emb_pca
            cleanup_memory()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
        except RuntimeError as e:
            if "out of memory" in str(e).lower() or "oom" in str(e).lower():
                print(f"❌ OOM error processing {fam}")
                cleanup_memory()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
            raise

    cleanup_memory()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    X_emb_combined = np.hstack(reduced_blocks) if reduced_blocks else None
    return X_emb_combined, (fitted_models if must_fit else pca_models)
# Fit PCA on the training embeddings, reuse the fitted models for validation,
# then join the reduced embeddings with the regular features.
try:
    cleanup_memory()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    print("\n📊 Applying IncrementalPCA to embedding families...")
    for fam, fam_matrix in X_emb_train_fams.items():
        print(
            f"  {fam}: {fam_matrix.shape[1]} dims → {PCA_COMPONENTS_PER_FAMILY.get(fam, 32)} components"
        )

    X_emb_train_pca, pca_models_train = apply_pca_to_embeddings(X_emb_train_fams, fit_on_train=True)
    cleanup_memory()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Validation data is only transformed with the train-fitted models.
    X_emb_val_pca, _ = apply_pca_to_embeddings(
        X_emb_val_fams, fit_on_train=False, pca_models=pca_models_train
    )
    print(f"\n📊 After IncrementalPCA:")
    print(f"  Train embeddings: {X_emb_train_pca.shape}")
    print(f"  Val embeddings: {X_emb_val_pca.shape}")

    # Regular features (when present) go first, embeddings after.
    if X_reg_train is None:
        X_train, X_val = X_emb_train_pca, X_emb_val_pca
    else:
        X_train = np.hstack([X_reg_train, X_emb_train_pca])
        X_val = np.hstack([X_reg_val, X_emb_val_pca])
    print(f"  Combined train: {X_train.shape}")
    print(f"  Combined val: {X_val.shape}")

    del X_reg_train, X_reg_val, X_emb_train_fams, X_emb_val_fams, X_emb_train_pca, X_emb_val_pca
    cleanup_memory()
    memory_usage()
except Exception as e:
    print(f"❌ Error in PCA: {e}")
    raise


## 4. Class Imbalance Handling: SMOTETomek

In [None]:
from imblearn.combine import SMOTETomek

# Rebalance the training set with SMOTETomek. Only X_train / y_train are
# resampled; the validation arrays are untouched.
# NOTE(review): resampling runs before the scaling cell, so the neighbour
# search operates on unscaled features — confirm this ordering is intended.
print("\n📊 Checking class imbalance and applying SMOTETomek resampling...")
print(f"  Before: {len(X_train)} samples, Positive: {y_train.sum()}, Negative: {(y_train==0).sum()}")
print(f"  Imbalance ratio: {(y_train==0).sum() / max(y_train.sum(), 1):.2f}:1")
try:
    # Use the notebook-wide SEED instead of a second hard-coded literal.
    smt = SMOTETomek(random_state=SEED, sampling_strategy="auto", n_jobs=-1)
    X_train_resampled, y_train_resampled = smt.fit_resample(X_train, y_train)
    print(f"  After: {len(X_train_resampled)} samples, Positive: {y_train_resampled.sum()}, Negative: {(y_train_resampled==0).sum()}")
    print(f"  Balance ratio: {(y_train_resampled==0).sum() / max(y_train_resampled.sum(), 1):.2f}:1")
    X_train = X_train_resampled
    y_train = y_train_resampled
    del X_train_resampled, y_train_resampled
    cleanup_memory()
except Exception as e:
    # Resampling is optional: on failure the original data is kept.
    print(f"  ⚠️ SMOTETomek failed: {e}")
    print("  Continuing with original training data...")
    cleanup_memory()


## 5. Feature Scaling

In [None]:
print("\n📊 Applying Feature Scaling to combined features...")
# Statistics are learned from the training matrix only and then applied to
# both splits; the unscaled arrays are replaced in place.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
cleanup_memory()
print("  ✅ Scaling complete!")
memory_usage()


## 6. Cross-Validation & Hyperparameter Tuning

In [None]:
# Combine train and val for CV
# NOTE(review): X_train may contain SMOTETomek-synthesised rows at this point,
# so CV folds can validate on synthetic samples — confirm this is acceptable.
X_full = np.vstack([X_train, X_val])
y_full = np.hstack([y_train, y_val])
print(f"\n📊 Full dataset for CV: {X_full.shape}, labels: {y_full.shape}")
print(f"  Positive samples: {y_full.sum()}, Negative: {(y_full==0).sum()}")
# Setup 5-fold CV
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
# Legacy XGBoost-style grid. None of these keys are RandomForestClassifier
# parameters and no visible cell of this notebook reads them; the definitions
# are kept only so existing references keep resolving.
XGB_PARAM_GRID = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 0.9, 1.0],
    "colsample_bytree": [0.8, 0.9, 1.0],
    "min_child_weight": [1, 3, 5],
    "gamma": [0, 0.1, 0.2],
    "reg_alpha": [0, 0.1, 0.5],
    "reg_lambda": [1, 1.5, 2.0],
    "scale_pos_weight": [1, (y_full == 0).sum() / max((y_full == 1).sum(), 1)],
}
USE_RANDOMIZED_SEARCH = True
N_ITER_RANDOM = 50
# Report the method actually used: tuning is done by the Optuna cell, not by
# GridSearchCV / RandomizedSearchCV.
print(f"\n🔍 Hyperparameter tuning:")
print("  Method: Optuna (TPE sampler)")
print(f"  CV folds: {N_FOLDS}")
cleanup_memory()
memory_usage()


In [None]:
# Optuna hyperparameter tuning
best_params = None
best_cv_score = 0.0


def objective(trial):
    """Optuna objective: mean stratified-CV F1 of a RandomForestClassifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters below.

    Returns:
        Mean F1 (positive class) over the folds of `skf` on `X_full`/`y_full`.

    Every sampled name is a valid `RandomForestClassifier` keyword, so
    `study.best_params` can be unpacked straight into the final model.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500, step=50),
        "max_depth": trial.suggest_int("max_depth", 4, 40),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "class_weight": trial.suggest_categorical(
            "class_weight", [None, "balanced", "balanced_subsample"]
        ),
    }
    cv_scores = []
    for train_idx, val_idx in skf.split(X_full, y_full):
        X_cv_train, X_cv_val = X_full[train_idx], X_full[val_idx]
        y_cv_train, y_cv_val = y_full[train_idx], y_full[val_idx]
        # Fresh estimator per fold; n_jobs=-1 parallelises tree building.
        model = RandomForestClassifier(**params, random_state=SEED, n_jobs=-1)
        model.fit(X_cv_train, y_cv_train)
        y_pred = model.predict(X_cv_val)
        cv_scores.append(f1_score(y_cv_val, y_pred, pos_label=1, zero_division=0))
        del model
        cleanup_memory()
    return float(np.mean(cv_scores))


try:
    print("\n" + "=" * 80)
    print("Starting Hyperparameter Tuning with Optuna")
    print("=" * 80)
    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=SEED),
        study_name="modelk_randomforestclassifier"
    )
    # Load checkpoint if exists
    # A restored study keeps its earlier trials; the trials below are added on top.
    checkpoint_data = load_checkpoint("optuna_study")
    if checkpoint_data and "study" in checkpoint_data:
        print("  Resuming from checkpoint...")
        study = checkpoint_data["study"]
    start_time = time.time()
    study.optimize(objective, n_trials=50, timeout=3600, show_progress_bar=True)
    elapsed_time = time.time() - start_time
    best_params = study.best_params
    best_cv_score = study.best_value
    # Save checkpoint
    save_checkpoint("optuna_study", {"study": study, "best_params": best_params, "best_cv_score": best_cv_score})
    print(f"\n✅ Hyperparameter tuning complete ({elapsed_time/60:.1f} min)")
    print(f"  Best CV F1: {best_cv_score:.4f}")
    print(f"  Best parameters:")
    for key, value in best_params.items():
        print(f"    {key}: {value}")
    cleanup_memory()
    memory_usage()
except Exception as e:
    # Tuning is optional: fall back to the estimator's default parameters.
    print(f"❌ Error in hyperparameter tuning: {e}")
    print("⚠️ Using default parameters...")
    best_params = {}
    best_cv_score = 0.0


## 7. Threshold Tuning & Final Evaluation

In [None]:
# Tune the decision threshold on held-out validation data, then train the
# final model on the full dataset.
try:
    print("\n" + "=" * 80)
    print("Training Final Model on Full Dataset")
    print("=" * 80)
    # Use best parameters or defaults
    rf_kwargs = dict(best_params or {})

    # The threshold is chosen with a model that has NOT seen X_val. Fitting on
    # X_full first would make the validation probabilities (and the reported
    # F1) optimistic, because X_val is part of X_full.
    threshold_model = RandomForestClassifier(**rf_kwargs, random_state=SEED, n_jobs=-1)
    threshold_model.fit(X_train, y_train)
    y_val_proba = safe_prediction(threshold_model.predict_proba, X=X_val)[:, 1]
    del threshold_model
    cleanup_memory()

    # Find optimal threshold using precision-recall curve
    precision, recall, pr_thresholds = precision_recall_curve(y_val, y_val_proba)
    f1_scores_pr = 2 * (precision * recall) / (precision + recall + 1e-10)
    best_pr_idx = np.argmax(f1_scores_pr)
    # precision/recall have one more entry than pr_thresholds; guard the index.
    best_pr_threshold = pr_thresholds[best_pr_idx] if best_pr_idx < len(pr_thresholds) else 0.5
    best_pr_f1 = f1_scores_pr[best_pr_idx]
    # Manual fine-grained search in optimal region
    thresholds = np.concatenate([
        np.linspace(0.01, 0.05, 20),
        np.linspace(0.05, 0.15, 50),
        np.linspace(0.15, 0.3, 30),
        np.linspace(0.3, 0.9, 20)
    ])
    best_threshold = best_pr_threshold
    best_f1 = best_pr_f1
    for thr in thresholds:
        y_pred = (y_val_proba >= thr).astype(int)
        f1 = f1_score(y_val, y_pred, pos_label=1, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = thr
    print(f"\n✅ Final Optimal Threshold: {best_threshold:.4f}")
    print(f"✅ Final Validation F1: {best_f1:.4f}")
    # Classification report
    y_val_pred = (y_val_proba >= best_threshold).astype(int)
    print("\n📊 Classification Report:")
    print(classification_report(y_val, y_val_pred, digits=4, zero_division=0))

    # Final model for saving and test predictions: same parameters, all data.
    final_model = RandomForestClassifier(
        **rf_kwargs,
        random_state=SEED,
        n_jobs=-1,
    )
    final_model.fit(X_full, y_full)
    cleanup_memory()
    memory_usage()
except Exception as e:
    print(f"❌ Error in final training: {e}")
    raise



## 8. Save Model

In [None]:
# Bundle the fitted model with everything needed to reproduce its inputs
# (scaler, PCA models, column lists, threshold) and pickle it to disk.
try:
    model_save_path = MODEL_SAVE_DIR / "modelK_randomforestclassifier_all_features_best.pkl"
    cell_scope = locals()
    save_dict = dict(
        model=final_model,
        # Optional artefacts: stored as None when the producing cell was not run.
        scaler=cell_scope.get("scaler"),
        pca_models=cell_scope.get("pca_models_train"),
        best_params=best_params,
        best_cv_score=best_cv_score,
        best_threshold=best_threshold,
        best_f1=best_f1,
        reg_cols=reg_cols,
        emb_family_to_cols=emb_family_to_cols,
    )
    with open(model_save_path, "wb") as out_file:
        pickle.dump(save_dict, out_file)
    print(f"\n💾 Model saved to: {model_save_path}")
except Exception as e:
    # A failed save is reported but does not stop the notebook.
    print(f"❌ Error saving model: {e}")



## 9. Generate Submission

In [None]:
import re

# Matches the first "W" followed by digits.
_WORK_ID_RE = re.compile(r"W\d+")


def extract_work_id(id_value: str) -> str:
    """Extract work_id from URL or return as is if already just ID.

    Args:
        id_value: Either a bare ID starting with "W" or a string containing
            one. Non-string values are converted with `str()` first.

    Returns:
        The input unchanged when it is a string starting with "W", longer
        than one character and without "/". Otherwise the first `W<digits>`
        match, or the stringified input if there is no match.
    """
    if isinstance(id_value, str) and id_value.startswith('W') and len(id_value) > 1 and '/' not in id_value:
        return id_value
    id_str = str(id_value)
    match = _WORK_ID_RE.search(id_str)
    if match:
        return match.group(0)
    return id_str

    """Extract work_id from URL or return as is if already just ID."""    if id_value.startswith('W') and len(id_value) > 1 and '/' not in id_value:        return id_value    match = re.search(r'W\d+', id_value)    if match:        return match.group(0)    return id_value# Load test data and generate predictionstry:    print("\n" + "=" * 80)    print("Generating Test Predictions")    print("=" * 80)    test_df = load_parquet_split("test")    test_ids = test_df["id"].to_numpy()    # Process test data same as train    X_reg_test, X_emb_test_fams, _, _, _ = split_features_reg_and_all_emb(test_df)    del test_df    # Apply PCA    X_emb_test_pca, _ = apply_pca_to_embeddings(        X_emb_test_fams, fit_on_train=False, pca_models=pca_models_train    )    # Combine    if X_reg_test is not None:        X_test = np.hstack([X_reg_test, X_emb_test_pca])    else:        X_test = X_emb_test_pca    del X_reg_test, X_emb_test_fams, X_emb_test_pca    cleanup_memory()    # Scale    if "scaler" in locals():        X_test = scaler.transform(X_test)    # Predict in chunks for OOM protection    chunk_size = 10000    if X_test.shape[0] > chunk_size:        print(f"  Predicting in chunks (size={chunk_size}) for OOM protection...")        y_test_proba_chunks = []        for i in range(0, X_test.shape[0], chunk_size):            chunk_proba = safe_prediction(final_model.predict_proba, X=X_test[i : i + chunk_size])[:, 1]            y_test_proba_chunks.append(chunk_proba)            del chunk_proba            cleanup_memory()        y_test_proba = np.concatenate(y_test_proba_chunks)        del y_test_proba_chunks    else:        y_test_proba = safe_prediction(final_model.predict_proba, X=X_test)[:, 1]    y_test_pred = (y_test_proba >= best_threshold).astype(int)    # Create submission using Polars    work_ids = np.array([extract_work_id(str(id_val)) for id_val in test_ids])    submission_df = pl.DataFrame({"work_id": work_ids, "label": y_test_pred})    submission_path = SUBMISSION_DIR / 
"submission_modelk.csv"    submission_df.write_csv(submission_path)    print(f"\n✅ Submission saved to: {submission_path}")    print(f"  Test predictions: {len(y_test_pred)}, Positive: {y_test_pred.sum()}, Negative: {(y_test_pred==0).sum()}")    cleanup_memory()    memory_usage()except Exception as e:    print(f"❌ Error generating submission: {e}")    raise