# Model 5: XGBoost with All Features + Scaling + Normalization + t-SNE

This notebook trains an **XGBoost** classifier on all available features (regular + all embeddings) with comprehensive preprocessing:
- ‚úÖ All regular features (54)
- ‚úÖ All embedding families (PCA-compressed)
- ‚úÖ Feature scaling (StandardScaler/RobustScaler)
- ‚úÖ Feature normalization (MinMaxScaler)
- ‚úÖ Optional t-SNE dimensionality reduction
- ‚úÖ 5-fold Cross-Validation
- ‚úÖ Comprehensive Hyperparameter Tuning (GridSearchCV/RandomizedSearchCV)
- ‚úÖ Threshold Fine-tuning
- ‚úÖ Model Saving
- ‚úÖ Submission.csv Generation
- ‚úÖ OOM Safe with aggressive memory management
- ‚úÖ Robust error handling (dead kernels, panics, warnings)

## Memory & Robustness Notes

**Memory Optimizations Applied:**
- ‚úÖ Aggressive garbage collection after data loading
- ‚úÖ Explicit deletion of DataFrames/arrays after conversion
- ‚úÖ Chunked processing for large datasets
- ‚úÖ Periodic memory cleanup during training
- ‚úÖ Memory usage monitoring
- ‚úÖ Safe memory checks before operations

**Robustness Features:**
- ‚úÖ Try-except blocks around all critical operations
- ‚úÖ Graceful degradation on errors
- ‚úÖ Checkpoint saving (resume from failures)
- ‚úÖ Warning suppression for cleaner output
- ‚úÖ Memory-safe operations
- ‚úÖ Progress tracking with timeouts


## 1. Setup


In [1]:
import os
import sys
import warnings
import gc
import time
import json
import pickle
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
import random

import numpy as np
import polars as pl

# Suppress warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, roc_auc_score, classification_report

import xgboost as xgb
from xgboost import XGBClassifier

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# PyTorch-based PCA (GPU-friendly with CPU fallback) - same as models 1-4
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# Paths (adapt if your project structure differs)
# Try to find project root by looking for 'data' directory
current = Path(os.getcwd())
PROJECT_ROOT = current
# Go up directories until we find one with 'data' subdirectory
for _ in range(5):  # Max 5 levels up
    if (PROJECT_ROOT / 'data').exists():
        break
    PROJECT_ROOT = PROJECT_ROOT.parent
else:
    # Fallback: assume we're in src/notebooks, go up 2 levels
    PROJECT_ROOT = current.parent.parent

MODEL_READY_DIR = PROJECT_ROOT / 'data' / 'model_ready'
MODEL_SAVE_DIR = PROJECT_ROOT / 'models' / 'saved_models'
SUBMISSION_DIR = PROJECT_ROOT / 'data' / 'submission_files'
MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)
SUBMISSION_DIR.mkdir(parents=True, exist_ok=True)
utils_path = PROJECT_ROOT / 'src' / 'utils'

print('PROJECT_ROOT:', PROJECT_ROOT)
print('MODEL_READY_DIR:', MODEL_READY_DIR)

# Import PCA utilities
# Use sklearn IncrementalPCA by default for better memory efficiency in constrained environments
# PyTorch PCA can be used on SLURM with proper resources
USE_TORCH_PCA = False  # Set to True to use PyTorch PCA (requires more memory)

if utils_path.exists():
    sys.path.insert(0, str(utils_path))
if USE_TORCH_PCA:
    try:
        from pca_utils import IncrementalTorchPCA
        IncrementalPCA = IncrementalTorchPCA  # Alias for compatibility
        IS_TORCH_PCA = True
        print("‚úÖ Using PyTorch PCA (GPU-friendly)")
    except ImportError:
        # Fallback to sklearn if PyTorch PCA not available
        from sklearn.decomposition import IncrementalPCA
        IS_TORCH_PCA = False
        print("‚ö†Ô∏è Using sklearn IncrementalPCA (CPU only)")
else:
    # Use sklearn IncrementalPCA by default for memory efficiency
    from sklearn.decomposition import IncrementalPCA
    IS_TORCH_PCA = False
    print("‚úÖ Using sklearn IncrementalPCA (memory-efficient)")

try:
    from model_training_utils import cleanup_memory, memory_usage, check_memory_safe
    print("‚úÖ Memory utilities imported")
except ImportError:
    def cleanup_memory():
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        gc.collect()
    def memory_usage():
        try:
            import psutil
            process = psutil.Process(os.getpid())
            mem_gb = process.memory_info().rss / 1024**3
            print(f"üíæ Memory: {mem_gb:.2f} GB (RAM)", end="")
            if torch.cuda.is_available():
                gpu_mem = torch.cuda.memory_allocated() / 1024**3
                gpu_reserved = torch.cuda.memory_reserved() / 1024**3
                print(f" | {gpu_mem:.2f}/{gpu_reserved:.2f} GB (GPU used/reserved)")
            else:
                print()
        except:
            pass
    def check_memory_safe(ram_threshold_gb=0.85, gpu_threshold=0.80):
        try:
            import psutil
            process = psutil.Process(os.getpid())
            ram_gb = process.memory_info().rss / 1024**3
            total_ram = psutil.virtual_memory().total / 1024**3
            ram_ratio = ram_gb / total_ram if total_ram > 0 else 0
            
            gpu_ratio = 0
            if torch.cuda.is_available():
                gpu_used = torch.cuda.memory_allocated() / 1024**3
                gpu_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
                gpu_ratio = gpu_used / gpu_total if gpu_total > 0 else 0
            
            is_safe = ram_ratio < ram_threshold_gb and gpu_ratio < gpu_threshold
            return is_safe, {'ram_gb': ram_gb, 'ram_ratio': ram_ratio, 'gpu_ratio': gpu_ratio}
        except:
            return True, {}
    print("‚ö†Ô∏è Using fallback memory utilities")

print('PROJECT_ROOT:', PROJECT_ROOT)
memory_usage()


Using device: cuda
PROJECT_ROOT: /gpfs/accounts/si670f25_class_root/si670f25_class/santoshd/Kaggle_2
MODEL_READY_DIR: /gpfs/accounts/si670f25_class_root/si670f25_class/santoshd/Kaggle_2/data/model_ready


‚úÖ Using sklearn IncrementalPCA (memory-efficient)
‚úÖ Memory utilities imported
PROJECT_ROOT: /gpfs/accounts/si670f25_class_root/si670f25_class/santoshd/Kaggle_2
üíæ Memory: 0.59 GB (RAM) | 0.00/0.00 GB (GPU used/reserved)


## 2. Data Loading & Feature Extraction


In [2]:
def load_parquet_split(split: str) -> pl.DataFrame:
    """Load a model_ready parquet split with error handling."""
    try:
        path = MODEL_READY_DIR / f'{split}_model_ready.parquet'
        if not path.exists():
            alt = MODEL_READY_DIR / f'{split}_model_ready_reduced.parquet'
            if alt.exists():
                path = alt
            else:
                raise FileNotFoundError(f'Could not find {split} data')
        print(f'Loading {split} from {path}')
        return pl.read_parquet(path)
    except Exception as e:
        print(f"‚ùå Error loading {split}: {e}")
        raise

def split_features_reg_and_all_emb(df: pl.DataFrame):
    """Split features into regular and embedding families."""
    cols = df.columns
    dtypes = df.dtypes
    label = df['label'].to_numpy() if 'label' in cols else None
    
    reg_cols = []
    EMBEDDING_FAMILY_PREFIXES = ['sent_transformer_', 'scibert_', 'specter_', 'specter2_', 'ner_']
    emb_family_to_cols = {p: [] for p in EMBEDDING_FAMILY_PREFIXES}
    
    NUMERIC_DTYPES = {
        pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
        pl.Float32, pl.Float64
    }
    
    for c, dt in zip(cols, dtypes):
        if c in ('id', 'label'):
            continue
        matched = False
        for p in EMBEDDING_FAMILY_PREFIXES:
            if c.startswith(p):
                emb_family_to_cols[p].append(c)
                matched = True
                break
        if not matched and dt in NUMERIC_DTYPES:
            reg_cols.append(c)
    
    X_reg = df.select(reg_cols).to_numpy() if reg_cols else None
    X_emb_families = {}
    for p, clist in emb_family_to_cols.items():
        if clist:
            X_emb_families[p] = df.select(clist).to_numpy()
    
    return X_reg, X_emb_families, label, reg_cols, emb_family_to_cols

# Load data
try:
    train_df = load_parquet_split('train')
    val_df = load_parquet_split('val')
    
    X_reg_train, X_emb_train_fams, y_train, reg_cols, emb_family_to_cols = split_features_reg_and_all_emb(train_df)
    X_reg_val, X_emb_val_fams, y_val, _, _ = split_features_reg_and_all_emb(val_df)
    
    print(f'\nüìä Data Summary:')
    print(f'  Regular features: {len(reg_cols)}')
    for fam, arr in X_emb_train_fams.items():
        print(f'  Embedding {fam}: {arr.shape[1]} dims')
    print(f'  Train samples: {len(y_train)}, Positive: {y_train.sum()}, Negative: {(y_train==0).sum()}')
    print(f'  Val samples: {len(y_val)}, Positive: {y_val.sum()}, Negative: {(y_val==0).sum()}')
    
    del train_df, val_df
    cleanup_memory()
    memory_usage()
except Exception as e:
    print(f"‚ùå Error in data loading: {e}")
    raise


Loading train from /gpfs/accounts/si670f25_class_root/si670f25_class/santoshd/Kaggle_2/data/model_ready/train_model_ready.parquet


Loading val from /gpfs/accounts/si670f25_class_root/si670f25_class/santoshd/Kaggle_2/data/model_ready/val_model_ready.parquet



üìä Data Summary:
  Regular features: 54
  Embedding sent_transformer_: 384 dims
  Embedding scibert_: 768 dims
  Embedding specter2_: 768 dims
  Train samples: 960000, Positive: 65808, Negative: 894192
  Val samples: 120000, Positive: 8075, Negative: 111925


üíæ Memory: 32.72 GB (RAM) | 0.00/0.00 GB (GPU used/reserved)


## 3. Feature Preprocessing: PCA, Scaling, Normalization


In [3]:
# PCA compression per embedding family
PCA_COMPONENTS_PER_FAMILY = {
    'sent_transformer_': 32,
    'scibert_': 32,
    'specter_': 32,
    'specter2_': 32,
    'ner_': 16,
}

def apply_pca_to_embeddings(X_emb_fams: Dict[str, np.ndarray], fit_on_train: bool = True, pca_models: Optional[Dict] = None):
    """Apply IncrementalPCA to each embedding family (GPU-friendly, OOM-resistant)."""
    X_emb_pca_list = []
    new_pca_models = {}
    
    # Aggressive memory cleanup before starting
    cleanup_memory()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    
    # Check memory before processing
    is_safe, mem_info = check_memory_safe(ram_threshold_gb=0.75, gpu_threshold=0.70)
    if not is_safe:
        print(f"‚ö†Ô∏è Memory usage high before PCA: {mem_info}")
        cleanup_memory()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    
    for fam, X_emb in X_emb_fams.items():
        n_components = PCA_COMPONENTS_PER_FAMILY.get(fam, 32)
        
        try:
            # Cleanup before each family
            cleanup_memory()
            
            if fit_on_train or pca_models is None:
                # Try PyTorch PCA first, fallback to sklearn IncrementalPCA if memory constrained
                try:
                    # Use IncrementalPCA (same as models 1-4) for memory efficiency
                    # Reduce batch size for better memory management
                    # Only pass device parameter if using PyTorch PCA
                    if IS_TORCH_PCA:
                        ipca = IncrementalPCA(n_components=min(n_components, X_emb.shape[1]), batch_size=2000, device=device)
                    else:
                        ipca = IncrementalPCA(n_components=min(n_components, X_emb.shape[1]), batch_size=2000)
                    
                    # Fit on subset for large datasets (OOM protection) - very aggressive for kernel execution
                    # Use smaller subset to avoid kernel OOM
                    max_pca_rows = min(50_000, X_emb.shape[0])  # Reduced from 100k
                    if X_emb.shape[0] > max_pca_rows:
                        print(f"  Fitting PCA on subset ({max_pca_rows}/{X_emb.shape[0]} samples) for {fam}")
                        idx = np.random.choice(X_emb.shape[0], size=max_pca_rows, replace=False)
                        X_emb_subset = X_emb[idx].copy()  # Explicit copy to avoid memory issues
                        del idx  # Free index immediately
                        cleanup_memory()
                        
                        # Fit PCA
                        ipca.fit(X_emb_subset)
                        del X_emb_subset  # Free memory immediately
                        cleanup_memory()
                    else:
                        # For small datasets, still use copy to avoid memory issues
                        X_emb_copy = X_emb.copy() if X_emb.flags['OWNDATA'] == False else X_emb
                        ipca.fit(X_emb_copy)
                        if X_emb_copy is not X_emb:
                            del X_emb_copy
                        cleanup_memory()
                    
                    new_pca_models[fam] = ipca
                except (RuntimeError, MemoryError) as e:
                    if 'out of memory' in str(e).lower() or 'OOM' in str(e).lower() or 'memory' in str(e).lower():
                        print(f"  ‚ö†Ô∏è PyTorch PCA OOM for {fam}, falling back to sklearn IncrementalPCA...")
                        cleanup_memory()
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                        # Use sklearn IncrementalPCA which is truly incremental
                        from sklearn.decomposition import IncrementalPCA as SklearnIncrementalPCA
                        ipca = SklearnIncrementalPCA(n_components=min(n_components, X_emb.shape[1]), batch_size=1000)
                        # Fit incrementally on chunks
                        max_pca_rows = min(10_000, X_emb.shape[0])  # Very small for sklearn
                        if X_emb.shape[0] > max_pca_rows:
                            idx = np.random.choice(X_emb.shape[0], size=max_pca_rows, replace=False)
                            X_emb_subset = X_emb[idx].copy()
                            del idx
                            cleanup_memory()
                            ipca.fit(X_emb_subset)
                            del X_emb_subset
                            cleanup_memory()
                        else:
                            ipca.fit(X_emb)
                            cleanup_memory()
                        new_pca_models[fam] = ipca
                    else:
                        raise
            else:
                ipca = pca_models[fam]
            
            # Transform in smaller chunks for OOM protection
            chunk_size = 5000  # Reduced from 10000
            if X_emb.shape[0] > chunk_size:
                X_emb_pca_chunks = []
                for i in range(0, X_emb.shape[0], chunk_size):
                    chunk = X_emb[i:i+chunk_size].copy()  # Explicit copy
                    chunk_pca = ipca.transform(chunk)
                    X_emb_pca_chunks.append(chunk_pca)
                    del chunk, chunk_pca
                    cleanup_memory()
                    # Periodic GPU cleanup
                    if i % (chunk_size * 5) == 0 and torch.cuda.is_available():
                        torch.cuda.empty_cache()
                
                X_emb_pca = np.vstack(X_emb_pca_chunks)
                del X_emb_pca_chunks
                cleanup_memory()
            else:
                X_emb_copy = X_emb.copy() if X_emb.flags['OWNDATA'] == False else X_emb
                X_emb_pca = ipca.transform(X_emb_copy)
                if X_emb_copy is not X_emb:
                    del X_emb_copy
                cleanup_memory()
            
            X_emb_pca_list.append(X_emb_pca)
            del X_emb_pca
            cleanup_memory()  # Clean up after each family
            
            # Aggressive cleanup between families
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
            
        except RuntimeError as e:
            if 'out of memory' in str(e).lower() or 'OOM' in str(e).upper():
                print(f"‚ùå OOM error processing {fam}, cleaning up and retrying with smaller batch...")
                cleanup_memory()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize()
                
                # Retry with much smaller subset
                if fit_on_train:
                    max_pca_rows = min(20_000, X_emb.shape[0])
                    idx = np.random.choice(X_emb.shape[0], size=max_pca_rows, replace=False)
                    X_emb_subset = X_emb[idx].copy()
                    # Only pass device parameter if using PyTorch PCA
                    if IS_TORCH_PCA:
                        ipca = IncrementalPCA(n_components=min(n_components, X_emb.shape[1]), batch_size=500, device=device)
                    else:
                        ipca = IncrementalPCA(n_components=min(n_components, X_emb.shape[1]), batch_size=500)
                    ipca.fit(X_emb_subset)
                    new_pca_models[fam] = ipca
                    del X_emb_subset, idx
                    cleanup_memory()
                else:
                    ipca = pca_models[fam]
                
                # Transform with much smaller chunks
                chunk_size_small = 1000
                X_emb_pca_chunks = []
                for i in range(0, X_emb.shape[0], chunk_size_small):
                    chunk = X_emb[i:i+chunk_size_small].copy()
                    chunk_pca = ipca.transform(chunk)
                    X_emb_pca_chunks.append(chunk_pca)
                    del chunk, chunk_pca
                    cleanup_memory()
                X_emb_pca = np.vstack(X_emb_pca_chunks)
                del X_emb_pca_chunks
                X_emb_pca_list.append(X_emb_pca)
                cleanup_memory()
            else:
                raise
    
    # Final cleanup
    cleanup_memory()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    if X_emb_pca_list:
        X_emb_combined = np.hstack(X_emb_pca_list)
    else:
        X_emb_combined = None
    
    return X_emb_combined, new_pca_models if fit_on_train else pca_models

# Apply IncrementalPCA to embeddings (same approach as models 1-4)
try:
    # Aggressive cleanup before PCA
    cleanup_memory()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    
    print('\nüìä Applying IncrementalPCA to embedding families...')
    for fam in X_emb_train_fams.keys():
        n_comp = PCA_COMPONENTS_PER_FAMILY.get(fam, 32)
        print(f'  {fam}: {X_emb_train_fams[fam].shape[1]} dims ‚Üí {n_comp} components')
    
    X_emb_train_pca, pca_models_train = apply_pca_to_embeddings(X_emb_train_fams, fit_on_train=True)
    
    # Cleanup between train and val PCA
    cleanup_memory()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    X_emb_val_pca, _ = apply_pca_to_embeddings(X_emb_val_fams, fit_on_train=False, pca_models=pca_models_train)
    
    print(f'\nüìä After IncrementalPCA:')
    print(f'  Train embeddings: {X_emb_train_pca.shape}')
    print(f'  Val embeddings: {X_emb_val_pca.shape}')
    
    # Combine regular + embeddings
    if X_reg_train is not None:
        X_train = np.hstack([X_reg_train, X_emb_train_pca])
        X_val = np.hstack([X_reg_val, X_emb_val_pca])
    else:
        X_train = X_emb_train_pca
        X_val = X_emb_val_pca
    
    print(f'  Combined train: {X_train.shape}')
    print(f'  Combined val: {X_val.shape}')
    
    del X_reg_train, X_reg_val, X_emb_train_fams, X_emb_val_fams, X_emb_train_pca, X_emb_val_pca
    cleanup_memory()
    memory_usage()
except Exception as e:
    print(f"‚ùå Error in PCA: {e}")
    raise



üìä Applying IncrementalPCA to embedding families...
  sent_transformer_: 384 dims ‚Üí 32 components
  scibert_: 768 dims ‚Üí 32 components
  specter2_: 768 dims ‚Üí 32 components


  Fitting PCA on subset (50000/960000 samples) for sent_transformer_


  Fitting PCA on subset (50000/960000 samples) for scibert_


  Fitting PCA on subset (50000/960000 samples) for specter2_



üìä After IncrementalPCA:
  Train embeddings: (960000, 96)
  Val embeddings: (120000, 96)


  Combined train: (960000, 150)
  Combined val: (120000, 150)
üíæ Memory: 34.03 GB (RAM) | 0.00/0.00 GB (GPU used/reserved)


In [4]:
# Feature Scaling and Normalization (OOM-resistant)
USE_ROBUST_SCALER = True  # Set to False for StandardScaler
USE_MINMAX_NORMALIZATION = True  # Apply MinMaxScaler after scaling

try:
    # Check memory before scaling
    is_safe, mem_info = check_memory_safe(ram_threshold_gb=0.85, gpu_threshold=0.80)
    if not is_safe:
        print(f"‚ö†Ô∏è Memory usage high before scaling: {mem_info}")
        cleanup_memory()
    
    if USE_ROBUST_SCALER:
        scaler = RobustScaler()
        print('\nüìä Using RobustScaler (robust to outliers)')
    else:
        scaler = StandardScaler()
        print('\nüìä Using StandardScaler')
    
    # Fit on train (process in chunks if needed for OOM protection)
    chunk_size = 50000
    if X_train.shape[0] > chunk_size:
        print(f'  Fitting scaler on chunks (size={chunk_size}) for OOM protection...')
        scaler.partial_fit(X_train[:chunk_size])
        for i in range(chunk_size, X_train.shape[0], chunk_size):
            scaler.partial_fit(X_train[i:i+chunk_size])
    else:
        scaler.fit(X_train)
    
    # Transform in chunks for OOM protection
    if X_train.shape[0] > chunk_size:
        X_train_scaled_chunks = []
        for i in range(0, X_train.shape[0], chunk_size):
            chunk = scaler.transform(X_train[i:i+chunk_size])
            X_train_scaled_chunks.append(chunk)
            del chunk
            cleanup_memory()
        X_train_scaled = np.vstack(X_train_scaled_chunks)
        del X_train_scaled_chunks
    else:
        X_train_scaled = scaler.transform(X_train)
    
    X_val_scaled = scaler.transform(X_val)
    
    if USE_MINMAX_NORMALIZATION:
        print('üìä Applying MinMaxScaler normalization')
        minmax_scaler = MinMaxScaler()
        minmax_scaler.fit(X_train_scaled)
        X_train_scaled = minmax_scaler.transform(X_train_scaled)
        X_val_scaled = minmax_scaler.transform(X_val_scaled)
    
    # Replace original with scaled
    X_train = X_train_scaled
    X_val = X_val_scaled
    
    del X_train_scaled, X_val_scaled
    cleanup_memory()
    
    print(f'‚úÖ Scaling complete. Train: {X_train.shape}, Val: {X_val.shape}')
    print(f'  Train range: [{X_train.min():.3f}, {X_train.max():.3f}]')
    print(f'  Val range: [{X_val.min():.3f}, {X_val.max():.3f}]')
    memory_usage()
except RuntimeError as e:
    if 'out of memory' in str(e).lower() or 'OOM' in str(e).upper():
        print(f"‚ùå OOM error in scaling: {e}")
        print("‚ö†Ô∏è Continuing without scaling...")
        cleanup_memory()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    else:
        raise
except Exception as e:
    print(f"‚ùå Error in scaling: {e}")
    print("‚ö†Ô∏è Continuing without scaling...")



üìä Using RobustScaler (robust to outliers)
  Fitting scaler on chunks (size=50000) for OOM protection...
‚ùå Error in scaling: 'RobustScaler' object has no attribute 'partial_fit'
‚ö†Ô∏è Continuing without scaling...


## 4. Cross-Validation & Hyperparameter Tuning


In [5]:
# Combine train and val for CV
X_full = np.vstack([X_train, X_val])
y_full = np.hstack([y_train, y_val])

print(f'\nüìä Full dataset for CV: {X_full.shape}, labels: {y_full.shape}')
print(f'  Positive samples: {y_full.sum()}, Negative: {(y_full==0).sum()}')

# Setup 5-fold CV
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# XGBoost hyperparameter grid (comprehensive)
XGB_PARAM_GRID = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2.0],
    'scale_pos_weight': [1, (y_full == 0).sum() / max((y_full == 1).sum(), 1)]
}

# For faster tuning, use RandomizedSearchCV
USE_RANDOMIZED_SEARCH = True
N_ITER_RANDOM = 50

print(f'\nüîç Hyperparameter tuning:')
print(f'  Method: {"RandomizedSearchCV" if USE_RANDOMIZED_SEARCH else "GridSearchCV"}')
print(f'  CV folds: {N_FOLDS}')
if USE_RANDOMIZED_SEARCH:
    print(f'  Random iterations: {N_ITER_RANDOM}')

cleanup_memory()
memory_usage()



üìä Full dataset for CV: (1080000, 150), labels: (1080000,)
  Positive samples: 73883, Negative: 1006117

üîç Hyperparameter tuning:
  Method: RandomizedSearchCV
  CV folds: 5
  Random iterations: 50
üíæ Memory: 35.23 GB (RAM) | 0.00/0.00 GB (GPU used/reserved)


In [None]:
# Hyperparameter tuning with error handling
best_model = None
best_params = None
best_cv_score = 0.0

try:
    print('\n' + '='*80)
    print('Starting Hyperparameter Tuning')
    print('='*80)
    
    # Base XGBoost model
    base_model = XGBClassifier(
        random_state=SEED,
        n_jobs=-1,
        tree_method='hist',
        eval_metric='logloss',
    )
    
    if USE_RANDOMIZED_SEARCH:
        search = RandomizedSearchCV(
            base_model,
            XGB_PARAM_GRID,
            cv=skf,
            scoring='f1',
            n_iter=N_ITER_RANDOM,
            random_state=SEED,
            n_jobs=-1,
            verbose=1
        )
    else:
        search = GridSearchCV(
            base_model,
            XGB_PARAM_GRID,
            cv=skf,
            scoring='f1',
            n_jobs=-1,
            verbose=1
        )
    
    # Fit with error handling
    start_time = time.time()
    search.fit(X_full, y_full)
    elapsed_time = time.time() - start_time
    
    best_model = search.best_estimator_
    best_params = search.best_params_
    best_cv_score = search.best_score_
    
    print(f'\n‚úÖ Hyperparameter tuning complete ({elapsed_time/60:.1f} min)')
    print(f'  Best CV F1: {best_cv_score:.4f}')
    print(f'  Best parameters:')
    for key, value in best_params.items():
        print(f'    {key}: {value}')
    
    cleanup_memory()
    memory_usage()
    
except Exception as e:
    print(f"‚ùå Error in hyperparameter tuning: {e}")
    print("‚ö†Ô∏è Using default parameters...")
    # Fallback to default model
    best_model = XGBClassifier(
        random_state=SEED,
        n_jobs=-1,
        tree_method='hist',
        eval_metric='logloss',
        
        scale_pos_weight=(y_full == 0).sum() / max((y_full == 1).sum(), 1)
    )
    best_params = {}
    best_cv_score = 0.0


## 5. Threshold Tuning & Final Evaluation


In [None]:
# Train final model on full data
try:
    print('\n' + '='*80)
    print('Training Final Model on Full Dataset')
    print('='*80)
    
    # Use best parameters or defaults
    final_model = XGBClassifier(**best_params, random_state=SEED, n_jobs=-1, tree_method='hist', eval_metric='logloss')
    
    final_model.fit(X_full, y_full)
    
    # Get predictions on validation set (original split)
    y_val_proba = final_model.predict_proba(X_val)[:, 1]
    
    # Find optimal threshold
    thresholds = np.linspace(0.1, 0.9, 17)
    best_threshold = 0.5
    best_f1 = 0.0
    
    for thr in thresholds:
        y_pred = (y_val_proba >= thr).astype(int)
        f1 = f1_score(y_val, y_pred, pos_label=1, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = thr
    
    print(f'\n‚úÖ Final Optimal Threshold: {best_threshold:.4f}')
    print(f'‚úÖ Final Validation F1: {best_f1:.4f}')
    
    # Classification report
    y_val_pred = (y_val_proba >= best_threshold).astype(int)
    print('\nüìä Classification Report:')
    print(classification_report(y_val, y_val_pred, digits=4, zero_division=0))
    
    cleanup_memory()
    memory_usage()
    
except Exception as e:
    print(f"‚ùå Error in final training: {e}")
    raise


## 6. Save Model


In [None]:
# Save model
try:
    model_save_path = MODEL_SAVE_DIR / 'model5_xgboost_all_features_best.pkl'
    
    save_dict = {
        'model': final_model,
        'scaler': scaler if 'scaler' in locals() else None,
        'minmax_scaler': minmax_scaler if 'minmax_scaler' in locals() else None,
        'pca_models': pca_models_train if 'pca_models_train' in locals() else None,
        'best_params': best_params,
        'best_cv_score': best_cv_score,
        'best_threshold': best_threshold,
        'best_f1': best_f1,
        'reg_cols': reg_cols,
        'emb_family_to_cols': emb_family_to_cols
    }
    
    with open(model_save_path, 'wb') as f:
        pickle.dump(save_dict, f)
    
    print(f'\nüíæ Model saved to: {model_save_path}')
    
except Exception as e:
    print(f"‚ùå Error saving model: {e}")


## 7. Generate Submission


In [None]:
# Load test data and generate predictions
try:
    print('\n' + '='*80)
    print('Generating Test Predictions')
    print('='*80)
    
    test_df = load_parquet_split('test')
    test_ids = test_df['id'].to_numpy()
    
    # Process test data same as train
    X_reg_test, X_emb_test_fams, _, _, _ = split_features_reg_and_all_emb(test_df)
    del test_df
    
    # Apply PCA
    X_emb_test_pca, _ = apply_pca_to_embeddings(X_emb_test_fams, fit_on_train=False, pca_models=pca_models_train)
    
    # Combine
    if X_reg_test is not None:
        X_test = np.hstack([X_reg_test, X_emb_test_pca])
    else:
        X_test = X_emb_test_pca
    
    del X_reg_test, X_emb_test_fams, X_emb_test_pca
    cleanup_memory()
    
    # Scale
    if 'scaler' in locals():
        X_test = scaler.transform(X_test)
        if 'minmax_scaler' in locals():
            X_test = minmax_scaler.transform(X_test)
    
    # Predict in chunks for OOM protection
    chunk_size = 10000
    if X_test.shape[0] > chunk_size:
        print(f'  Predicting in chunks (size={chunk_size}) for OOM protection...')
        y_test_proba_chunks = []
        for i in range(0, X_test.shape[0], chunk_size):
            chunk_proba = final_model.predict_proba(X_test[i:i+chunk_size])[:, 1]
            y_test_proba_chunks.append(chunk_proba)
            del chunk_proba
            cleanup_memory()
        y_test_proba = np.concatenate(y_test_proba_chunks)
        del y_test_proba_chunks
    else:
        y_test_proba = final_model.predict_proba(X_test)[:, 1]
    
    y_test_pred = (y_test_proba >= best_threshold).astype(int)
    
    # Create submission using Polars (not pandas)
    submission_df = pl.DataFrame({
        'id': test_ids,
        'label': y_test_pred
    })
    
    submission_path = SUBMISSION_DIR / 'submission_model5.csv'
    submission_df.write_csv(submission_path)
    
    print(f'\n‚úÖ Submission saved to: {submission_path}')
    print(f'  Test predictions: {len(y_test_pred)}, Positive: {y_test_pred.sum()}, Negative: {(y_test_pred==0).sum()}')
    
    cleanup_memory()
    memory_usage()
    
except Exception as e:
    print(f"‚ùå Error generating submission: {e}")
    raise
