# Model 4: Regular features + all embedding families (each PCA-compressed) – Linear classifier

This notebook trains a linear PyTorch classifier on regular features plus **all available embedding families**, each
compressed separately via IncrementalPCA. This gives a "full information but simple model" that respects the
constraint of using simpler models as feature complexity grows.

**Features:**
- ✅ 5-fold Cross-Validation
- ✅ Hyperparameter Tuning (limited search space for 3-hour constraint)
- ✅ Threshold Fine-tuning
- ✅ Model Weight Saving
- ✅ Submission.csv Generation
- ✅ OOM Safe with aggressive memory management

## Memory & Scalability Notes

**Memory Optimizations Applied:**
- ✅ Aggressive garbage collection after data loading
- ✅ Explicit deletion of Polars DataFrames after numpy conversion
- ✅ Batch tensor cleanup in training/validation loops
- ✅ Periodic memory cleanup during training
- ✅ GPU cache clearing (if using GPU)
- ✅ Memory usage monitoring

**Scalability Considerations:**
- **Current dataset size**: 480 train samples, 60 val samples (very small)
- **Batch size**: 512 (configurable via `BATCH_SIZE` variable)
- **Memory footprint**: ~24 features × float32 = minimal memory usage
- **OOM Risk**: **LOW** for current dataset size, but may increase with:
  - Larger datasets (>100K samples)
  - More features (if expanded beyond 24)
  - Larger batch sizes
  
**If OOM occurs:**
1. Reduce `BATCH_SIZE` below the default of 512 (try 256 or 128)
2. Process validation in smaller chunks
3. Use gradient accumulation for effective larger batches
4. Consider using `torch.no_grad()` more aggressively
5. Monitor memory with `memory_usage()` calls

## 1. Setup

In [1]:
import os
from pathlib import Path
import random
import gc
import numpy as np
import polars as pl
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
# PyTorch-based PCA (GPU-friendly with CPU fallback)
import sys
# Seed every random number generator used in this notebook so runs are repeatable
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
_cuda_ok = torch.cuda.is_available()
if _cuda_ok:
    # Seed all visible GPUs as well
    torch.cuda.manual_seed_all(SEED)
# Compute device: the GPU when one is available, otherwise the CPU
device = torch.device('cuda') if _cuda_ok else torch.device('cpu')
print('Using device:', device)
# Locate the project root: the nearest directory (starting at the working
# directory and looking at most four levels above it) that contains 'data'.
current = Path(os.getcwd())
_root_candidates = [current, *current.parents][:5]
PROJECT_ROOT = next(
    (folder for folder in _root_candidates if (folder / 'data').exists()),
    # Nothing found: assume we are in src/notebooks and step up two levels
    current.parent.parent,
)
MODEL_READY_DIR = PROJECT_ROOT.joinpath('data', 'model_ready')
utils_path = PROJECT_ROOT.joinpath('src', 'utils')
print('PROJECT_ROOT:', PROJECT_ROOT)
print('MODEL_READY_DIR:', MODEL_READY_DIR)
# Select the IncrementalPCA implementation.
# sklearn's IncrementalPCA is the default (lower memory use in constrained
# environments); the project's PyTorch PCA is opt-in via USE_TORCH_PCA.
USE_TORCH_PCA = False  # Set to True to use PyTorch PCA (requires more memory)

# Make the project's utility modules importable
if utils_path.exists():
    sys.path.insert(0, str(utils_path))

# IS_TORCH_PCA records which backend ended up bound to the name IncrementalPCA
IS_TORCH_PCA = False
if not USE_TORCH_PCA:
    from sklearn.decomposition import IncrementalPCA
    print("‚úÖ Using sklearn IncrementalPCA (memory-efficient)")
else:
    try:
        from pca_utils import IncrementalTorchPCA
    except ImportError:
        # PyTorch PCA module not importable: fall back to sklearn
        from sklearn.decomposition import IncrementalPCA
        print("‚ö†Ô∏è Using sklearn IncrementalPCA (CPU only)")
    else:
        IncrementalPCA = IncrementalTorchPCA  # Alias so call sites stay the same
        IS_TORCH_PCA = True
        print("‚úÖ Using PyTorch PCA (GPU-friendly)")
from sklearn.metrics import (
    f1_score, classification_report, roc_auc_score,
    average_precision_score, roc_curve, precision_recall_curve
)
import matplotlib.pyplot as plt
# Memory helpers: use the shared project module when it can be imported,
# otherwise define equivalent local functions.
if utils_path.exists():
    sys.path.insert(0, str(utils_path))
try:
    from model_training_utils import cleanup_memory, memory_usage
except ImportError:
    def cleanup_memory():
        """Run garbage collection and, when CUDA is available, empty the GPU cache."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        # Collect again after the GPU cache has been released
        gc.collect()

    def memory_usage():
        """Print the process RSS and, when CUDA is available, GPU allocated/reserved memory."""
        try:
            import psutil
        except ImportError:
            print("üíæ Memory tracking requires psutil: pip install psutil")
            return
        mem_info = psutil.Process(os.getpid()).memory_info()
        print(f"üíæ Memory: {mem_info.rss / 1024**3:.2f} GB (RAM)", end="")
        if not torch.cuda.is_available():
            # Terminate the line started above
            print()
            return
        gpu_mem = torch.cuda.memory_allocated() / 1024**3
        gpu_reserved = torch.cuda.memory_reserved() / 1024**3
        print(f" | {gpu_mem:.2f}/{gpu_reserved:.2f} GB (GPU used/reserved)")

    print("‚ö†Ô∏è Using fallback memory utilities")
else:
    print("‚úÖ Memory utilities imported from shared module")


Using device: cuda
PROJECT_ROOT: /gpfs/accounts/si670f25_class_root/si670f25_class/santoshd/Kaggle_2
MODEL_READY_DIR: /gpfs/accounts/si670f25_class_root/si670f25_class/santoshd/Kaggle_2/data/model_ready


‚úÖ Using sklearn IncrementalPCA (memory-efficient)


‚úÖ Memory utilities imported from shared module


## 2. Dataset & utilities

In [2]:
class TabularDataset(Dataset):
    """In-memory dataset pairing a feature matrix with one label per row.

    Both arrays are converted to float32 copies at construction time.

    Args:
        X: Feature array; row ``i`` holds the features of sample ``i``.
        y: Label array with one entry per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        # Fail early: a mismatch would otherwise surface as an IndexError deep
        # inside a DataLoader (or silently ignore surplus labels).
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f'X and y must have the same number of samples, '
                f'got {X.shape[0]} and {y.shape[0]}'
            )
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)

    def __len__(self):
        """Return the number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]
def load_parquet_split(split: str) -> pl.DataFrame:
    """Read one model_ready parquet split (e.g. 'train', 'val', 'test').

    The ``*_model_ready_reduced.parquet`` file is preferred; the non-reduced
    ``*_model_ready.parquet`` file is used when the reduced one is absent.

    Raises:
        FileNotFoundError: If neither file exists under ``MODEL_READY_DIR``.
    """
    reduced = MODEL_READY_DIR / f'{split}_model_ready_reduced.parquet'
    if reduced.exists():
        chosen = reduced
    else:
        full = MODEL_READY_DIR / f'{split}_model_ready.parquet'
        if not full.exists():
            raise FileNotFoundError(f'Could not find {reduced} or {full}')
        chosen = full
    print(f'Loading {split} from {chosen}')
    return pl.read_parquet(chosen)
# Number of PCA components kept per embedding family, keyed by the column-name
# prefix that identifies the family.
PCA_COMPONENTS_PER_FAMILY = {
    **dict.fromkeys(
        ('sent_transformer_', 'scibert_', 'specter_', 'specter2_'),
        32,
    ),
    'ner_': 16,
}
# Column-name prefixes of the embedding families, in matching order
EMBEDDING_FAMILY_PREFIXES = list(PCA_COMPONENTS_PER_FAMILY)
def split_features_reg_and_all_emb(df: pl.DataFrame):
    """Split a frame into regular features, per-family embeddings and labels.

    Columns named 'id' or 'label' are never used as features. A column whose
    name starts with one of ``EMBEDDING_FAMILY_PREFIXES`` is assigned to the
    first matching family; any other column is a regular feature provided its
    dtype is a Polars integer or float type.

    Returns:
        A tuple ``(X_reg, X_emb_families, label, reg_cols, emb_family_to_cols)``:
        the regular-feature array, a dict of prefix -> embedding array (only
        families with at least one column), the label array (``None`` when the
        frame has no 'label' column), the regular column names, and a dict of
        prefix -> column names for every configured prefix.
    """
    numeric_kinds = {
        pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
        pl.Float32, pl.Float64
    }
    columns = df.columns
    label = df['label'].to_numpy() if 'label' in columns else None

    reg_cols = []
    emb_family_to_cols = {prefix: [] for prefix in EMBEDDING_FAMILY_PREFIXES}
    for name, dtype in zip(columns, df.dtypes):
        if name in ('id', 'label'):
            continue
        # First prefix that matches wins; None means "not an embedding column"
        family = next(
            (prefix for prefix in EMBEDDING_FAMILY_PREFIXES if name.startswith(prefix)),
            None,
        )
        if family is not None:
            emb_family_to_cols[family].append(name)
        elif dtype in numeric_kinds:
            # Non-numeric, non-embedding columns are dropped
            reg_cols.append(name)

    X_reg = df.select(reg_cols).to_numpy()
    X_emb_families = {
        family: df.select(names).to_numpy()
        for family, names in emb_family_to_cols.items()
        if names
    }
    return X_reg, X_emb_families, label, reg_cols, emb_family_to_cols
def make_dataloaders(X_train, y_train, X_val, y_val, batch_size: int = 512, val_batch_size: int = None, num_workers: int = 0):
    """
    Build the training and validation DataLoaders.

    Training batches are drawn with replacement by a WeightedRandomSampler whose
    per-sample weight is the inverse frequency of the sample's class; the
    validation loader iterates in order without shuffling.

    Args:
        X_train, y_train: Training features and labels (labels are cast to int
            to count classes).
        X_val, y_val: Validation features and labels.
        batch_size: Training batch size.
        val_batch_size: Validation batch size (falls back to batch_size if None).
        num_workers: Worker processes used by both loaders.

    Returns:
        ``(train_loader, val_loader)``.
    """
    eval_batch_size = batch_size if val_batch_size is None else val_batch_size

    # Inverse class-frequency weights; empty classes count as 1 to avoid dividing by zero
    int_labels = y_train.astype(int)
    class_sample_counts = np.bincount(int_labels)
    print('Class counts (train):', class_sample_counts)
    inverse_freq = 1.0 / np.maximum(class_sample_counts, 1)
    per_sample_weight = inverse_freq[int_labels]
    sampler = WeightedRandomSampler(
        weights=per_sample_weight,
        num_samples=len(per_sample_weight),
        replacement=True,
    )

    train_dataset = TabularDataset(X_train, y_train)
    val_dataset = TabularDataset(X_val, y_val)

    # Options shared by both loaders; pin_memory stays off to save memory
    common = dict(num_workers=num_workers, pin_memory=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler, **common)
    val_loader = DataLoader(val_dataset, batch_size=eval_batch_size, shuffle=False, **common)
    return train_loader, val_loader
# Read the train and validation splits and break them into feature groups
train_df, val_df = (load_parquet_split(split_name) for split_name in ('train', 'val'))
(
    X_reg_train,
    X_emb_train_fams,
    y_train,
    reg_cols,
    emb_family_to_cols,
) = split_features_reg_and_all_emb(train_df)
X_reg_val, X_emb_val_fams, y_val, _, _ = split_features_reg_and_all_emb(val_df)
# The numpy arrays are all we need from here on; drop the Polars frames
del train_df, val_df
cleanup_memory()
print('Regular feature count:', len(reg_cols))
for fam, arr in X_emb_train_fams.items():
    print(f'Embedding family {fam}: {arr.shape[1]} dims')
# PCA per family (OOM-resistant: capped fit size, chunked transform).
# Each family is fitted and projected exactly once. The previous version ran a
# second loop that reset both result lists and re-fitted every family without
# chunking, throwing away the first pass and its memory protection.
X_emb_train_pca_list = []
X_emb_val_pca_list = []


def _pca_transform_in_chunks(pca, X, chunk_size):
    """Project ``X`` with an already-fitted PCA, at most ``chunk_size`` rows per call."""
    n_rows = X.shape[0]
    if n_rows <= chunk_size:
        return pca.transform(X)
    return np.vstack([
        pca.transform(X[start:start + chunk_size])
        for start in range(0, n_rows, chunk_size)
    ])


# Aggressive memory cleanup before starting
cleanup_memory()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

chunk_size = 5000  # Rows projected per transform call
for fam, X_emb_train in X_emb_train_fams.items():
    n_components = PCA_COMPONENTS_PER_FAMILY.get(fam, 32)
    print(f'Fitting IncrementalPCA for family {fam} with {n_components} components')

    # Only the PyTorch implementation accepts a device argument
    if IS_TORCH_PCA:
        ipca = IncrementalPCA(n_components=n_components, batch_size=2000, device=device)
    else:
        ipca = IncrementalPCA(n_components=n_components, batch_size=2000)

    # Fit on a random subset when the family has more rows than the cap
    max_pca_rows = min(50_000, X_emb_train.shape[0])
    if X_emb_train.shape[0] > max_pca_rows:
        print(f"  Fitting PCA on subset ({max_pca_rows}/{X_emb_train.shape[0]} samples) for {fam}")
        idx = np.random.choice(X_emb_train.shape[0], size=max_pca_rows, replace=False)
        X_emb_subset = X_emb_train[idx]  # Fancy indexing already yields a copy
        del idx
        ipca.fit(X_emb_subset)
        del X_emb_subset
    else:
        ipca.fit(X_emb_train)
    cleanup_memory()

    # Project train and validation rows with the PCA fitted on train
    X_emb_val = X_emb_val_fams[fam]
    X_emb_train_pca_list.append(_pca_transform_in_chunks(ipca, X_emb_train, chunk_size))
    X_emb_val_pca_list.append(_pca_transform_in_chunks(ipca, X_emb_val, chunk_size))

    # Drop the loop-local references before moving to the next family
    del X_emb_train, X_emb_val
    cleanup_memory()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
# Assemble the final design matrices: regular features followed by the
# PCA-compressed embedding families (column-wise).
if not X_emb_train_pca_list:
    print('No embedding families found; falling back to regular features only.')
    X_train, X_val = X_reg_train, X_reg_val
    del X_reg_train, X_reg_val
    cleanup_memory()
else:
    X_emb_train_concat = np.concatenate(X_emb_train_pca_list, axis=1)
    X_emb_val_concat = np.concatenate(X_emb_val_pca_list, axis=1)
    # The per-family pieces are no longer needed
    del X_emb_train_pca_list, X_emb_val_pca_list
    cleanup_memory()
    X_train = np.concatenate((X_reg_train, X_emb_train_concat), axis=1)
    X_val = np.concatenate((X_reg_val, X_emb_val_concat), axis=1)
    # Release the inputs of the concatenation
    del X_reg_train, X_reg_val, X_emb_train_concat, X_emb_val_concat
    cleanup_memory()
print('Train shape:', X_train.shape, 'Val shape:', X_val.shape)
memory_usage()
# DataLoader settings -- lower the batch sizes if an out-of-memory error occurs
BATCH_SIZE = VAL_BATCH_SIZE = 512  # Training / validation batch size
NUM_WORKERS = 0  # 0 keeps loading in the main process (no multiprocessing memory overhead)
print(f'\nüìä DataLoader Configuration:')
print(f'   Train batch size: {BATCH_SIZE}')
print(f'   Val batch size: {VAL_BATCH_SIZE}')
print(f'   Num workers: {NUM_WORKERS} (0 = single process, saves memory)')
train_loader, val_loader = make_dataloaders(
    X_train,
    y_train,
    X_val,
    y_val,
    batch_size=BATCH_SIZE,
    val_batch_size=VAL_BATCH_SIZE,
    num_workers=NUM_WORKERS,
)
cleanup_memory()
memory_usage()


Loading train from /gpfs/accounts/si670f25_class_root/si670f25_class/santoshd/Kaggle_2/data/model_ready/train_model_ready.parquet


Loading val from /gpfs/accounts/si670f25_class_root/si670f25_class/santoshd/Kaggle_2/data/model_ready/val_model_ready.parquet


In [None]:
## 3. Model definition
class LinearClassifier(nn.Module):
    """A single affine layer mapping ``input_dim`` features to one raw logit."""

    def __init__(self, input_dim: int):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return the logits for ``x``; no sigmoid is applied here."""
        logits = self.linear(x)
        return logits
# Build the classifier for the assembled feature width and place it on the compute device
input_dim = X_train.shape[1]
model = LinearClassifier(input_dim).to(device)
print(model)
## 4. Train / validation loop
EPOCHS = 10
LR = 1e-3
# pos_weight for BCEWithLogitsLoss is the negative/positive ratio of the training labels.
# NOTE(review): train_loader already draws class-balanced batches through its
# WeightedRandomSampler; combining that with pos_weight may over-emphasise the
# positive class -- confirm this double correction is intended.
pos_count = (y_train == 1).sum()
neg_count = (y_train == 0).sum()
pos_weight_value = torch.tensor([neg_count / max(pos_count, 1)], dtype=torch.float32).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_value)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
best_val_f1 = 0.0
best_val_thr = 0.5  # Threshold that produced best_val_f1
best_state_dict = None
thresholds = np.linspace(0.1, 0.9, 17)  # Candidate decision thresholds, swept every epoch
for epoch in range(1, EPOCHS + 1):
    model.train()
    running_loss = 0.0
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device).unsqueeze(1)  # Match the (batch, 1) shape of the logits
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * xb.size(0)
        # Drop batch tensors; the full cleanup runs once per epoch below
        # (a gc pass per batch only slowed training down).
        del xb, yb, logits, loss
    avg_train_loss = running_loss / len(train_loader.dataset)
    # Validation: collect probabilities batch by batch on the CPU
    model.eval()
    all_preds = []
    all_targets = []
    batch_count = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            yb_np = yb.numpy()  # Labels never leave the CPU
            logits = model(xb)
            probs = torch.sigmoid(logits).cpu().numpy().ravel()
            all_preds.append(probs)
            all_targets.append(yb_np)
            del xb, logits, probs, yb_np
            batch_count += 1
            # Periodic cleanup during validation for large datasets
            if batch_count % 50 == 0:
                cleanup_memory()
    # Concatenate only once
    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    # Ranking metrics use the probabilities, not thresholded predictions
    roc_auc = roc_auc_score(all_targets, all_preds)
    pr_auc = average_precision_score(all_targets, all_preds)
    # Threshold tuning for F1 on the positive class
    best_epoch_f1 = 0.0
    best_thr = 0.5
    for thr in thresholds:
        preds_bin = (all_preds >= thr).astype(int)
        f1 = f1_score(all_targets, preds_bin, pos_label=1)
        if f1 > best_epoch_f1:
            best_epoch_f1 = f1
            best_thr = thr
        del preds_bin
    del all_preds, all_targets
    print(f'Epoch {epoch:02d} | train_loss={avg_train_loss:.4f} | val_f1={best_epoch_f1:.4f} @ thr={best_thr:.2f} | roc_auc={roc_auc:.4f} | pr_auc={pr_auc:.4f}')
    # Always print memory for large datasets to monitor OOM risk
    memory_usage()
    if best_epoch_f1 > best_val_f1:
        best_val_f1 = best_epoch_f1
        best_val_thr = float(best_thr)
        # state_dict() returns tensors that share storage with the live
        # parameters, and dict.copy() is shallow; clone each tensor so later
        # optimizer steps cannot overwrite the snapshot.
        best_state_dict = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    # Aggressive cleanup after each epoch (critical for large datasets)
    cleanup_memory()
print('Best val F1:', best_val_f1)
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
    # Save the best model
    MODEL_SAVE_DIR = PROJECT_ROOT / 'models' / 'saved_models'
    MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)
    model_save_path = MODEL_SAVE_DIR / 'model4_reg_plus_all_embeddings_pca_linear_best.pt'
    torch.save({
        'model_state_dict': best_state_dict,
        'input_dim': input_dim,
        'best_val_f1': best_val_f1,
        'best_threshold': best_val_thr,
        'epochs': EPOCHS,
        'learning_rate': LR,
        'pos_weight': pos_weight_value.item(),
        'pca_components_per_family': globals().get('PCA_COMPONENTS_PER_FAMILY'),
    }, model_save_path)
    print(f'\nüíæ Saved best model to: {model_save_path}')
# Final evaluation of `model` on the validation loader: classification report at
# a fixed 0.5 threshold, ROC-AUC / PR-AUC, and the corresponding curves.
print('\nValidation classification report (best model, thr=0.5 for reference):')
# Collect validation probabilities and labels batch by batch on the CPU
model.eval()
all_preds = []
all_targets = []
batch_count = 0
with torch.no_grad():
    for xb, yb in val_loader:
        xb = xb.to(device)
        yb_np = yb.numpy()  # Labels stay on the CPU
        logits = model(xb)
        # Sigmoid turns logits into probabilities; ravel flattens (batch, 1) to (batch,)
        probs = torch.sigmoid(logits).cpu().numpy().ravel()
        all_preds.append(probs)
        all_targets.append(yb_np)
        # Clean up batch tensors immediately
        del xb, logits, probs, yb_np
        batch_count += 1
        # Periodic cleanup during validation for large datasets
        if batch_count % 50 == 0:  # Every 50 batches
            cleanup_memory()
# Concatenate only once
all_preds = np.concatenate(all_preds)
all_targets = np.concatenate(all_targets)
# Generate predictions for classification report (threshold 0.5)
preds_bin = (all_preds >= 0.5).astype(int)
print(classification_report(all_targets, preds_bin, digits=4, zero_division=0))
# ROC-AUC and Precision-Recall AUC are computed from the probabilities
roc_auc = roc_auc_score(all_targets, all_preds)
pr_auc = average_precision_score(all_targets, all_preds)
print(f'\nROC-AUC: {roc_auc:.4f}')
print(f'Precision-Recall AUC: {pr_auc:.4f}')
# Plot ROC curve (the dashed diagonal is the chance-level reference)
fpr, tpr, _ = roc_curve(all_targets, all_preds)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Plot Precision-Recall curve
precision, recall, _ = precision_recall_curve(all_targets, all_preds)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'PR curve (AUC = {pr_auc:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
# Final cleanup
del all_preds, all_targets, preds_bin
cleanup_memory()
memory_usage()


In [None]:
import sys
import time
import json
# Make the shared training utilities importable and load the helpers used by
# the cross-validation cells.
utils_path = PROJECT_ROOT / 'src' / 'utils'
if utils_path.exists():
    sys.path.insert(0, str(utils_path))
try:
    from model_training_utils import (
        find_optimal_threshold, cross_validate,
        generate_submission, save_model_weights,
        stratified_kfold_splits
    )
    print("‚úÖ Utility functions imported")
except ImportError as e:
    print(f"‚ö†Ô∏è Could not import utilities: {e}")
    print("Will use inline implementations")

    # The message above used to be an empty promise: no fallback existed, so
    # the CV cell failed with NameError. These stand-ins follow the call sites
    # in this notebook; cross_validate, generate_submission and
    # save_model_weights have no inline equivalent and stay undefined.
    def find_optimal_threshold(y_true, y_prob, thresholds=None):
        """Return ``(threshold, f1)`` maximising positive-class F1 over ``thresholds``.

        ``thresholds`` defaults to 17 evenly spaced values in [0.1, 0.9]. When
        no candidate yields a positive F1 the result is ``(0.5, 0.0)``.
        """
        if thresholds is None:
            thresholds = np.linspace(0.1, 0.9, 17)
        y_prob = np.asarray(y_prob)
        best_thr, best_f1 = 0.5, 0.0
        for candidate in thresholds:
            score = f1_score(
                y_true, (y_prob >= candidate).astype(int),
                pos_label=1, zero_division=0,
            )
            if score > best_f1:
                best_thr, best_f1 = candidate, score
        return float(best_thr), float(best_f1)

    def stratified_kfold_splits(y, n_splits=5, shuffle=True, random_state=42):
        """Return a list of ``(train_idx, val_idx)`` index arrays, stratified on ``y``."""
        from sklearn.model_selection import StratifiedKFold

        splitter = StratifiedKFold(
            n_splits=n_splits,
            shuffle=shuffle,
            # sklearn rejects a random_state when shuffling is disabled
            random_state=random_state if shuffle else None,
        )
        # Only the labels matter for stratification; pass a placeholder for X
        return list(splitter.split(np.zeros(len(y)), y))
# Build the pooled train+val dataset used for cross-validation.
# PCA is deliberately not applied here; it is fitted inside each fold.
train_df_full = load_parquet_split('train')
val_df_full = load_parquet_split('val')
(
    X_reg_train_full,
    X_emb_train_fams_full,
    y_train_full,
    reg_cols_full,
    emb_family_to_cols_full,
) = split_features_reg_and_all_emb(train_df_full)
X_reg_val_full, X_emb_val_fams_full, y_val_full, _, _ = split_features_reg_and_all_emb(val_df_full)
# Stack validation rows underneath the training rows
X_reg_full = np.vstack((X_reg_train_full, X_reg_val_full))
X_emb_fams_full = {
    family: np.vstack((train_part, X_emb_val_fams_full[family]))
    for family, train_part in X_emb_train_fams_full.items()
}
y_full = np.hstack((y_train_full, y_val_full))
# Free the per-split pieces now that the pooled arrays exist
del train_df_full, val_df_full, X_reg_train_full, X_reg_val_full, X_emb_train_fams_full, X_emb_val_fams_full, y_train_full, y_val_full
cleanup_memory()
print(f"\nüìä Full dataset for CV:")
print(f"  Regular features: {X_reg_full.shape}")
for fam, arr in X_emb_fams_full.items():
    print(f"  Embedding family {fam}: {arr.shape}")
print(f"  Labels: {y_full.shape}")
# Deliberately small search space so the whole CV fits the 3-hour budget
HYPERPARAMETER_GRID = [
    dict(lr=1e-3, batch_size=512),
    dict(lr=5e-4, batch_size=512),
    dict(lr=1e-3, batch_size=256),
]
print(f"\nüîç Hyperparameter grid ({len(HYPERPARAMETER_GRID)} combinations):")
# Number the combinations from 1 for display
for i, hp in enumerate(HYPERPARAMETER_GRID, start=1):
    print(f"  {i}. LR={hp['lr']}, Batch={hp['batch_size']}")
cleanup_memory()
memory_usage()


In [None]:
# Helper function to prepare features with PCA for all families per fold
def prepare_features_all_emb_pca(X_reg_fold_train, X_emb_fams_fold_train, X_reg_fold_val, X_emb_fams_fold_val):
    """Fit one PCA per embedding family on the fold's training rows and project both splits.

    Args:
        X_reg_fold_train: Regular (non-embedding) training features.
        X_emb_fams_fold_train: Dict of family prefix -> training embedding array.
        X_reg_fold_val: Regular validation features.
        X_emb_fams_fold_val: Dict of family prefix -> validation embedding array.
            A family missing here is skipped on the validation side.

    Returns:
        ``(X_train_combined, X_val_combined, pca_models)``: the regular features
        followed column-wise by every family's PCA projection, plus a dict of
        family prefix -> fitted PCA. When there are no training families the
        regular features are returned unchanged with an empty dict.
    """
    X_emb_train_pca_list = []
    X_emb_val_pca_list = []
    pca_models = {}
    for fam, X_emb_train in X_emb_fams_fold_train.items():
        n_components = PCA_COMPONENTS_PER_FAMILY.get(fam, 32)
        # Everything below must run once per family. It used to sit outside the
        # loop (and partly inside the `else`), so only the last family was
        # projected -- and none at all with the PyTorch backend.
        # Only the PyTorch implementation accepts a device argument.
        if IS_TORCH_PCA:
            ipca = IncrementalPCA(n_components=n_components, batch_size=2000, device=device)
        else:
            ipca = IncrementalPCA(n_components=n_components, batch_size=2000)
        # Fit on at most 50k randomly chosen training rows
        max_pca_rows = min(50_000, X_emb_train.shape[0])
        idx = np.random.choice(X_emb_train.shape[0], size=max_pca_rows, replace=False)
        ipca.fit(X_emb_train[idx])
        X_emb_train_pca = ipca.transform(X_emb_train)
        X_emb_val = X_emb_fams_fold_val.get(fam)
        X_emb_val_pca = ipca.transform(X_emb_val) if X_emb_val is not None else None
        X_emb_train_pca_list.append(X_emb_train_pca)
        if X_emb_val_pca is not None:
            X_emb_val_pca_list.append(X_emb_val_pca)
        pca_models[fam] = ipca
        del X_emb_train_pca, X_emb_val_pca
        cleanup_memory()
    if not X_emb_train_pca_list:
        return X_reg_fold_train, X_reg_fold_val, {}
    X_emb_train_concat = np.concatenate(X_emb_train_pca_list, axis=1)
    X_emb_val_concat = np.concatenate(X_emb_val_pca_list, axis=1) if X_emb_val_pca_list else None
    X_train_combined = np.concatenate([X_reg_fold_train, X_emb_train_concat], axis=1)
    X_val_combined = np.concatenate([X_reg_fold_val, X_emb_val_concat], axis=1) if X_emb_val_concat is not None else X_reg_fold_val
    del X_emb_train_pca_list, X_emb_val_pca_list, X_emb_train_concat, X_emb_val_concat
    return X_train_combined, X_val_combined, pca_models
# 5-fold CV with hyperparameter tuning.
# Hyperparameters are ranked by mean fold F1. The weights, PCA models and
# threshold that get kept all come from the best single fold of the winning
# hyperparameter set, so the three always belong together.
best_hyperparams = None
best_cv_score = 0.0
best_model_state = None
best_threshold = 0.5
best_pca_models = None
last_input_dim = None  # Feature width seen by the fold models (for the fallback below)
cv_start_time = time.time()
# Materialise the splits: they are iterated once per hyperparameter set, which
# would silently yield nothing on later sets if the helper returned a generator.
cv_splits = list(stratified_kfold_splits(y_full, n_splits=5, shuffle=True, random_state=42))
for hp_idx, hyperparams in enumerate(HYPERPARAMETER_GRID, 1):
    print(f"\n{'='*80}")
    print(f"Hyperparameter Set {hp_idx}/{len(HYPERPARAMETER_GRID)}: {hyperparams}")
    print(f"{'='*80}")
    BATCH_SIZE = hyperparams['batch_size']
    VAL_BATCH_SIZE = hyperparams['batch_size']
    fold_results = []
    # Best fold within this hyperparameter set
    best_fold_f1 = 0.0
    best_fold_model_state = None
    best_fold_pca_models = None
    best_fold_threshold = 0.5
    best_fold_idx = -1
    for fold_idx, (train_idx, val_idx) in enumerate(cv_splits, 1):
        print(f"\n  Fold {fold_idx}/5")
        # Split data for this fold
        X_reg_fold_train = X_reg_full[train_idx]
        X_reg_fold_val = X_reg_full[val_idx]
        X_emb_fams_fold_train = {fam: arr[train_idx] for fam, arr in X_emb_fams_full.items()}
        X_emb_fams_fold_val = {fam: arr[val_idx] for fam, arr in X_emb_fams_full.items()}
        y_fold_train = y_full[train_idx]
        y_fold_val = y_full[val_idx]
        # PCA is fitted on the fold's training rows only (no leakage from the fold's val rows)
        X_fold_train, X_fold_val, fold_pca_models = prepare_features_all_emb_pca(
            X_reg_fold_train, X_emb_fams_fold_train, X_reg_fold_val, X_emb_fams_fold_val
        )
        last_input_dim = X_fold_train.shape[1]
        # Create dataloaders
        train_loader_fold, val_loader_fold = make_dataloaders(
            X_fold_train, y_fold_train, X_fold_val, y_fold_val,
            batch_size=BATCH_SIZE,
            val_batch_size=VAL_BATCH_SIZE,
            num_workers=NUM_WORKERS
        )
        # pos_weight = negative/positive ratio of the fold's training labels
        pos_count = (y_fold_train == 1).sum()
        neg_count = (y_fold_train == 0).sum()
        pos_weight_value = torch.tensor([neg_count / max(pos_count, 1)], dtype=torch.float32).to(device)
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_value)
        # Create and train model
        model_fold = LinearClassifier(input_dim=X_fold_train.shape[1]).to(device)
        optimizer = torch.optim.Adam(model_fold.parameters(), lr=hyperparams['lr'])
        # Train with early stopping: give up after 3 epochs without a better val F1
        best_val_f1_fold = 0.0
        best_thr_fold = 0.5
        best_state_fold = None
        patience_counter = 0
        for epoch in range(1, 16):  # Max 15 epochs
            model_fold.train()
            running_loss = 0.0
            for xb, yb in train_loader_fold:
                xb = xb.to(device)
                yb = yb.to(device).unsqueeze(1)
                optimizer.zero_grad()
                logits = model_fold(xb)
                loss = criterion(logits, yb)
                loss.backward()
                optimizer.step()
                running_loss += loss.item() * xb.size(0)
                del xb, yb, logits, loss
            # Validation
            model_fold.eval()
            all_preds = []
            all_targets = []
            with torch.no_grad():
                for xb, yb in val_loader_fold:
                    xb = xb.to(device)
                    yb_np = yb.numpy()
                    logits = model_fold(xb)
                    probs = torch.sigmoid(logits).cpu().numpy().ravel()
                    all_preds.append(probs)
                    all_targets.append(yb_np)
                    del xb, logits, probs, yb_np
            all_preds = np.concatenate(all_preds)
            all_targets = np.concatenate(all_targets)
            val_thr, val_f1 = find_optimal_threshold(all_targets, all_preds)
            del all_preds, all_targets
            if val_f1 > best_val_f1_fold:
                best_val_f1_fold = val_f1
                best_thr_fold = val_thr
                # Clone the tensors: state_dict() shares storage with the live
                # parameters and dict.copy() is shallow, so without cloning the
                # snapshot would follow every later optimizer step.
                best_state_fold = {
                    name: tensor.detach().clone()
                    for name, tensor in model_fold.state_dict().items()
                }
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= 3:
                    break
            cleanup_memory()
        fold_results.append({'fold': fold_idx, 'f1': best_val_f1_fold})
        if best_val_f1_fold > best_fold_f1:
            best_fold_f1 = best_val_f1_fold
            best_fold_model_state = best_state_fold
            best_fold_pca_models = fold_pca_models
            best_fold_threshold = best_thr_fold
            best_fold_idx = fold_idx
        cleanup_memory()
    mean_f1 = np.mean([r['f1'] for r in fold_results])
    std_f1 = np.std([r['f1'] for r in fold_results])
    print(f"\n  üìä CV Results: Mean F1 = {mean_f1:.4f} ¬± {std_f1:.4f}")
    if mean_f1 > best_cv_score:
        best_cv_score = mean_f1
        best_hyperparams = hyperparams
        # Commit weights, PCA models and threshold together, and only for the
        # winning hyperparameter set.
        best_model_state = best_fold_model_state
        best_pca_models = best_fold_pca_models
        best_threshold = best_fold_threshold
        print(f"  ‚úÖ New best!")
cv_time = time.time() - cv_start_time
# Verify CV completed successfully
if best_model_state is None:
    print("‚ö†Ô∏è WARNING: CV loop completed but best_model_state is None!")
    print("This may indicate all hyperparameter combinations failed or no improvement was found.")
    print("Creating a default model for threshold tuning...")
    # The fallback needs the feature width the fold models used (regular + PCA
    # columns), not just the regular-feature width.
    fallback_input_dim = last_input_dim if last_input_dim is not None else X_reg_full.shape[1]
    temp_model = LinearClassifier(input_dim=fallback_input_dim).to(device)
    best_model_state = temp_model.state_dict()
    best_hyperparams = HYPERPARAMETER_GRID[0] if HYPERPARAMETER_GRID else {"lr": 1e-3, "batch_size": 512}
    best_cv_score = 0.0
    best_threshold = 0.5
print(f"\n{'='*80}")
print(f"‚úÖ CV Complete (Time: {cv_time/60:.1f} min)")
print(f"Best Hyperparameters: {best_hyperparams}")
print(f"Best CV F1: {best_cv_score:.4f}")
print(f"{'='*80}")
memory_usage()


## 5. Final Threshold Tuning and Model Saving


In [None]:
# Prepare final model with best hyperparameters:
# reload the train/val splits, refit one PCA per embedding family on the train
# split, rebuild the feature matrices, then load the CV-selected weights.
train_df_final = load_parquet_split('train')
val_df_final = load_parquet_split('val')
X_reg_train_final, X_emb_train_fams_final, y_train_final, _, _ = split_features_reg_and_all_emb(train_df_final)
X_reg_val_final, X_emb_val_fams_final, y_val_final, _, _ = split_features_reg_and_all_emb(val_df_final)
# Fit PCA on the training split for each family (capped at 50k random rows)
X_emb_train_pca_list = []
X_emb_val_pca_list = []
ipca_models_final = {}  # family prefix -> fitted PCA
for fam, X_emb_train in X_emb_train_fams_final.items():
    n_components = PCA_COMPONENTS_PER_FAMILY.get(fam, 32)
    # IncrementalPCA is whichever backend the setup cell selected.
    # Only pass device parameter if using PyTorch PCA
    if IS_TORCH_PCA:
        ipca = IncrementalPCA(n_components=n_components, batch_size=2000, device=device)
    else:
        ipca = IncrementalPCA(n_components=n_components, batch_size=2000)
    max_pca_rows = min(50_000, X_emb_train.shape[0])  # Reduced from 120k
    # Random row subset (a full permutation when the family has <= 50k rows)
    idx = np.random.choice(X_emb_train.shape[0], size=max_pca_rows, replace=False)
    ipca.fit(X_emb_train[idx])
    X_emb_train_pca = ipca.transform(X_emb_train)
    X_emb_val = X_emb_val_fams_final[fam]
    X_emb_val_pca = ipca.transform(X_emb_val)
    X_emb_train_pca_list.append(X_emb_train_pca)
    X_emb_val_pca_list.append(X_emb_val_pca)
    ipca_models_final[fam] = ipca
    del X_emb_train_pca, X_emb_val_pca
    cleanup_memory()
# Final matrices: regular features followed by the PCA-compressed families
if X_emb_train_pca_list:
    X_emb_train_concat = np.concatenate(X_emb_train_pca_list, axis=1)
    X_emb_val_concat = np.concatenate(X_emb_val_pca_list, axis=1)
    X_train_final = np.concatenate([X_reg_train_final, X_emb_train_concat], axis=1)
    X_val_final = np.concatenate([X_reg_val_final, X_emb_val_concat], axis=1)
    del X_reg_train_final, X_reg_val_final, X_emb_train_pca_list, X_emb_val_pca_list, X_emb_train_concat, X_emb_val_concat
else:
    # No embedding families: use the regular features alone
    X_train_final = X_reg_train_final
    X_val_final = X_reg_val_final
    del X_reg_train_final, X_reg_val_final
del train_df_final, val_df_final, X_emb_train_fams_final, X_emb_val_fams_final
cleanup_memory()
# Create final model
final_model = LinearClassifier(input_dim=X_train_final.shape[1]).to(device)
if best_model_state is None:
    raise ValueError("best_model_state is None. CV loop may have failed. Check CV cell execution.")
# NOTE(review): best_model_state was trained in a CV fold on features projected
# by that fold's PCA models, while the features here come from PCA models refit
# above on the train split. The loaded weights may not correspond to these
# components (and load_state_dict fails if the widths differ) -- confirm, or
# reuse the fold's PCA models / retrain on X_train_final.
final_model.load_state_dict(best_model_state)
# Final threshold tuning
print("\n" + "="*80)
print("Final Threshold Tuning on Validation Set")
print("="*80)
# Reuse the batch size of the best hyperparameter set for both loaders
BATCH_SIZE = best_hyperparams['batch_size']
VAL_BATCH_SIZE = best_hyperparams['batch_size']
train_loader_final, val_loader_final = make_dataloaders(
    X_train_final, y_train_final, X_val_final, y_val_final,
    batch_size=BATCH_SIZE,
    val_batch_size=VAL_BATCH_SIZE,
    num_workers=NUM_WORKERS
)
final_model.eval()
all_val_preds = []
all_val_targets = []
with torch.no_grad():
    for xb, yb in val_loader_final:
        xb = xb.to(device)
        yb_np = yb.numpy()
        logits = final_model(xb)
        probs = torch.sigmoid(logits).cpu().numpy().ravel()
        all_val_preds.append(probs)
        all_val_targets.append(yb_np)
        del xb, logits, probs, yb_np
all_val_preds = np.concatenate(all_val_preds)
all_val_targets = np.concatenate(all_val_targets)
final_threshold, final_f1 = find_optimal_threshold(all_val_targets, all_val_preds)
print(f"‚úÖ Final Optimal Threshold: {final_threshold:.4f}")
print(f"‚úÖ Final Validation F1: {final_f1:.4f}")
# Save model weights
MODEL_SAVE_DIR = PROJECT_ROOT / 'models' / 'saved_models'
MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)
model_save_path = MODEL_SAVE_DIR / 'model4_reg_plus_all_embeddings_pca_linear_best.pt'
save_model_weights(
    final_model,
    model_save_path,
    metadata={
        'input_dim': X_train_final.shape[1],
        'best_cv_f1': best_cv_score,
        'best_hyperparams': best_hyperparams,
        'final_threshold': final_threshold,
        'final_val_f1': final_f1,
        'pca_components_per_family': PCA_COMPONENTS_PER_FAMILY
    }
)
cleanup_memory()
memory_usage()


## 6. Generate Submission.csv


In [None]:
# Load the test split, project its embeddings with the PCA models fitted on the
# training data, score it with the final model and write the submission file.
print("\n" + "="*80)
print("Generating Submission")
print("="*80)
test_df = load_parquet_split('test')
test_ids = test_df['id'].to_numpy()
X_reg_test, X_emb_test_fams, _, _, _ = split_features_reg_and_all_emb(test_df)

# The model's input columns follow the order in which the PCA models were
# fitted, so iterate over `ipca_models_final` (not over the test dict) to keep
# the test columns aligned with training. A family that is absent from the test
# split would otherwise be skipped silently and shift every later column.
missing_fams = [fam for fam in ipca_models_final if fam not in X_emb_test_fams]
if missing_fams:
    raise ValueError(
        f"Test split is missing embedding families used in training: {missing_fams}"
    )
X_emb_test_pca_list = [
    ipca_models_final[fam].transform(X_emb_test_fams[fam])
    for fam in ipca_models_final
]

# Assemble [regular features | family 1 PCA | family 2 PCA | ...] in one copy.
if X_emb_test_pca_list:
    X_test = np.concatenate([X_reg_test, *X_emb_test_pca_list], axis=1)
else:
    X_test = X_reg_test
del test_df, X_reg_test, X_emb_test_fams, X_emb_test_pca_list
cleanup_memory()
print(f"Test set shape: {X_test.shape}")

# Catch a feature-width mismatch here with a clear message instead of letting
# it surface as a shape error inside the model's forward pass.
if X_test.shape[1] != X_train_final.shape[1]:
    raise ValueError(
        f"Test feature width {X_test.shape[1]} does not match "
        f"training feature width {X_train_final.shape[1]}"
    )

# The dataset needs targets; zeros are used as placeholders for the unlabeled
# test rows.
test_dataset = TabularDataset(X_test, np.zeros(len(X_test)))
test_loader = DataLoader(
    test_dataset,
    batch_size=VAL_BATCH_SIZE,
    shuffle=False,  # keep row order aligned with test_ids
    num_workers=NUM_WORKERS,
    pin_memory=False
)
SUBMISSION_DIR = PROJECT_ROOT / 'data' / 'submission_files'
SUBMISSION_DIR.mkdir(parents=True, exist_ok=True)
submission_path = SUBMISSION_DIR / 'submission_model4.csv'
generate_submission(
    final_model,
    test_loader,
    test_ids,
    device,
    final_threshold,
    submission_path
)
cleanup_memory()
memory_usage()
print("\n‚úÖ All done!")
