In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Vulnerable Smart Contract Detection Using ML/DL

This notebook builds a binary classifier for Solidity smart contracts using:
- **CodeBERT embeddings** for contract source code (microsoft/codebert-base)
- **Bytecode-derived tabular features**
- **Multiple MLP variants** with ensemble approach

## Key Improvements:
- ✓ **Embedding Caching**: CodeBERT embeddings cached to `/kaggle/working` for faster re-runs
- ✓ **Reproducibility**: Random seeds set for deterministic behavior
- ✓ **SWA (Stochastic Weight Averaging)**: Proper implementation with DataLoader
- ✓ **Consistent Evaluation**: All models output logits; sigmoid applied only during evaluation
- ✓ **Mini-batch Training**: All training uses DataLoader for scalability

## Usage:
- First run: Computes and caches embeddings
- Subsequent runs: Loads embeddings from cache (much faster!)
- To recompute: delete `/kaggle/working/codebert_embeddings.npy` (and its companion `/kaggle/working/ids.npy`)

In [None]:
import pandas as pd
# Read the "Secure" and "Vulnerable" CSV files into two separate frames.
secure_df, vulnerable_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/bccc-vulscs-2023/BCCC-VolSCs-2023_Secure.csv",
        "/kaggle/input/bccc-vulscs-2023/BCCC-VolSCs-2023_Vulnerable.csv",
    )
)

In [None]:
# Stack both frames into one table with a fresh 0..N-1 index and report the row count.
df = pd.concat((secure_df, vulnerable_df), ignore_index=True)
print(df.shape[0])

In [None]:
def _read_contract_source(hash_id):
    """Return the UTF-8 text of the ``.sol`` file named after ``hash_id``."""
    with open(f"/kaggle/input/contractcodes/source/{hash_id}.sol", 'r', encoding='utf-8') as handle:
        return handle.read()


# One source string per row, in the same order as df['hash_id'].
contract_codes = [_read_contract_source(hash_id) for hash_id in df['hash_id']]
df['contract_code'] = contract_codes

In [None]:
# Show the first rows of the combined frame (plain-text rendering via print).
print(df.head())

In [None]:
# Set random seeds for reproducibility
import random
import numpy as np
import torch

def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU plus
    all CUDA devices), then puts cuDNN into deterministic, non-benchmarking
    mode and prints a confirmation line.

    Args:
        seed: Integer seed handed to each generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    print(f"✓ Random seed set to {seed} for reproducibility")

set_seed(42)

In [None]:
import torch
# Print the installed PyTorch version string.
print(torch.__version__)

In [None]:
from torch.utils.data import TensorDataset, DataLoader
import os

def make_loaders(X_train, y_train, X_val, y_val, batch_size=128):
    """Create DataLoaders for training and validation.

    Inputs that are not already ``torch.Tensor`` objects are copied into
    float32 tensors; tensors are passed through unchanged. Label arrays with
    more than one dimension are squeezed so that each dataset yields
    ``(features, label)`` pairs with a per-sample label.

    Args:
        X_train, X_val: Feature matrices (array-like or tensor), one row per sample.
        y_train, y_val: Labels (array-like or tensor), one entry per sample.
        batch_size: Mini-batch size used by both loaders.

    Returns:
        ``(train_loader, val_loader)``. The training loader shuffles every
        epoch; the validation loader keeps the original order. Neither drops
        the last partial batch.
    """
    def _as_float_tensor(data):
        # Existing tensors are kept as-is; everything else is copied into float32.
        if isinstance(data, torch.Tensor):
            return data
        return torch.tensor(data, dtype=torch.float32)

    def _per_sample_labels(labels):
        if labels.dim() > 1:
            labels = labels.squeeze()
        # squeeze() collapses a single-sample (1, 1) label array to a 0-dim
        # scalar, which TensorDataset cannot index; restore the sample axis.
        if labels.dim() == 0:
            labels = labels.reshape(1)
        return labels

    X_train = _as_float_tensor(X_train)
    X_val = _as_float_tensor(X_val)
    y_train = _per_sample_labels(_as_float_tensor(y_train))
    y_val = _per_sample_labels(_as_float_tensor(y_val))

    # Create datasets
    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

    print(f"✓ Created DataLoaders - batch_size={batch_size}, train_batches={len(train_loader)}, val_batches={len(val_loader)}")
    return train_loader, val_loader

def compute_codebert_embeddings(df, code_col="contract_code", cache_dir="/kaggle/working", batch_size=32):
    """Compute or load CodeBERT embeddings with caching.

    Each text in ``df[code_col]`` is tokenized (truncated to 512 tokens), run
    through ``microsoft/codebert-base`` and reduced to a single vector by
    averaging the last-layer hidden states of its real (non-padding) tokens.

    Results are stored in ``cache_dir`` as ``codebert_embeddings.npy`` together
    with ``ids.npy`` (a copy of ``df.index.values``). The cache is reused only
    when its row count and saved index both match ``df``.

    NOTE: the cache key is the DataFrame index only. It cannot detect changed
    contract text or embeddings written by a different pooling strategy, so
    delete both cache files to force a recompute.

    Args:
        df: DataFrame holding the source texts.
        code_col: Name of the column with the source text.
        cache_dir: Directory used for the two cache files.
        batch_size: Number of texts embedded per forward pass.

    Returns:
        2-D ``np.ndarray`` with one row per row of ``df`` (float32 when freshly
        computed; whatever dtype was saved when loaded from cache).
    """
    cache_embeddings = os.path.join(cache_dir, "codebert_embeddings.npy")
    cache_ids = os.path.join(cache_dir, "ids.npy")

    # Check cache
    if os.path.exists(cache_embeddings) and os.path.exists(cache_ids):
        print("Checking cache...")
        try:
            embeddings = np.load(cache_embeddings)
            cached_ids = np.load(cache_ids)

            # Verify match
            if len(embeddings) == len(df) and np.array_equal(cached_ids, df.index.values):
                print(f"✓ Loaded embeddings from cache: shape={embeddings.shape}, dtype={embeddings.dtype}")
                return embeddings
            else:
                print("Cache mismatch - recomputing...")
        except Exception as e:
            # Best effort: an unreadable cache just triggers a recompute.
            print(f"Cache load error: {e} - recomputing...")

    # Heavy imports are deferred until here so a cache hit never pays for them.
    from transformers import RobertaTokenizer, RobertaModel
    from tqdm import tqdm

    # Compute embeddings
    print("Computing CodeBERT embeddings...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base", use_fast=True)
    model = RobertaModel.from_pretrained("microsoft/codebert-base").to(device)
    model.eval()

    def batch_embed(texts):
        # Pad only to the longest text in the batch; the masked mean below makes
        # the result independent of how much padding was added.
        inputs = tokenizer(texts, return_tensors="pt", max_length=512,
                          truncation=True, padding=True).to(device)
        with torch.no_grad():
            if device.type == "cuda":
                with torch.cuda.amp.autocast(dtype=torch.float16):
                    outputs = model(**inputs)
            else:
                outputs = model(**inputs)

        # Pool in float32 and exclude padding positions from the average.
        hidden = outputs.last_hidden_state.float()
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1.0)
        return (token_sum / token_count).cpu().numpy().astype(np.float32)

    # Process in batches
    all_embeddings = []
    contracts = df[code_col].tolist()

    for i in tqdm(range(0, len(contracts), batch_size), desc="Embedding batches"):
        batch = contracts[i:i + batch_size]
        all_embeddings.append(batch_embed(batch))

    if all_embeddings:
        embeddings = np.concatenate(all_embeddings, axis=0).astype(np.float32, copy=False)
    else:
        # Keep the result 2-D even when there is nothing to embed.
        embeddings = np.zeros((0, model.config.hidden_size), dtype=np.float32)

    # Save cache
    os.makedirs(cache_dir, exist_ok=True)
    np.save(cache_embeddings, embeddings)
    np.save(cache_ids, df.index.values)
    print(f"✓ Saved to cache: shape={embeddings.shape}, dtype={embeddings.dtype}")

    return embeddings

In [None]:
# Compute CodeBERT embeddings with caching
EMBEDDING_BATCH_SIZE = 32  # Adjust based on GPU memory

embedding_options = dict(
    code_col="contract_code",
    cache_dir="/kaggle/working",
    batch_size=EMBEDDING_BATCH_SIZE,
)
embeddings_array = compute_codebert_embeddings(df, **embedding_options)

# Store one embedding vector per row of the frame.
df["code_embedding"] = [row_vector for row_vector in embeddings_array]
print(f"\nEmbeddings shape: {embeddings_array.shape}")
print(f"Embeddings dtype: {embeddings_array.dtype}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Select bytecode features
bytecode_features = df[["Weight bytecode_character_6", "Weight bytecode_character_0", 
                         "Weight bytecode_character_8", "Weight bytecode_character_4", 
                         "Weight bytecode_character_5", "Weight bytecode_character_2"]]

y = df["label"].values

# Single train/val split, made on row positions first so that preprocessing
# statistics can be fitted on the training rows only. Same seed, test size and
# stratification as before, so the partition itself is unchanged.
train_idx, val_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42, stratify=y
)

# Standardize: fit on training rows only (fitting on every row would leak
# validation-set mean/variance into the training features), then apply to all rows.
scaler = StandardScaler()
scaler.fit(bytecode_features.iloc[train_idx])
bytecode_features_scaled = scaler.transform(bytecode_features)

# Combine embeddings + bytecode
X = np.hstack([np.stack(df["code_embedding"].values), bytecode_features_scaled])

print(f"Feature matrix: {X.shape}")
print(f"Labels: {y.shape}, distribution: {np.unique(y, return_counts=True)}")

X_train, X_val = X[train_idx], X[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

print(f"\n✓ Train: {X_train.shape[0]} samples, Val: {X_val.shape[0]} samples")

In [None]:
import torch.nn as nn
import torch.optim as optim
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class CodeBERTClassifier(nn.Module):
    """Five-layer MLP that maps a feature vector to a single logit.

    Four hidden stages (2048, 1024, 512, 256 units) each apply
    Linear -> ReLU -> BatchNorm1d -> Dropout(0.3); a final Linear layer emits
    one raw logit per sample (no sigmoid).
    """

    _HIDDEN_WIDTHS = (2048, 1024, 512, 256)

    def __init__(self, input_dim):
        super().__init__()
        widths = (input_dim, *self._HIDDEN_WIDTHS)
        # Register fc1/bn1/dropout1 ... fc4/bn4/dropout4 in that order.
        for stage, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"fc{stage}", nn.Linear(fan_in, fan_out))
            setattr(self, f"bn{stage}", nn.BatchNorm1d(fan_out))
            setattr(self, f"dropout{stage}", nn.Dropout(0.3))
        self.fc5 = nn.Linear(widths[-1], 1)

    def forward(self, x):
        hidden = x
        for stage in range(1, len(self._HIDDEN_WIDTHS) + 1):
            linear = getattr(self, f"fc{stage}")
            norm = getattr(self, f"bn{stage}")
            drop = getattr(self, f"dropout{stage}")
            hidden = drop(norm(torch.relu(linear(hidden))))
        # Return logits (no sigmoid)
        return self.fc5(hidden)

# Create model
model = CodeBERTClassifier(input_dim=X_train.shape[1]).to(device)

# Use BCEWithLogitsLoss (combines sigmoid + BCE for numerical stability)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-6)
# The scheduler is stepped once per epoch (see below), so after 150 epochs the
# learning rate is multiplied by 0.001 (1e-3 -> 1e-6) for the remaining epochs.
# NOTE(review): gamma=0.001 is an unusually steep decay — confirm it is intended.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.001)

# Create DataLoaders
train_loader, val_loader = make_loaders(X_train, y_train, X_val, y_val, batch_size=128)

# Training loop
num_epochs = 250

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    
    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        # Labels come out of the loader 1-D; add a column axis to match the (batch, 1) logits.
        batch_y = batch_y.unsqueeze(1).to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        train_loss += loss.item()
    
    scheduler.step()
    # Average of per-batch mean losses (not weighted by batch size).
    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.unsqueeze(1).to(device)
            
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
    
    val_loss /= len(val_loader)
    
    # Report every 10th epoch only.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

print("\n✓ Training complete!")

In [None]:
# Evaluation - apply sigmoid to get probabilities
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Collect raw logits and labels for the whole validation set, batch by batch.
all_logits, all_labels = [], []

with torch.no_grad():
    for batch_X, batch_y in val_loader:
        batch_X = batch_X.to(device)
        logits = model(batch_X)
        all_logits.append(logits.cpu())
        all_labels.append(batch_y.cpu())

val_logits = torch.cat(all_logits, dim=0)
val_labels = torch.cat(all_labels, dim=0)

# Sigmoid turns logits into probabilities; 0.5 is the decision threshold.
val_probs = torch.sigmoid(val_logits).numpy()
val_preds = (val_probs > 0.5).astype(float)

# Score the thresholded predictions against the true labels.
y_true = val_labels.numpy()
accuracy = (val_preds.squeeze() == y_true).mean()
precision, recall, f1 = (
    metric(y_true, val_preds) for metric in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_true, val_preds)

print("\n=== Validation Metrics ===")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print(f"\nConfusion Matrix:\n{cm}")

# ROC-AUC is computed from the probabilities, not the thresholded predictions.
fpr, tpr, _ = roc_curve(y_true, val_probs)
auc_score = auc(fpr, tpr)
print(f"\nAUC: {auc_score:.4f}")

In [None]:
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

# Reliability diagram for the baseline classifier: 10 probability bins vs. observed positive rate.
prob_true, prob_pred = calibration_curve(val_labels.numpy(), val_probs.squeeze(), n_bins=10)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(prob_pred, prob_true, marker='o', label="CodeBERT Classifier")
ax.plot([0, 1], [0, 1], linestyle="--", label="Perfectly calibrated")
ax.set_xlabel("Mean predicted value")
ax.set_ylabel("Fraction of positives")
ax.set_title("Calibration Curve")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class FocalLoss(nn.Module):
    """Binary focal loss computed on raw logits.

    Per element: ``alpha_t * (1 - p_t) ** gamma * BCE``, where ``p_t`` is the
    probability the model assigns to the true class and ``alpha_t`` is
    ``alpha`` for positive targets and ``1 - alpha`` for negative targets.
    Applying ``alpha`` uniformly to both classes would only rescale the loss
    and would not balance the classes.

    Args:
        alpha: Weight of the positive class in [0, 1]; negatives get ``1 - alpha``.
        gamma: Focusing exponent; larger values down-weight easy examples more.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits ``inputs`` and float targets of the same shape."""
        bce_loss = nn.BCEWithLogitsLoss(reduction='none')(inputs, targets)
        # exp(-BCE) recovers p_t, the probability assigned to the true class.
        pt = torch.exp(-bce_loss)
        # Class-dependent weight; linear in targets so soft labels also work.
        alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
        focal_loss = alpha_t * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

class TemperatureScaling(nn.Module):
    """Divide logits by a single learnable temperature, initialised to 1.5."""

    def __init__(self):
        super().__init__()
        self.temperature = nn.Parameter(torch.full((1,), 1.5))

    def forward(self, logits):
        return torch.div(logits, self.temperature)

class ResidualBlock(nn.Module):
    """Linear -> ReLU -> BatchNorm1d -> Dropout(0.3) with a learned linear shortcut.

    Because input and output widths differ, the skip connection is a separate
    ``Linear(in_features, out_features)`` projection (``adapter``) rather than
    a plain identity.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.fc = nn.Linear(in_features, out_features)
        self.bn = nn.BatchNorm1d(out_features)
        self.dropout = nn.Dropout(0.3)
        self.adapter = nn.Linear(in_features, out_features)

    def forward(self, x):
        shortcut = self.adapter(x)
        transformed = self.dropout(self.bn(torch.relu(self.fc(x))))
        return transformed + shortcut

class ImprovedCodeBERTClassifier(nn.Module):
    """MLP with residual blocks that emits one temperature-scaled logit per sample.

    Layout: a 2048-unit stem (Linear -> ReLU -> BatchNorm1d -> Dropout), three
    ResidualBlocks narrowing 2048 -> 1024 -> 512 -> 256, a Linear(256, 1) head
    and a TemperatureScaling module applied to the head's output. The
    temperature is an ordinary parameter of this module, so any optimizer
    built from ``model.parameters()`` trains it together with the weights.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Stem
        self.fc1 = nn.Linear(input_dim, 2048)
        self.bn1 = nn.BatchNorm1d(2048)
        self.dropout1 = nn.Dropout(0.3)

        # Residual blocks res1..res3, registered in order
        block_widths = ((2048, 1024), (1024, 512), (512, 256))
        for position, (width_in, width_out) in enumerate(block_widths, start=1):
            setattr(self, f"res{position}", ResidualBlock(width_in, width_out))

        # Head and calibration temperature
        self.fc_out = nn.Linear(256, 1)
        self.temperature = TemperatureScaling()

    def forward(self, x):
        features = self.dropout1(self.bn1(torch.relu(self.fc1(x))))
        for block in (self.res1, self.res2, self.res3):
            features = block(features)
        # Still logits after dividing by the temperature; no sigmoid here.
        return self.temperature(self.fc_out(features))

def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """Build a LambdaLR schedule: linear warm-up followed by linear decay to zero.

    The learning-rate multiplier rises from 0 at step 0 to 1 at
    ``num_warmup_steps``, then falls linearly to 0 at ``num_training_steps``
    and is clamped at 0 afterwards.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_warmup_steps: Number of scheduler steps spent warming up.
        num_training_steps: Scheduler step at which the multiplier reaches 0.

    Returns:
        A ``LambdaLR`` scheduler bound to ``optimizer``.
    """
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def lr_lambda(current_step):
        if current_step >= num_warmup_steps:
            remaining = float(num_training_steps - current_step)
            return max(0.0, remaining / decay_span)
        return float(current_step) / warmup_span

    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# Initialize model
model = ImprovedCodeBERTClassifier(input_dim=X_train.shape[1]).to(device)
criterion = FocalLoss(alpha=0.25, gamma=2.0)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=5e-5)

# Scheduler (stepped once per epoch below, so "steps" here are epochs)
total_epochs = 350
warmup_steps = 15
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_epochs)

# Create DataLoaders
train_loader, val_loader = make_loaders(X_train, y_train, X_val, y_val, batch_size=128)

# Training with early stopping on validation loss
best_val_loss = float('inf')
patience = 20
patience_counter = 0
best_model_state = None

for epoch in range(total_epochs):
    # Training
    model.train()
    train_loss = 0.0
    
    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        # Labels come out of the loader 1-D; add a column axis to match the (batch, 1) logits.
        batch_y = batch_y.unsqueeze(1).to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        train_loss += loss.item()
    
    scheduler.step()
    train_loss /= len(train_loader)
    
    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.unsqueeze(1).to(device)
            
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()
            
            # Calculate accuracy
            preds = (torch.sigmoid(outputs) > 0.5).float()
            correct += (preds == batch_y).sum().item()
            total += batch_y.size(0)
    
    val_loss /= len(val_loader)
    val_accuracy = correct / total
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{total_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Clone every tensor: state_dict().copy() is a shallow copy whose
        # entries alias the live parameters, so later optimizer steps would
        # silently overwrite the "best" snapshot.
        best_model_state = {
            name: tensor.detach().clone() for name, tensor in model.state_dict().items()
        }
        if (epoch + 1) % 10 == 0:
            print(f"✓ New best model saved!")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Load best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print("\n✓ Loaded best model")

# Final evaluation
model.eval()
all_logits = []
all_labels = []

with torch.no_grad():
    for batch_X, batch_y in val_loader:
        batch_X = batch_X.to(device)
        logits = model(batch_X)
        all_logits.append(logits.cpu())
        all_labels.append(batch_y.cpu())

val_logits = torch.cat(all_logits, dim=0)
val_labels = torch.cat(all_labels, dim=0)
val_probs = torch.sigmoid(val_logits).numpy()

# Accuracy = share of thresholded predictions that equal the true label.
# (Averaging the thresholded predictions alone would only give the fraction
# of samples predicted positive.)
final_val_preds = (val_probs.reshape(-1) > 0.5).astype(float)
final_val_accuracy = (final_val_preds == val_labels.numpy().reshape(-1)).mean()
print(f"\nFinal Validation Accuracy: {final_val_accuracy:.4f}")

# Plot calibration
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

# Reliability diagram (top, two thirds of the figure) plus a histogram of the
# predicted probabilities (bottom) for the improved model.
prob_true, prob_pred = calibration_curve(val_labels.numpy(), val_probs.squeeze(), n_bins=10)

plt.figure(figsize=(8, 8))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
ax1.plot(prob_pred, prob_true, "s-", label=f"Improved Model")
ax1.set(ylabel="Fraction of positives", ylim=[-0.05, 1.05], title="Calibration Curve")
ax1.legend(loc="lower right")

ax2.hist(val_probs.squeeze(), range=(0, 1), bins=10, histtype="step", lw=2)
ax2.set(xlabel="Mean predicted value", ylabel="Count")

plt.tight_layout()
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Feature dimensionality reduction
def reduce_dimensions(X_train, X_val, n_components=150):
    """Standardize the features, then project them onto their top principal components.

    Both the scaler and the PCA are fitted on ``X_train`` only and then applied
    to ``X_val``.

    Args:
        X_train: Training feature matrix.
        X_val: Validation feature matrix with the same columns.
        n_components: Number of principal components to keep.

    Returns:
        ``(X_train_pca, X_val_pca, pca, scaler)``: the two projected matrices
        followed by the fitted PCA and StandardScaler objects.
    """
    print(f"Original feature dimensions: {X_train.shape[1]}")

    # Standardize using training statistics only
    scaler = StandardScaler().fit(X_train)
    X_train_scaled, X_val_scaled = scaler.transform(X_train), scaler.transform(X_val)

    # PCA (fixed random_state keeps the decomposition repeatable)
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_val_pca = pca.transform(X_val_scaled)

    explained_var = np.sum(pca.explained_variance_ratio_)
    print(f"Reduced to {n_components} components, explaining {explained_var:.4f} of variance")

    return X_train_pca, X_val_pca, pca, scaler

# Mixup data augmentation
def mixup_data(x, y, alpha=0.2):
    """Blend each sample in a batch with a randomly chosen partner (mixup).

    A single mixing coefficient ``lam`` is drawn from ``Beta(alpha, alpha)``
    for the whole batch (``lam = 1``, i.e. no mixing, when ``alpha <= 0``).

    Args:
        x: Batch of inputs, first dimension is the batch.
        y: Targets aligned with ``x``.
        alpha: Beta-distribution concentration; values <= 0 disable mixing.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) * x[perm]``,
        ``y_a`` is ``y`` unchanged and ``y_b`` is ``y[perm]``.
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    # Put the permutation on the same device as the batch itself instead of
    # relying on a module-level `device`, so CPU and GPU batches both index
    # correctly. The permutation is still drawn from the CPU generator.
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Combine the losses against both mixup targets, weighted ``lam`` and ``1 - lam``."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

# Base model with regularization (logits output)
class RegularizedModel(nn.Module):
    """Configurable MLP that outputs one logit and exposes a norm penalty.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; a final
    Linear layer maps the last hidden width to a single logit.

    The penalty returned by :meth:`get_l2_loss` is ``l2_reg`` times the sum of
    the (non-squared) 2-norms of all parameters. It is computed on demand
    rather than as a side effect of ``forward``, so inference does no extra
    work and the method can be called at any time.

    Args:
        input_dim: Number of input features.
        hidden_dims: Sequence of hidden-layer widths (at least one entry).
        dropout_rate: Dropout probability used after every hidden stage.
    """

    def __init__(self, input_dim, hidden_dims=(512, 256, 128), dropout_rate=0.5):
        super(RegularizedModel, self).__init__()

        self.layers = nn.ModuleList()

        # Input stage followed by the hidden stages, all with the same layout.
        widths = [input_dim, *hidden_dims]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            self.layers.append(nn.Linear(fan_in, fan_out))
            self.layers.append(nn.BatchNorm1d(fan_out))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Dropout(dropout_rate))

        # Output layer (logits)
        self.output = nn.Linear(hidden_dims[-1], 1)

        self.l2_reg = 1e-4

    def forward(self, x):
        """Return raw logits of shape ``(batch, 1)`` for input ``x``."""
        for layer in self.layers:
            x = layer(x)
        return self.output(x)  # Return logits

    def get_l2_loss(self):
        """Return ``l2_reg * sum(||p||_2 for p in parameters)`` for the current weights.

        The result is attached to the autograd graph whenever gradients are
        enabled, so it can be added directly to the training loss.
        """
        l2_loss = 0.0
        for param in self.parameters():
            l2_loss = l2_loss + torch.norm(param)
        return self.l2_reg * l2_loss

# Different architectures for ensemble
class WideModel(RegularizedModel):
    """RegularizedModel with hidden widths 1024-512-256 and dropout 0.5."""

    def __init__(self, input_dim):
        super().__init__(input_dim, dropout_rate=0.5, hidden_dims=[1024, 512, 256])

class DeepModel(RegularizedModel):
    """RegularizedModel with hidden widths 512-256-128-64 and dropout 0.4."""

    def __init__(self, input_dim):
        super().__init__(input_dim, dropout_rate=0.4, hidden_dims=[512, 256, 128, 64])

class CompactModel(RegularizedModel):
    """RegularizedModel with hidden widths 256-128 and dropout 0.3."""

    def __init__(self, input_dim):
        super().__init__(input_dim, dropout_rate=0.3, hidden_dims=[256, 128])

# Ensemble wrapper
class EnsembleModel:
    """Average the sigmoid probabilities of several logit-producing models."""

    def __init__(self, models):
        self.models = models

    def predict(self, x):
        """Return the element-wise mean probability across all member models.

        Every member is switched to eval mode and run without gradient tracking.
        """
        def _member_probabilities(member):
            member.eval()
            with torch.no_grad():
                return torch.sigmoid(member(x))

        # Average predictions
        stacked = torch.stack([_member_probabilities(member) for member in self.models])
        return stacked.mean(dim=0)

# Training function with fixed SWA and DataLoader
def train_model(model, X_train, y_train, X_val, y_val, criterion, optimizer, 
                scheduler=None, epochs=200, batch_size=128, 
                patience=25, use_mixup=True, alpha=0.2, use_swa=True):
    """Train ``model`` with early stopping, optional mixup and optional SWA.

    Per epoch: one pass over the training loader (mixup is applied during the
    first 80% of ``epochs`` when enabled; gradients are clipped to norm 1.0),
    one scheduler step, then a validation pass. From epoch ``epochs // 2``
    onward, when ``use_swa`` is set, the SWA average is updated and the SWA
    scheduler replaces ``scheduler``.

    Training stops early after ``patience`` epochs without a new best
    validation loss. The weights of the best epoch are then restored. If SWA
    averaging took place, the averaged model gets its BatchNorm statistics
    recomputed and replaces the restored weights when its validation accuracy
    is strictly higher than that of the restored checkpoint.

    Args:
        model: Module producing logits of shape ``(batch, 1)``.
        X_train, y_train, X_val, y_val: Data passed to ``make_loaders``.
        criterion: Loss taking ``(logits, targets)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        scheduler: Optional LR scheduler stepped once per epoch before SWA starts.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size for both loaders.
        patience: Early-stopping patience in epochs.
        use_mixup: Whether to apply mixup augmentation.
        alpha: Beta-distribution parameter handed to ``mixup_data``.
        use_swa: Whether to maintain a stochastic weight average.

    Returns:
        The same ``model`` object, holding the selected weights.
    """
    # Create DataLoaders
    train_loader, val_loader = make_loaders(X_train, y_train, X_val, y_val, batch_size=batch_size)

    best_val_loss = float('inf')
    best_val_accuracy = 0.0
    patience_counter = 0
    best_model_state = None

    # SWA setup
    swa_start = epochs // 2
    swa_model = None
    swa_scheduler = None
    swa_updated = False  # becomes True once at least one average was taken

    if use_swa:
        swa_model = torch.optim.swa_utils.AveragedModel(model)
        swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=0.0005)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            # Labels come out of the loader 1-D; add a column axis to match the logits.
            batch_y = batch_y.unsqueeze(1).to(device)

            optimizer.zero_grad()

            # Apply mixup if enabled (only during the first 80% of the epochs)
            if use_mixup and epoch < epochs * 0.8:
                batch_X, targets_a, targets_b, lam = mixup_data(batch_X, batch_y, alpha)
                outputs = model(batch_X)
                loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
            else:
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)

            # Add the model's own weight penalty if it provides one
            if hasattr(model, 'get_l2_loss'):
                loss = loss + model.get_l2_loss()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            epoch_loss += loss.item()

        epoch_loss /= len(train_loader)

        # Update SWA after swa_start; before that, step the regular scheduler
        if use_swa and epoch >= swa_start:
            swa_model.update_parameters(model)
            swa_scheduler.step()
            swa_updated = True
        elif scheduler is not None:
            scheduler.step()

        # Validation
        val_loss, val_accuracy = _evaluate(model, val_loader, criterion)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_accuracy = val_accuracy
            patience_counter = 0
            # Deep snapshot: state_dict().copy() would alias the live weights
            # and be overwritten by every later optimizer step.
            best_model_state = _clone_state_dict(model)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Load best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print("✓ Loaded best model")

    # Finalize SWA model
    if swa_updated:
        print("Updating batch normalization for SWA model...")
        # The loader yields CPU tensors; pass the device so update_bn moves
        # each batch next to the model before the forward pass.
        torch.optim.swa_utils.update_bn(train_loader, swa_model, device=device)

        # Evaluate SWA model
        _, swa_accuracy = _evaluate(swa_model, val_loader)
        print(f"SWA Model Val Accuracy: {swa_accuracy:.4f}")

        # Use SWA model if it beats the checkpoint that was just restored
        if swa_accuracy > best_val_accuracy:
            print("✓ Using SWA model (better performance)")
            # Copy SWA parameters to model
            model.load_state_dict(swa_model.module.state_dict())

    return model


def _clone_state_dict(module):
    """Return a snapshot of ``module``'s state dict whose tensors are independent copies."""
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}


def _evaluate(module, loader, criterion=None):
    """Run ``module`` over ``loader`` in eval mode; return ``(mean batch loss or None, accuracy)``."""
    module.eval()
    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_X, batch_y in loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.unsqueeze(1).to(device)

            outputs = module(batch_X)
            if criterion is not None:
                loss_sum += criterion(outputs, batch_y).item()

            preds = (torch.sigmoid(outputs) > 0.5).float()
            correct += (preds == batch_y).sum().item()
            total += batch_y.size(0)

    mean_loss = loss_sum / len(loader) if criterion is not None else None
    return mean_loss, correct / total

# Main ensemble training
def run_ensemble_training(X_train, y_train, X_val, y_val):
    """Train the Wide, Deep and Compact models on PCA-reduced features and ensemble them.

    Steps: reduce the features to 150 principal components, train the three
    members one after another with their own optimizer, scheduler and training
    options, print each member's validation accuracy, then print the accuracy
    of the averaged ensemble and plot its calibration curve.

    Returns:
        ``(ensemble, models, pca, scaler)``: the EnsembleModel, the list of
        trained members, and the fitted PCA and scaler from ``reduce_dimensions``.
    """
    # Reduce dimensions
    X_train_reduced, X_val_reduced, pca, scaler = reduce_dimensions(X_train, X_val, n_components=150)

    criterion = nn.BCEWithLogitsLoss()
    input_dim = X_train_reduced.shape[1]

    def _cosine_schedule(member_optimizer):
        return optim.lr_scheduler.CosineAnnealingLR(member_optimizer, T_max=100, eta_min=1e-5)

    def _step_schedule(member_optimizer):
        return optim.lr_scheduler.StepLR(member_optimizer, step_size=30, gamma=0.5)

    # (banner, model class, optimizer class, optimizer options, scheduler factory, training options)
    member_specs = (
        ("\n=== Training Wide Model ===", WideModel, optim.AdamW,
         dict(lr=0.001, weight_decay=1e-4), _cosine_schedule,
         dict(batch_size=128, use_mixup=True, alpha=0.2, use_swa=True)),
        ("\n=== Training Deep Model ===", DeepModel, optim.AdamW,
         dict(lr=0.002, weight_decay=1e-5), _cosine_schedule,
         dict(batch_size=64, use_mixup=True, alpha=0.3, use_swa=True)),
        ("\n=== Training Compact Model ===", CompactModel, optim.Adam,
         dict(lr=0.003, weight_decay=1e-6), _step_schedule,
         dict(batch_size=256, use_mixup=False, use_swa=False)),
    )

    # Train different models for ensemble, in the order listed above
    models = []
    for banner, model_cls, optimizer_cls, optimizer_options, make_schedule, train_options in member_specs:
        print(banner)
        member = model_cls(input_dim).to(device)
        member_optimizer = optimizer_cls(member.parameters(), **optimizer_options)
        member_scheduler = make_schedule(member_optimizer)
        member = train_model(member, X_train_reduced, y_train, X_val_reduced, y_val,
                             criterion, member_optimizer, member_scheduler,
                             epochs=200, patience=25, **train_options)
        models.append(member)

    # Create ensemble
    ensemble = EnsembleModel(models)

    # Whole validation set as one batch on the compute device
    X_val_torch = torch.tensor(X_val_reduced, dtype=torch.float32).to(device)
    y_val_torch = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1).to(device)

    print("\n=== Individual Model Performance ===")
    for i, member in enumerate(models):
        member.eval()
        with torch.no_grad():
            member_preds = (torch.sigmoid(member(X_val_torch)) > 0.5).float()
            acc = (member_preds == y_val_torch).float().mean().item()
        print(f"Model {i+1} Val Accuracy: {acc:.4f}")

    # Ensemble evaluation
    ensemble_probs = ensemble.predict(X_val_torch)
    ensemble_preds = (ensemble_probs > 0.5).float()
    ensemble_acc = (ensemble_preds == y_val_torch).float().mean().item()

    print(f"\n✓ Ensemble Val Accuracy: {ensemble_acc:.4f}")

    # Plot calibration for ensemble
    val_probs_np = ensemble_probs.cpu().numpy()
    prob_true, prob_pred = calibration_curve(y_val, val_probs_np.squeeze(), n_bins=10)

    plt.figure(figsize=(8, 8))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
    ax1.plot(prob_pred, prob_true, "s-", label=f"Ensemble (Acc: {ensemble_acc:.3f})")
    ax1.set(ylabel="Fraction of positives", ylim=[-0.05, 1.05], title="Ensemble Calibration Curve")
    ax1.legend(loc="lower right")

    ax2.hist(val_probs_np.squeeze(), range=(0, 1), bins=10, histtype="step", lw=2)
    ax2.set(xlabel="Mean predicted value", ylabel="Count")

    plt.tight_layout()
    plt.show()

    return ensemble, models, pca, scaler

# Run ensemble training
# NOTE: `scaler` is rebound here to the scaler returned by run_ensemble_training,
# replacing the module-level bytecode-feature `scaler` created earlier in the notebook.
ensemble, models, pca, scaler = run_ensemble_training(X_train, y_train, X_val, y_val)

print("\n✓ All training complete!")

## Summary

All models trained successfully with the following improvements:

### Fixed Issues:
1. ✅ **SWA update_bn crash**: Now uses proper `train_loader` instead of undefined `X_train_loader`
2. ✅ **Batching**: All training uses DataLoader with mini-batches
3. ✅ **Loss consistency**: All models output logits + use BCEWithLogitsLoss
4. ✅ **Single split**: Removed duplicate test split, using consistent train/val split
5. ✅ **Reproducibility**: Seeds set at the beginning
6. ✅ **Embedding caching**: Embeddings cached to `/kaggle/working` for fast re-runs

### Key Features:
- **CodeBERT Embeddings**: Cached for efficiency
- **DataLoader Training**: Proper mini-batch training for all models
- **Logits + BCEWithLogitsLoss**: Consistent across all models
- **Sigmoid in Evaluation Only**: Applied explicitly when computing metrics
- **SWA Integration**: Properly implemented with DataLoader
- **Ensemble Learning**: Multiple architectures combined for robust predictions

### Metrics Available:
- Accuracy, Precision, Recall, F1-Score
- Confusion Matrix
- ROC-AUC
- Calibration Curves