In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file found under the Kaggle input directory, one full path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
# Load the secure and the vulnerable contract tables (in that order) from the dataset CSVs.
secure_df, vulnerable_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/bccc-vulscs-2023/BCCC-VolSCs-2023_Secure.csv",
        "/kaggle/input/bccc-vulscs-2023/BCCC-VolSCs-2023_Vulnerable.csv",
    )
)

In [None]:
# Stack both tables into a single frame with a fresh 0..n-1 index, then report the row count.
df = pd.concat((secure_df, vulnerable_df), ignore_index=True)
print(df.shape[0])

In [None]:
def _read_contract_source(hash_id):
    """Return the UTF-8 text of the Solidity source file named after ``hash_id``."""
    with open(f"/kaggle/input/contractcodes/source/{hash_id}.sol", 'r', encoding='utf-8') as source_file:
        return source_file.read()


# One source string per row, in the same order as the 'hash_id' column.
contract_codes = [_read_contract_source(hash_id) for hash_id in df['hash_id']]

df['contract_code'] = contract_codes

In [None]:
# Preview the first rows of the combined frame as plain text.
print(df.head())

In [None]:
import torch
# Report which PyTorch build this session is running.
print(torch.__version__)

In [None]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load the pretrained CodeBERT tokenizer and encoder (kept on the CPU in this cell).
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.eval()  # inference only: make sure dropout is disabled

def get_codebert_embedding(code):
    """Embed one contract source string as a single mean-pooled CodeBERT vector.

    The text is truncated to at most 512 tokens. The returned vector is the
    average of the encoder's last hidden states over the real tokens only:
    positions are weighted by the tokenizer's attention mask, so padding can
    never dilute the embedding.

    Args:
        code: Source text of one contract.

    Returns:
        1-D numpy array with one value per hidden dimension of the encoder.
    """
    # A single sequence needs no padding; truncation alone bounds the length.
    inputs = tokenizer(code, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    # Attention-mask-weighted mean over the token axis.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
    return pooled.squeeze(0).numpy()

df['code_embedding'] = df['contract_code'].apply(get_codebert_embedding)


In [None]:
# True when PyTorch can see a CUDA device in this session.
print(torch.cuda.is_available())

In [None]:
import torch
# Report GPU visibility. The device-name lookup is guarded because
# torch.cuda.get_device_name raises when no CUDA device is present,
# which would abort a "Run All" on a CPU-only session.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should be True
print(torch.cuda.device_count())  # Should be >0
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Should print "Tesla P100"
else:
    print("No CUDA device visible; later cells will fall back to the CPU.")

In [None]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base").to(device)  # Move model to GPU
model.eval()  # inference only: make sure dropout is disabled

def get_codebert_embedding(code):
    """Embed one contract source string as a single mean-pooled CodeBERT vector.

    The text is truncated to at most 512 tokens and encoded on ``device``.
    The last hidden states are averaged over real tokens only (weighted by
    the tokenizer's attention mask), so padding never enters the mean.

    Args:
        code: Source text of one contract.

    Returns:
        1-D numpy array (on the CPU) with one value per hidden dimension.
    """
    # A single sequence needs no padding; truncation alone bounds the length.
    inputs = tokenizer(code, return_tensors="pt", max_length=512, truncation=True).to(device)  # Move inputs to GPU
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    # Attention-mask-weighted mean over the token axis.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
    return pooled.squeeze(0).cpu().numpy()  # Move output back to CPU

df['code_embedding'] = df['contract_code'].apply(get_codebert_embedding)

In [None]:
from transformers import RobertaTokenizer, RobertaModel
import torch
import numpy as np
from tqdm import tqdm

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
# NOTE(review): use_fast is documented as an AutoTokenizer option; confirm it has
# any effect when passed to RobertaTokenizer directly.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base", use_fast=True)
model = RobertaModel.from_pretrained("microsoft/codebert-base").to(device)
model.eval()  # Set to evaluation mode

# Number of contracts encoded per forward pass
BATCH_SIZE = 32  # Increase if GPU memory allows

def batch_get_codebert_embedding(texts):
    """Embed a batch of contract sources as mean-pooled CodeBERT vectors.

    Each text is truncated to at most 512 tokens and the batch is padded only
    to its longest member. Pooling is weighted by the attention mask, so pad
    positions contribute nothing and a short contract gets the same embedding
    regardless of what else is in its batch.

    Args:
        texts: List of contract source strings.

    Returns:
        numpy array of shape (len(texts), hidden_size).
    """
    inputs = tokenizer(texts, return_tensors="pt", max_length=512, truncation=True, padding=True).to(device)

    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

    # Attention-mask-weighted mean over the token axis.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
    return pooled.cpu().numpy()

# Process in batches
all_embeddings = []
contracts = df['contract_code'].tolist()  # Convert DataFrame column to list

for i in tqdm(range(0, len(contracts), BATCH_SIZE), desc="Processing Batches"):
    batch = contracts[i:i + BATCH_SIZE]  # Get batch
    batch_embeddings = batch_get_codebert_embedding(batch)  # Compute embeddings
    all_embeddings.extend(batch_embeddings)  # One 1-D vector per contract

# Add embeddings to DataFrame
df['code_embedding'] = list(all_embeddings)


In [None]:
# Confirm the 'code_embedding' column exists and show the dtype pandas assigned to it.
for summary in (df.head(), df['code_embedding'].dtype):
    print(summary)


In [None]:
# Look at the first stored embeddings and the Python type of a single entry.
embedding_column = df['code_embedding']
print(embedding_column.head())
print(type(embedding_column.iloc[0]))

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Bytecode-character weight columns that are appended to the embedding.
bytecode_columns = [
    'Weight bytecode_character_6', 'Weight bytecode_character_0',
    'Weight bytecode_character_8', 'Weight bytecode_character_4',
    'Weight bytecode_character_5', 'Weight bytecode_character_2',
]

# NOTE(review): the scaler is fit on every row here, i.e. before any
# train/validation split — confirm that is acceptable for the evaluation.
scaler = StandardScaler()
bytecode_features = scaler.fit_transform(df[bytecode_columns])

# Feature matrix: one embedding row per contract followed by the scaled bytecode columns.
embedding_matrix = np.stack(df['code_embedding'].to_numpy())
X = np.concatenate([embedding_matrix, bytecode_features], axis=1)
y = df['label'].values

In [None]:
# Print the feature/label shapes and the distinct label values, one per line.
shape_report = (
    f"Shape of X: {X.shape}",  # Should be (num_samples, embedding_dim + bytecode_features)
    f"Shape of y: {y.shape}",  # Should be (num_samples,)
    f"Sample y values: {np.unique(y)}",  # Should show [0,1]
)
print(*shape_report, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test partition; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)


def _as_float_tensor(values):
    """Copy an array into a float32 tensor placed on ``device``."""
    return torch.tensor(values, dtype=torch.float32).to(device)


# Features stay 2-D; labels become a column so they line up with the (N, 1) model output.
X_train_torch = _as_float_tensor(X_train)
y_train_torch = _as_float_tensor(y_train).unsqueeze(1)
X_val_torch = _as_float_tensor(X_val)
y_val_torch = _as_float_tensor(y_val).unsqueeze(1)

In [None]:
import torch.nn as nn
import torch.optim as optim

class CodeBERTClassifier(nn.Module):
    """Fully connected binary classifier: input_dim -> 256 -> 128 -> 1 with a sigmoid output."""

    def __init__(self, input_dim):
        """Create the three linear layers and the activations they share."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability in [0, 1] per input row."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))

# Classifier sized to the width of the training feature matrix (left on the CPU in this cell).
model = CodeBERTClassifier(input_dim=X_train.shape[1])

# The network already ends in a sigmoid, so plain binary cross-entropy is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Full training set as float32 tensors; labels become a column to match the (N, 1) output.
X_train_torch = torch.tensor(X_train, dtype=torch.float32)
y_train_torch = torch.tensor(y_train, dtype=torch.float32)[:, None]

# Ten full-batch gradient steps.
for epoch in range(10):
    outputs = model(X_train_torch)
    loss = criterion(outputs, y_train_torch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

In [None]:
import torch.nn as nn
import torch.optim as optim
import torch

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


class CodeBERTClassifier(nn.Module):
    """Fully connected binary classifier: input_dim -> 256 -> 128 -> 1 with a sigmoid output."""

    def __init__(self, input_dim):
        """Create the three linear layers and the activations they share."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability in [0, 1] per input row."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))

# Classifier sized to the training feature width, placed on the selected device.
model = CodeBERTClassifier(input_dim=X_train.shape[1]).to(device)

# The network already ends in a sigmoid, so plain binary cross-entropy is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training data created directly on the device; labels become a column to match the (N, 1) output.
X_train_torch = torch.tensor(X_train, dtype=torch.float32, device=device)
y_train_torch = torch.tensor(y_train, dtype=torch.float32, device=device).unsqueeze(1)

for epoch in range(100):  # 100 full-batch epochs
    outputs = model(X_train_torch)  # Forward pass
    loss = criterion(outputs, y_train_torch)  # Compute loss
    optimizer.zero_grad()  # Drop gradients left over from the previous step
    loss.backward()  # Backpropagation
    optimizer.step()  # Update weights

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

In [None]:
import torch.nn as nn
import torch.optim as optim
import torch

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


class CodeBERTClassifier(nn.Module):
    """Five-layer MLP (2048-1024-512-256-1) that returns raw logits.

    Every hidden stage applies Linear -> ReLU -> BatchNorm1d -> Dropout(0.3).
    The final layer has no activation, so pair the model with a logit-based loss.
    """

    def __init__(self, input_dim):
        """Create the four hidden stages and the single-logit output layer."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 2048)
        self.bn1 = nn.BatchNorm1d(2048)
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(2048, 1024)
        self.bn2 = nn.BatchNorm1d(1024)
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(1024, 512)
        self.bn3 = nn.BatchNorm1d(512)
        self.dropout3 = nn.Dropout(0.3)
        self.fc4 = nn.Linear(512, 256)
        self.bn4 = nn.BatchNorm1d(256)
        self.dropout4 = nn.Dropout(0.3)
        self.fc5 = nn.Linear(256, 1)

    def forward(self, x):
        """Return one logit per input row."""
        hidden_stages = (
            (self.fc1, self.bn1, self.dropout1),
            (self.fc2, self.bn2, self.dropout2),
            (self.fc3, self.bn3, self.dropout3),
            (self.fc4, self.bn4, self.dropout4),
        )
        for linear, batch_norm, dropout in hidden_stages:
            x = dropout(batch_norm(torch.relu(linear(x))))
        return self.fc5(x)

# Classifier sized to the training feature width, placed on the selected device.
model = CodeBERTClassifier(input_dim=X_train.shape[1]).to(device)

# The loss takes raw model outputs (logits) and applies the sigmoid internally.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-6)
# NOTE(review): gamma=0.001 multiplies the learning rate by 1e-3 after 150
# scheduler steps — confirm such a sharp drop is intended.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=150, gamma=0.001)

# Train/validation data created directly on the device; labels are columns to match the (N, 1) output.
X_train_torch = torch.tensor(X_train, dtype=torch.float32, device=device)
y_train_torch = torch.tensor(y_train, dtype=torch.float32, device=device).unsqueeze(1)
X_val_torch = torch.tensor(X_val, dtype=torch.float32, device=device)
y_val_torch = torch.tensor(y_val, dtype=torch.float32, device=device).unsqueeze(1)

for epoch in range(250):
    # One full-batch optimisation step with dropout and batch-norm updates active.
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_torch)
    loss = criterion(outputs, y_train_torch)
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Validation loss in eval mode, without tracking gradients.
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_torch)
        val_loss = criterion(val_outputs, y_val_torch)

    print(f"Epoch {epoch+1}, Loss: {loss.item()}, Val Loss: {val_loss.item()}")


In [None]:
# Validation accuracy of the trained model.

model.eval()  # Inference behaviour for dropout / batch norm
with torch.no_grad():
    val_outputs = model(X_val_torch)
    # Logits -> probabilities -> hard 0/1 predictions at the 0.5 threshold
    val_preds = (torch.sigmoid(val_outputs) > 0.5).float()

    # Fraction of rows whose prediction equals the label
    correct = val_preds.eq(y_val_torch).sum().item()
    accuracy = correct / y_val_torch.shape[0]

print(f"Validation Accuracy: {accuracy * 100:.2f}%")

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Get hard predictions: a logit above 0 is the same as a sigmoid probability above 0.5.
# The forward pass runs in eval mode under no_grad so no autograd graph is built for
# the whole validation set, and the (N, 1) column is flattened to the 1-D label
# vector that the sklearn metrics expect.
model.eval()
with torch.no_grad():
    y_pred = (model(X_val_torch) > 0).cpu().numpy().ravel()

precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

In [None]:
from sklearn.metrics import confusion_matrix

# Hard predictions from the logits (logit > 0 <=> probability > 0.5), computed in eval
# mode without autograd tracking and flattened to a 1-D vector for sklearn.
model.eval()
with torch.no_grad():
    y_pred = (model(X_val_torch) > 0).cpu().numpy().ravel()
cm = confusion_matrix(y_val, y_pred)
print(f"Confusion Matrix:\n{cm}")

In [None]:
from sklearn.metrics import roc_curve, auc

# Get the predicted probabilities for class 1 (positive class).
# Computed in eval mode under no_grad (no autograd graph is needed for scoring)
# and flattened from (N, 1) to the 1-D score vector sklearn expects.
model.eval()
with torch.no_grad():
    y_prob = torch.sigmoid(model(X_val_torch)).cpu().numpy().ravel()
fpr, tpr, _ = roc_curve(y_val, y_prob)
auc_score = auc(fpr, tpr)

print(f"AUC: {auc_score}")

In [None]:
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

# Bin the validation probabilities into 10 buckets and compare the mean prediction
# in each bucket with the observed fraction of positives.
prob_true, prob_pred = calibration_curve(y_val, y_prob, n_bins=10)

fig, ax = plt.subplots()
ax.plot(prob_pred, prob_true, marker='o', label="Calibration Curve")
ax.plot([0, 1], [0, 1], linestyle="--", label="Perfectly calibrated")
ax.set_xlabel("Mean predicted value")
ax.set_ylabel("Fraction of positives")
ax.set_title("Calibration Curve")
ax.legend()
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


class FocalLoss(nn.Module):
    """Focal loss on logits: per-element BCE scaled by ``alpha * (1 - pt) ** gamma``.

    ``pt`` is recovered as ``exp(-bce)``. ``alpha`` is applied as one constant
    factor to every element (it is not switched per class), and the result is
    averaged over all elements.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits ``inputs`` against 0/1 ``targets``."""
        per_element_bce = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        pt = torch.exp(-per_element_bce)
        modulation = torch.pow(1 - pt, self.gamma)
        return torch.mean(self.alpha * modulation * per_element_bce)

class TemperatureScaling(nn.Module):
    """Divide logits by a single learnable temperature, initialised to 1.5."""

    def __init__(self):
        super().__init__()
        self.temperature = nn.Parameter(torch.full((1,), 1.5))

    def forward(self, logits):
        """Return ``logits`` scaled by the reciprocal of the temperature."""
        return torch.div(logits, self.temperature)

class ResidualBlock(nn.Module):
    """Linear -> ReLU -> BatchNorm1d -> Dropout(0.3) branch plus a linear shortcut.

    Because the width changes from ``in_features`` to ``out_features``, the
    skip path goes through its own ``adapter`` linear layer before the sum.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.fc = nn.Linear(in_features, out_features)
        self.bn = nn.BatchNorm1d(out_features)
        self.dropout = nn.Dropout(0.3)
        self.adapter = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Return the transformed branch added to the projected input."""
        branch = self.dropout(self.bn(torch.relu(self.fc(x))))
        shortcut = self.adapter(x)
        return branch + shortcut

class ImprovedCodeBERTClassifier(nn.Module):
    """Residual MLP that maps a feature vector to one temperature-scaled logit.

    Layout: Linear(input_dim, 2048) -> ReLU -> BatchNorm1d -> Dropout(0.3),
    three residual blocks (2048 -> 1024 -> 512 -> 256), a single-logit
    linear head, then division by the learnable temperature.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Stem
        self.fc1 = nn.Linear(input_dim, 2048)
        self.bn1 = nn.BatchNorm1d(2048)
        self.dropout1 = nn.Dropout(0.3)

        # Progressively narrower residual stages
        self.res1 = ResidualBlock(2048, 1024)
        self.res2 = ResidualBlock(1024, 512)
        self.res3 = ResidualBlock(512, 256)

        # Single-logit head
        self.fc_out = nn.Linear(256, 1)

        # Learnable temperature applied to the final logit
        self.temperature = TemperatureScaling()

    def forward(self, x):
        """Return one temperature-scaled logit per input row."""
        features = self.dropout1(self.bn1(torch.relu(self.fc1(x))))
        for stage in (self.res1, self.res2, self.res3):
            features = stage(features)
        return self.temperature(self.fc_out(features))

def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    """Build a LambdaLR whose multiplier ramps up linearly, then decays linearly to zero.

    The multiplier rises from 0 to 1 over ``num_warmup_steps`` scheduler steps
    and then falls linearly, reaching 0 at ``num_training_steps`` (never negative).
    """

    def lr_lambda(current_step):
        if current_step >= num_warmup_steps:
            steps_left = float(num_training_steps - current_step)
            decay_span = float(max(1, num_training_steps - num_warmup_steps))
            return max(0.0, steps_left / decay_span)
        return float(current_step) / float(max(1, num_warmup_steps))

    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# Model, focal loss and AdamW optimizer for the residual classifier
model = ImprovedCodeBERTClassifier(input_dim=X_train.shape[1]).to(device)
criterion = FocalLoss(alpha=0.25, gamma=2.0)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=5e-5)

# Linear warmup followed by linear decay, stepped once per epoch
total_epochs = 350  # Increased from 250 to give more training time
warmup_steps = 15
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_epochs,
)

# Train/validation data created directly on the device; labels are columns to match the (N, 1) logits
X_train_torch = torch.tensor(X_train, dtype=torch.float32, device=device)
y_train_torch = torch.tensor(y_train, dtype=torch.float32, device=device).unsqueeze(1)
X_val_torch = torch.tensor(X_val, dtype=torch.float32, device=device)
y_val_torch = torch.tensor(y_val, dtype=torch.float32, device=device).unsqueeze(1)

# Training loop with early stopping
import copy  # needed to snapshot the best weights independently of the live model

best_val_loss = float('inf')
patience = 20  # epochs without validation improvement tolerated before stopping
patience_counter = 0
best_model_state = None

for epoch in range(total_epochs):
    # Training: one full-batch step per epoch
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_torch)
    loss = criterion(outputs, y_train_torch)
    loss.backward()

    # Gradient clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()
    scheduler.step()

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_torch)
        val_loss = criterion(val_outputs, y_val_torch)
        val_preds = torch.sigmoid(val_outputs) > 0.5
        val_accuracy = (val_preds == y_val_torch).float().mean().item()

    print(f"Epoch {epoch+1}/{total_epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}, Val Accuracy: {val_accuracy:.4f}")

    # Early stopping check (compare plain floats rather than tensors)
    current_val_loss = val_loss.item()
    if current_val_loss < best_val_loss:
        best_val_loss = current_val_loss
        patience_counter = 0
        # state_dict() returns references to the live parameter tensors and
        # dict.copy() is shallow, so a deep copy is required; otherwise later
        # optimizer steps would silently overwrite the saved "best" weights.
        best_model_state = copy.deepcopy(model.state_dict())
        print(f"New best model saved! Validation Loss: {best_val_loss:.4f}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs")
            break

# Load best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print("Loaded best model based on validation loss")

def _threshold_logits(logits):
    """Turn logits into boolean predictions at probability 0.5."""
    return torch.sigmoid(logits) > 0.5


# Final evaluation on both partitions, in eval mode and without gradient tracking
model.eval()
with torch.no_grad():
    train_outputs = model(X_train_torch)
    val_outputs = model(X_val_torch)

    train_preds = _threshold_logits(train_outputs)
    val_preds = _threshold_logits(val_outputs)

    train_accuracy = (train_preds == y_train_torch).float().mean().item()
    val_accuracy = (val_preds == y_val_torch).float().mean().item()

    # Validation probabilities as a numpy array, for the calibration curve
    val_probs = torch.sigmoid(val_outputs).cpu().numpy()

print(f"\nFinal Results:")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Code to plot new calibration curve
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

def plot_calibration_curve(y_true, y_prob):
    """Draw a 10-bin reliability diagram with a histogram of the probabilities below it.

    The legend label reads the module-level ``val_accuracy``, so that variable
    must already be defined when this function is called.

    Args:
        y_true: 1-D array of 0/1 labels.
        y_prob: 1-D array of predicted probabilities for the positive class.
    """
    plt.figure(figsize=(8, 8))
    curve_ax = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    hist_ax = plt.subplot2grid((3, 1), (2, 0))

    # Diagonal reference: predicted probability equals observed frequency.
    curve_ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    fraction_of_positives, mean_predicted_value = calibration_curve(
        y_true, y_prob, n_bins=10
    )
    curve_ax.plot(
        mean_predicted_value,
        fraction_of_positives,
        "s-",
        label=f"Model (Accuracy: {val_accuracy:.3f})",
    )
    curve_ax.set(
        ylabel="Fraction of positives",
        ylim=[-0.05, 1.05],
        title="Calibration Curve",
    )
    curve_ax.legend(loc="lower right")

    # How many predictions fall into each probability bin.
    hist_ax.hist(y_prob, range=(0, 1), bins=10, histtype="step", lw=2)
    hist_ax.set(xlabel="Mean predicted value", ylabel="Count")

    plt.tight_layout()
    plt.show()

# Plot the calibration curve for the validation set
plot_calibration_curve(y_val.ravel(), val_probs.ravel())

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Feature dimensionality reduction
def reduce_dimensions(X_train, X_val, n_components=150):
    """Standardise the features and project them onto ``n_components`` principal components.

    Both the scaler and the PCA are fit on ``X_train`` only and then applied
    to ``X_val``.

    Returns:
        (X_train_pca, X_val_pca, pca, scaler): the reduced matrices followed by
        the fitted PCA and StandardScaler objects.
    """
    print(f"Original feature dimensions: {X_train.shape[1]}")

    # Standardise using training statistics only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Fit the projection on the training rows, then reuse it for validation.
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_val_pca = pca.transform(X_val_scaled)

    explained_var = pca.explained_variance_ratio_.sum()
    print(f"Reduced to {n_components} components, explaining {explained_var:.4f} of variance")

    return X_train_pca, X_val_pca, pca, scaler

# Advanced regularization: Mixup
def mixup_data(x, y, alpha=0.2):
    """Generate mixup samples and the two target sets they were blended from.

    A mixing weight ``lam`` is drawn from Beta(alpha, alpha) (or fixed to 1
    when ``alpha`` is not positive). Each row of ``x`` is blended with a
    randomly chosen other row of the same batch.

    Args:
        x: Batch of inputs; the first dimension is the batch dimension.
        y: Targets aligned with ``x`` along the first dimension.
        alpha: Beta distribution parameter; ``<= 0`` disables mixing.

    Returns:
        (mixed_x, y_a, y_b, lam): the blended inputs, the original targets,
        the targets of the partner rows, and the mixing weight.
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size(0)
    # Keep the permutation on the same device as the batch instead of relying
    # on a module-level ``device`` global, so indexing works wherever x lives.
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Blend the loss against both mixup target sets, weighting ``y_a`` by ``lam`` and ``y_b`` by ``1 - lam``."""
    loss_primary = criterion(pred, y_a)
    loss_partner = criterion(pred, y_b)
    return lam * loss_primary + (1 - lam) * loss_partner

# Base model with increased regularization
class RegularizedModel(nn.Module):
    """MLP of Linear -> BatchNorm1d -> ReLU -> Dropout stages with a single-logit head.

    Every forward pass also records a penalty equal to ``l2_reg`` times the
    sum of the norms of all parameters; read it with ``get_l2_loss()``.
    """

    def __init__(self, input_dim, hidden_dims=[512, 256, 128], dropout_rate=0.5):
        super().__init__()

        self.layers = nn.ModuleList()

        # One four-module stage per hidden width, chained from the input width.
        widths = [input_dim, *hidden_dims]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            self.layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ])

        # Output layer
        self.output = nn.Linear(hidden_dims[-1], 1)

        # Coefficient of the norm penalty computed in forward()
        self.l2_reg = 1e-4

    def forward(self, x):
        """Return one logit per row and refresh the stored norm penalty."""
        hidden = x
        for layer in self.layers:
            hidden = layer(hidden)
        logits = self.output(hidden)

        # Sum of the (unsquared) norms of every parameter tensor.
        norm_total = sum((torch.norm(param) for param in self.parameters()), 0.0)

        # Kept on the module so the training loop can add it to the loss.
        self.l2_loss = self.l2_reg * norm_total

        return logits

    def get_l2_loss(self):
        """Return the penalty stored by the most recent forward pass."""
        return self.l2_loss

# Different model architectures for ensemble
class WideModel(RegularizedModel):
    """Ensemble member with the widest hidden stack (1024-512-256) and dropout 0.5."""

    def __init__(self, input_dim):
        super().__init__(
            input_dim,
            hidden_dims=[1024, 512, 256],
            dropout_rate=0.5,
        )

class DeepModel(RegularizedModel):
    """Ensemble member with four hidden stages (512-256-128-64) and dropout 0.4."""

    def __init__(self, input_dim):
        super().__init__(
            input_dim,
            hidden_dims=[512, 256, 128, 64],
            dropout_rate=0.4,
        )

class CompactModel(RegularizedModel):
    """Smallest ensemble member: two hidden stages (256-128) and dropout 0.3."""

    def __init__(self, input_dim):
        super().__init__(
            input_dim,
            hidden_dims=[256, 128],
            dropout_rate=0.3,
        )

# Ensemble model wrapper
class EnsembleModel:
    """Average the sigmoid probabilities of several already-trained models."""

    def __init__(self, models):
        self.models = models

    def predict(self, x):
        """Return the element-wise mean of every member's sigmoid output for ``x``.

        Each member is switched to eval mode and run without gradient tracking.
        """
        member_probs = []
        with torch.no_grad():
            for member in self.models:
                member.eval()
                member_probs.append(torch.sigmoid(member(x)))

        # Stack along a new leading axis and average it away.
        return torch.mean(torch.stack(member_probs), dim=0)

# Training function with SWA and Mixup
def train_model(model, X_train, y_train, X_val, y_val, criterion, optimizer, 
                scheduler=None, epochs=200, batch_size=128, 
                patience=25, use_mixup=True, alpha=0.2):
    """Train ``model`` with mini-batches, optional mixup, early stopping and SWA.

    Per epoch: shuffle the training rows, run one optimisation step per batch
    (mixup is applied during the first 80% of ``epochs`` when enabled), then
    evaluate on the validation set. From epoch ``epochs // 2`` onwards the
    weights are accumulated into a stochastic-weight-averaged (SWA) copy and
    the SWA learning-rate schedule replaces ``scheduler``. Training stops
    early after ``patience`` epochs without a better validation loss.

    After training, the weights with the best validation loss are restored.
    If SWA ran, the averaged model's batch-norm statistics are recomputed and
    it replaces the restored weights only when its validation accuracy is
    strictly higher.

    Args:
        model: Network producing one logit per row.
        X_train, y_train, X_val, y_val: Tensors, or array-likes that are
            converted to float32 tensors on the model's device (targets are
            reshaped to a column).
        criterion: Loss taking ``(logits, targets)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        scheduler: Optional LR scheduler stepped once per epoch before SWA
            starts. ``ReduceLROnPlateau`` is stepped with the validation loss.
        epochs: Maximum number of epochs.
        batch_size: Rows per optimisation step.
        patience: Early-stopping patience in epochs.
        use_mixup: Whether to apply mixup.
        alpha: Beta parameter forwarded to ``mixup_data``.

    Returns:
        (model, train_losses, val_losses, val_accuracies) with one list entry
        per completed epoch.
    """
    import copy  # local import: needed for an independent snapshot of the best weights

    # Array inputs are placed wherever the model's parameters live.
    model_device = next(model.parameters()).device

    def _to_tensor(data, as_column=False):
        """Leave tensors untouched; convert anything else to float32 on the model device."""
        if isinstance(data, torch.Tensor):
            return data
        tensor = torch.tensor(data, dtype=torch.float32)
        if as_column:
            tensor = tensor.unsqueeze(1)
        return tensor.to(model_device)

    X_train = _to_tensor(X_train)
    y_train = _to_tensor(y_train, as_column=True)
    X_val = _to_tensor(X_val)
    y_val = _to_tensor(y_val, as_column=True)

    def _evaluate(net):
        """Return (validation loss, validation accuracy) of ``net`` in eval mode."""
        net.eval()
        with torch.no_grad():
            logits = net(X_val)
            loss_value = criterion(logits, y_val).item()
            predictions = (torch.sigmoid(logits) > 0.5).float()
            accuracy = (predictions == y_val).float().mean().item()
        return loss_value, accuracy

    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    # For SWA (Stochastic Weight Averaging)
    swa_start = epochs // 2
    swa_model = torch.optim.swa_utils.AveragedModel(model)
    swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=0.0005)
    swa_updated = False  # becomes True once at least one epoch fed the average

    train_losses = []
    val_losses = []
    val_accuracies = []

    # Calculate number of batches
    num_train_samples = X_train.shape[0]
    num_batches = (num_train_samples + batch_size - 1) // batch_size

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0

        # Fresh random row order each epoch, kept on the same device as the data
        indices = torch.randperm(num_train_samples).to(X_train.device)

        for i in range(num_batches):
            # Slicing clamps at the end, so the last batch may be shorter
            batch_indices = indices[i * batch_size:(i + 1) * batch_size]
            batch_x = X_train[batch_indices]
            batch_y = y_train[batch_indices]

            optimizer.zero_grad()

            # Apply mixup if enabled
            if use_mixup and epoch < epochs * 0.8:  # Use mixup for first 80% of training
                batch_x, targets_a, targets_b, lam = mixup_data(batch_x, batch_y, alpha)
                outputs = model(batch_x)
                loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)
            else:
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)

            # Add the model's own regularization term when it exposes one
            if hasattr(model, 'get_l2_loss'):
                loss = loss + model.get_l2_loss()

            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            epoch_loss += loss.item()

        # Average loss over batches (guarded so an empty training set cannot divide by zero)
        epoch_loss /= max(1, num_batches)
        train_losses.append(epoch_loss)

        # Validation
        val_loss, val_accuracy = _evaluate(model)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        # Scheduling happens after validation because ReduceLROnPlateau needs the metric
        if epoch >= swa_start:
            swa_model.update_parameters(model)
            swa_scheduler.step()
            swa_updated = True
        elif scheduler is not None:
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(val_loss)
            else:
                scheduler.step()

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # state_dict() hands back references to the live tensors and
            # dict.copy() is shallow, so deep-copy to freeze this snapshot.
            best_model_state = copy.deepcopy(model.state_dict())
            print(f"New best model saved! Validation Loss: {best_val_loss:.4f}")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs")
                break

    # Load the best model state
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print("Loaded best model based on validation loss")

    # If we used SWA, finalize the SWA model
    if swa_updated:
        # Recompute batch-norm statistics for the averaged weights by streaming
        # the training rows through it. Single-row batches are skipped because
        # BatchNorm1d cannot compute batch statistics from one sample.
        bn_batches = [
            X_train[start:start + batch_size]
            for start in range(0, num_train_samples, batch_size)
            if min(start + batch_size, num_train_samples) - start > 1
        ]
        torch.optim.swa_utils.update_bn(bn_batches, swa_model)

        # Evaluate final SWA model
        swa_loss, swa_accuracy = _evaluate(swa_model)
        print(f"SWA Model - Val Loss: {swa_loss:.4f}, Val Accuracy: {swa_accuracy:.4f}")

        # Compare against the weights that are actually loaded (the restored
        # best checkpoint), not against whatever the last epoch produced.
        _, restored_accuracy = _evaluate(model)
        if swa_accuracy > restored_accuracy:
            print("Using SWA model as it performed better")
            # Copy parameters AND buffers so the refreshed batch-norm statistics come along
            model.load_state_dict(swa_model.module.state_dict())

    return model, train_losses, val_losses, val_accuracies

# Main execution
def run_ensemble_training(X_train, y_train, X_val, y_val):
    """Train the Wide, Deep and Compact models on PCA-reduced features and ensemble them.

    Steps: reduce both feature matrices to 150 principal components, train the
    three architectures one after another (each with its own optimizer,
    scheduler and batch size), print per-model and ensemble accuracies, and
    plot the ensemble's calibration curve on the validation set.

    Returns:
        (ensemble, models, pca, scaler): the averaging wrapper, the list of
        trained models, and the fitted PCA / scaler used for the reduction.
    """
    # Reduce dimensions
    X_train_reduced, X_val_reduced, pca, scaler = reduce_dimensions(X_train, X_val, n_components=150)

    def _on_device(array, as_column=False):
        """Copy ``array`` into a float32 tensor on ``device`` (optionally as a column)."""
        tensor = torch.tensor(array, dtype=torch.float32)
        if as_column:
            tensor = tensor.unsqueeze(1)
        return tensor.to(device)

    def _accuracy(probabilities, targets):
        """Share of rows whose 0.5-thresholded probability equals the target."""
        return ((probabilities > 0.5).float() == targets).float().mean().item()

    X_train_torch = _on_device(X_train_reduced)
    y_train_torch = _on_device(y_train, as_column=True)
    X_val_torch = _on_device(X_val_reduced)
    y_val_torch = _on_device(y_val, as_column=True)

    # Shared by every ensemble member
    criterion = nn.BCEWithLogitsLoss()
    input_dim = X_train_reduced.shape[1]

    # (banner, architecture, optimizer factory, scheduler factory, per-model fit options)
    member_specs = [
        (
            "\n=== Training Wide Model ===",
            WideModel,
            lambda params: optim.AdamW(params, lr=0.001, weight_decay=1e-4),
            lambda opt: optim.lr_scheduler.CosineAnnealingLR(opt, T_max=100, eta_min=1e-5),
            dict(batch_size=128, use_mixup=True, alpha=0.2),
        ),
        (
            "\n=== Training Deep Model ===",
            DeepModel,
            lambda params: optim.AdamW(params, lr=0.002, weight_decay=1e-5),
            lambda opt: optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=10),
            dict(batch_size=64, use_mixup=True, alpha=0.3),
        ),
        (
            "\n=== Training Compact Model ===",
            CompactModel,
            lambda params: optim.Adam(params, lr=0.003, weight_decay=1e-6),
            lambda opt: optim.lr_scheduler.StepLR(opt, step_size=30, gamma=0.5),
            dict(batch_size=256, use_mixup=False),
        ),
    ]

    # Each member is built and trained to completion before the next one is created.
    models = []
    for banner, architecture, make_optimizer, make_scheduler, fit_options in member_specs:
        print(banner)
        member = architecture(input_dim).to(device)
        member_optimizer = make_optimizer(member.parameters())
        member_scheduler = make_scheduler(member_optimizer)
        member, _, _, _ = train_model(member, X_train_torch, y_train_torch, X_val_torch, y_val_torch,
                                      criterion, member_optimizer, member_scheduler,
                                      epochs=200, patience=25, **fit_options)
        models.append(member)

    # Create ensemble
    ensemble = EnsembleModel(models)

    # Evaluate individual models
    for i, model in enumerate(models):
        model.eval()
        with torch.no_grad():
            train_acc = _accuracy(torch.sigmoid(model(X_train_torch)), y_train_torch)
            val_acc = _accuracy(torch.sigmoid(model(X_val_torch)), y_val_torch)

        print(f"Model {i+1} - Train Accuracy: {train_acc:.4f}, Val Accuracy: {val_acc:.4f}")

    # Evaluate ensemble (predict() already returns probabilities)
    ensemble_train_acc = _accuracy(ensemble.predict(X_train_torch), y_train_torch)

    ensemble_val_pred = ensemble.predict(X_val_torch)
    ensemble_val_acc = _accuracy(ensemble_val_pred, y_val_torch)

    print(f"\nEnsemble - Train Accuracy: {ensemble_train_acc:.4f}, Val Accuracy: {ensemble_val_acc:.4f}")

    # Generate calibration curve for ensemble
    val_probs = ensemble_val_pred.cpu().numpy()

    plot_calibration_curve(y_val.ravel(), val_probs.ravel())

    return ensemble, models, pca, scaler

# Function to plot calibration curve
def plot_calibration_curve(y_true, y_prob):
    """Draw a 10-bin reliability diagram with a histogram of the probabilities below it.

    Args:
        y_true: 1-D array of 0/1 labels.
        y_prob: 1-D array of predicted probabilities for the positive class.
    """
    plt.figure(figsize=(8, 8))
    curve_ax = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    hist_ax = plt.subplot2grid((3, 1), (2, 0))

    # Diagonal reference: predicted probability equals observed frequency.
    curve_ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    fraction_of_positives, mean_predicted_value = calibration_curve(
        y_true, y_prob, n_bins=10
    )
    curve_ax.plot(
        mean_predicted_value,
        fraction_of_positives,
        "s-",
        label=f"Ensemble Model",
    )
    curve_ax.set(
        ylabel="Fraction of positives",
        ylim=[-0.05, 1.05],
        title="Calibration Curve",
    )
    curve_ax.legend(loc="lower right")

    # How many predictions fall into each probability bin.
    hist_ax.hist(y_prob, range=(0, 1), bins=10, histtype="step", lw=2)
    hist_ax.set(xlabel="Mean predicted value", ylabel="Count")

    plt.tight_layout()
    plt.show()

# Execute the ensemble training
ensemble, models, pca, scaler = run_ensemble_training(X_train, y_train, X_val, y_val)

# The reduced validation matrix is local to run_ensemble_training, so rebuild it
# here from the fitted scaler and PCA it returned (same transform order as training).
X_val_reduced = pca.transform(scaler.transform(X_val))
X_val_reduced_torch = torch.tensor(X_val_reduced, dtype=torch.float32).to(device)
y_val_column = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1).to(device)

# Final evaluation and summary
print("\nFinal Results:")
for i, model in enumerate(models):
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_reduced_torch)
        val_preds = (torch.sigmoid(val_outputs) > 0.5).float()
        val_acc = (val_preds == y_val_column).float().mean().item()
    print(f"Model {i+1} Validation Accuracy: {val_acc:.4f}")

# The ensemble already returns averaged probabilities, so only the threshold is applied.
ensemble_val_pred = ensemble.predict(X_val_reduced_torch)
ensemble_val_preds = (ensemble_val_pred > 0.5).float()
ensemble_val_acc = (ensemble_val_preds == y_val_column).float().mean().item()
print(f"Ensemble Validation Accuracy: {ensemble_val_acc:.4f}")