In [1]:
import os
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from collections import Counter
import matplotlib.pyplot as plt

In [2]:
# --- Setup
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
OUT_DIR = "/content/flight_optuna_best"
os.makedirs(OUT_DIR, exist_ok=True)

# --- Synthetic balanced dataset
N_SEQ, TIMESTEPS, FEATURES = 8000, 60, 7
def generate_sequence(is_incident, timesteps=60):
    t = np.linspace(0,1,timesteps)
    alt = 10000 + 200*np.sin(2*np.pi*t) + np.random.normal(0,8,timesteps)
    spd = 250 + 8*np.cos(2*np.pi*t) + np.random.normal(0,3,timesteps)
    pitch, roll, yaw = np.random.normal(0,0.8,timesteps), np.random.normal(0,0.8,timesteps), np.random.normal(0,0.5,timesteps)
    eng, thrust = 200+np.random.normal(0,2,timesteps), 70+np.random.normal(0,2,timesteps)
    seq = np.vstack([alt,spd,pitch,roll,yaw,eng,thrust]).T
    weather, phase, maint = random.choice(["clear","rain","storm","fog","turbulence"]), random.choice(["cruise","takeoff","landing","climb","descent"]), np.random.randint(0,3)
    if is_incident:
        pat = random.choice(["rapid_descent","stall","engine_fail"])
        if pat=="rapid_descent": seq[:,0]-=np.linspace(0,np.random.randint(1000,1500),timesteps)
        if pat=="stall": seq[:,1]-=np.linspace(0,np.random.randint(60,120),timesteps); seq[:,2]+=np.linspace(0,np.random.randint(4,8),timesteps)
        if pat=="engine_fail": seq[:,5]+=np.linspace(0,np.random.randint(100,150),timesteps); seq[:,6]-=np.linspace(0,np.random.randint(30,50),timesteps)
        maint+=np.random.randint(1,2); weather=random.choice(["storm","turbulence","fog"])
    return seq, weather, phase, maint, int(is_incident)

seqs, metas, labels = [], [], []
for i in tqdm(range(N_SEQ)):
    seq,w,p,m,y = generate_sequence(i>=N_SEQ//2)
    seqs.append(seq); metas.append((w,p,m)); labels.append(y)
seqs=np.array(seqs); metas=pd.DataFrame(metas,columns=["weather","phase","maint"]); labels=np.array(labels)

100%|██████████| 8000/8000 [00:01<00:00, 6964.44it/s]


In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [4]:
def train_epoch(model, dataloader, criterion, optimizer, device, accumulation_steps=4):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for i, (sequences, metadata, labels) in enumerate(dataloader):
        sequences = sequences.to(device)
        metadata = metadata.to(device)
        labels = labels.to(device).unsqueeze(1)
        
        # forward pass
        outputs = model(sequences, metadata)
        loss = criterion(outputs, labels)
        
        # normalize loss by accumulation steps
        loss = loss / accumulation_steps
        loss.backward()
        
        # update weights every accumulation_steps
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        
        total_loss += loss.item() * accumulation_steps  # scale back the loss
        
        # Calculate accuracy
        predictions = (torch.sigmoid(outputs) > 0.5).float()
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

    return total_loss / len(dataloader), correct / total

In [5]:
def validate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.inference_mode():
        for sequences, metadata, labels in dataloader:
            sequences = sequences.to(device)
            metadata = metadata.to(device)
            labels = labels.to(device).unsqueeze(1)
            
            outputs = model(sequences, metadata)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            
            probs = torch.sigmoid(outputs)
            predictions = (probs > 0.5).float()
            
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    return total_loss / len(dataloader), correct / total

In [6]:
N, T, F = seqs.shape
print(f"Data shape: {N} samples, {T} timesteps, {F} features")

Data shape: 8000 samples, 60 timesteps, 7 features


In [7]:
# Preprocess metadata
cat_cols = ["weather", "phase"]
num_cols = ["maint"]

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ]
)

In [9]:
X_meta_processed = preprocessor.fit_transform(metas)

if hasattr(X_meta_processed, 'toarray'):
    X_meta_processed = X_meta_processed.toarray()
    
num_meta_features = X_meta_processed.shape[1]

In [10]:
# Train/validation split
X_seq_train, X_seq_val, X_meta_train, X_meta_val, y_train, y_val = train_test_split(
    seqs, X_meta_processed, labels, 
    test_size=0.2, 
    stratify=labels, 
    random_state=SEED
)

print(f"Training: {len(X_seq_train)} samples")
print(f"Validation: {len(X_seq_val)} samples")

Training: 6400 samples
Validation: 1600 samples


In [12]:
from transformer import AviationTransformer

# Create model
model = AviationTransformer(
    num_features=F,
    num_meta_features=num_meta_features,
    d_model=128,
    nhead=8,
    num_layers=4,
    dim_feedforward=512,
    dropout=0.15,
    max_seq_len=T
).to(device)

In [14]:
def find_optimal_batch_size(model, device, max_batch=512):
    """
    Binary search for the largest batch size that fits in memory,
    using the model's actual metadata dimension.
    """
    # Extract metadata dimension from model
    # meta_fc is: Linear(in_features=num_meta_features, out_features=64)
    num_meta = model.meta_fc[0].in_features

    batch_size = 32
    while batch_size <= max_batch:
        try:
            # Create dummy inputs with the correct shapes
            dummy_seq  = torch.randn(batch_size, 60, 7, device=device)
            dummy_meta = torch.randn(batch_size, num_meta, device=device)

            outputs = model(dummy_seq, dummy_meta)
            loss = outputs.sum()
            loss.backward()
            return batch_size  # success

        except RuntimeError as e:
            if 'out of memory' in str(e):
                # Too large: try half
                torch.cuda.empty_cache()
                batch_size //= 2
                if batch_size == 0:
                    raise RuntimeError("Not enough GPU memory even for batch_size=1")
            else:
                # Propagate other errors
                raise e

    return batch_size


In [15]:
optimal_batch = find_optimal_batch_size(model, device)
print(f"Optimal batch size: {optimal_batch}")

Optimal batch size: 32


In [16]:
# Compile the model with torch 2.0+
if hasattr(torch, 'compile'):
    model = torch.compile(
        model,
        mode='reduce-overhead',  # Options: 'default', 'reduce-overhead', 'max-autotune'
        fullgraph=True,          # Try to compile entire model as one graph
        dynamic=False            # Assumes fixed input shapes (faster)
    )
    print("Model compiled with torch.compile()")

Model compiled with torch.compile()


In [17]:
from dataset import AviationDataset

# Create datasets and dataloaders
train_dataset = AviationDataset(X_seq_train, X_meta_train, y_train)
val_dataset = AviationDataset(X_seq_val, X_meta_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=optimal_batch, shuffle=True, 
                          num_workers=4, pin_memory=True, prefetch_factor=2, 
                          persistent_workers=True)
val_loader = DataLoader(val_dataset, batch_size=optimal_batch, shuffle=False, 
                        num_workers=2, pin_memory=True, prefetch_factor=2, 
                        persistent_workers=True)

In [18]:
print("\nModel Architecture:")
print(model)
print(f"\nTotal parameters: {sum(p.numel() for p in model.parameters()):,}")


Model Architecture:
OptimizedModule(
  (_orig_mod): AviationTransformer(
    (input_projection): Linear(in_features=7, out_features=128, bias=True)
    (pos_encoder): PositionalEncoding(
      (dropout): Dropout(p=0.15, inplace=False)
    )
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
          )
          (linear1): Linear(in_features=128, out_features=512, bias=True)
          (dropout): Dropout(p=0.15, inplace=False)
          (linear2): Linear(in_features=512, out_features=128, bias=True)
          (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.15, inplace=False)
          (dropout2): Dropout(p=0.15, inplace=False)
        )
      )
    )
    (m

In [19]:
# Optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=0.001,
                              weight_decay=1e-4,
                              fused=True)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=8
)

In [20]:
# Calculate class weights
class_counts = Counter(y_train)
pos_weight = torch.tensor([class_counts[0] / class_counts[1]]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

In [21]:
os.makedirs('models', exist_ok=True)

In [22]:
# Training loop
num_epochs = 100
best_val_loss = float('inf')
patience = 15
patience_counter = 0

train_losses = []
val_losses = []
train_accs = []
val_accs = []

print("\nStarting training...")
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(
        model, val_loader, criterion, device
    )
    
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accs.append(train_acc)
    val_accs.append(val_acc)
    
    scheduler.step(val_loss)
    
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'models/best_aviation_transformer.pt')
        print("  → Model saved!")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"\nEarly stopping after {epoch+1} epochs")
            break



Starting training...


cudagraph partition due to non gpu ops
cudagraph partition due to non gpu ops. Found from : 
   File "c:\Users\Marc\Desktop\Programming\LABS\FDL-REPORT\transformer.py", line 78, in forward
    meta_features = self.meta_fc(meta_input)  # (batch, 64)
  File "c:\Users\Marc\anaconda3\envs\casenv\lib\site-packages\torch\nn\modules\container.py", line 250, in forward
    input = module(input)
  File "c:\Users\Marc\anaconda3\envs\casenv\lib\site-packages\torch\nn\modules\linear.py", line 134, in forward
    return F.linear(input, self.weight, self.bias)

cudagraph partition due to non gpu ops. Found from : 
   File "c:\Users\Marc\Desktop\Programming\LABS\FDL-REPORT\transformer.py", line 66, in forward
    x = self.input_projection(seq_input)  # (batch, seq_len, d_model)
  File "c:\Users\Marc\anaconda3\envs\casenv\lib\site-packages\torch\nn\modules\linear.py", line 134, in forward
    return F.linear(input, self.weight, self.bias)

cudagraph partition due to non gpu ops. Found from : 
   File 

InductorError: RuntimeError: Compiler: cl is not found.

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


In [None]:
# Load best model
model.load_state_dict(torch.load('best_aviation_transformer.pt'))

In [None]:
import numpy as np
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score
)

def full_validate(model, dataloader, criterion, device):
    # Run your existing validate to get loss and accuracy
    val_loss, val_acc = validate(model, dataloader, criterion, device)

    # Now collect all labels, predictions, and probabilities
    model.eval()
    all_labels, all_preds, all_probs = [], [], []
    with torch.inference_mode():
        for seqs, metas, labels in dataloader:
            seqs, metas = seqs.to(device), metas.to(device)
            labels = labels.to(device).unsqueeze(1)
            outputs = model(seqs, metas)
            probs = torch.sigmoid(outputs)
            preds = (probs > 0.5).float()

            all_labels.extend(labels.cpu().numpy().flatten())
            all_preds.extend(preds.cpu().numpy().flatten())
            all_probs.extend(probs.cpu().numpy().flatten())

    # Convert to numpy arrays
    y_true = np.array(all_labels)
    y_pred = np.array(all_preds)
    y_prob = np.array(all_probs)

    # Compute standard metrics
    acc   = accuracy_score(y_true, y_pred)
    prec  = precision_score(y_true, y_pred, zero_division=0)
    rec   = recall_score(y_true, y_pred, zero_division=0)
    f1    = f1_score(y_true, y_pred, zero_division=0)
    roc   = roc_auc_score(y_true, y_prob) if len(np.unique(y_true))>1 else None
    prauc = average_precision_score(y_true, y_prob)

    # Confusion matrix and classification report
    cm     = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred, digits=4)

    # Print results
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Accuracy:        {acc:.4f}")
    print(f"Precision:       {prec:.4f}")
    print(f"Recall:          {rec:.4f}")
    print(f"F1 Score:        {f1:.4f}")
    if roc is not None:
        print(f"ROC AUC:         {roc:.4f}")
    print(f"PR AUC:          {prauc:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    print("\nClassification Report:")
    print(report)

    return {
        "loss": val_loss,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "roc_auc": roc,
        "pr_auc": prauc,
        "confusion_matrix": cm,
        "classification_report": report
    }


In [None]:
metrics = full_validate(model, val_loader, criterion, device)