In [None]:
import os
import torch
import pandas as pd
import matplotlib.pyplot as plt
import logging
from tqdm import tqdm
from torch_geometric.loader import DataLoader
from torch.utils.data import random_split
from functools import partial
import optuna
import gc
from typing import Literal

# Load utility functions from cloned repository
from src.loadData import GraphDataset
from src.utils import set_seed
from src.models import GNN


# Set the random seed once, right after the imports and before any model,
# data split or loader is created, using set_seed's default arguments.
# NOTE(review): which RNGs get seeded is decided inside src.utils.set_seed —
# confirm it covers torch (CPU and CUDA) as well as Python/numpy.
set_seed()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def add_zeros(data):
    """Give a graph placeholder node features.

    Replaces ``data.x`` with a 1-D ``torch.long`` tensor of zeros that has one
    entry per node (``data.num_nodes``).

    Args:
        data: Object exposing ``num_nodes``; it is modified in place.

    Returns:
        The same ``data`` object, so the function can be used as a transform.
    """
    node_count = data.num_nodes
    placeholder_features = torch.zeros(node_count, dtype=torch.long)
    data.x = placeholder_features
    return data

In [3]:
def train(
    data_loader,
    model,
    optimizer,
    criterion,
    device,
    save_checkpoints,
    checkpoint_path,
    current_epoch,
):
    """Run one optimisation pass over ``data_loader``.

    Args:
        data_loader: Iterable of batches; each batch must support ``.to(device)``
            and expose the labels as ``.y``. ``len(data_loader)`` is used as the
            number of batches.
        model: Module called as ``model(batch)`` that returns per-class scores.
        optimizer: Optimizer stepping the parameters of ``model``.
        criterion: Callable ``criterion(output, labels)`` returning a scalar loss.
        device: Device every batch is moved to.
        save_checkpoints: When truthy, the model's ``state_dict`` is saved once
            the pass is finished.
        checkpoint_path: Path prefix of the checkpoint file; the suffix
            ``_epoch_<current_epoch + 1>.pth`` is appended.
        current_epoch: Zero-based epoch index (only used for the file name).

    Returns:
        ``(mean_batch_loss, accuracy)`` where the loss is the sum of the
        per-batch losses divided by the number of batches and the accuracy is
        the fraction of samples whose arg-max prediction equals the label.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    for batch in data_loader:
        batch = batch.to(device)

        optimizer.zero_grad()
        scores = model(batch)
        batch_loss = criterion(scores, batch.y)
        batch_loss.backward()
        optimizer.step()

        # Bookkeeping uses the scores computed before the parameter update.
        loss_sum += batch_loss.item()
        hits = scores.argmax(dim=1) == batch.y
        n_correct += hits.sum().item()
        n_seen += batch.y.size(0)

    # Persist the weights reached at the end of this pass, if requested.
    if save_checkpoints:
        checkpoint_file = f"{checkpoint_path}_epoch_{current_epoch + 1}.pth"
        torch.save(model.state_dict(), checkpoint_file)
        print(f"Checkpoint saved at {checkpoint_file}")

    mean_batch_loss = loss_sum / len(data_loader)
    accuracy = n_correct / n_seen
    return mean_batch_loss, accuracy

In [4]:
def evaluate(data_loader, model, device, calculate_accuracy=False):
    """Run ``model`` over ``data_loader`` without gradient tracking.

    Args:
        data_loader: Iterable of batches supporting ``.to(device)``; labels are
            read from ``.y`` only when ``calculate_accuracy`` is truthy.
        model: Module called as ``model(batch)`` that returns per-class scores.
        device: Device every batch is moved to.
        calculate_accuracy: Selects the return mode (see below).

    Returns:
        * ``calculate_accuracy`` truthy: ``(mean_batch_loss, accuracy)``, the
          loss being a fresh ``CrossEntropyLoss`` summed over batches and
          divided by the number of batches.
        * otherwise: a flat list with the arg-max class of every sample, in
          loader order (elements are NumPy scalars).
    """
    model.eval()
    loss_fn = torch.nn.CrossEntropyLoss()
    n_correct = 0
    n_seen = 0
    loss_sum = 0
    predicted_labels = []

    with torch.no_grad():
        for batch in data_loader:
            batch = batch.to(device)
            scores = model(batch)
            labels = scores.argmax(dim=1)

            if not calculate_accuracy:
                # Prediction mode: only collect the labels.
                predicted_labels.extend(labels.cpu().numpy())
                continue

            n_correct += (labels == batch.y).sum().item()
            n_seen += batch.y.size(0)
            loss_sum += loss_fn(scores, batch.y).item()

    if not calculate_accuracy:
        return predicted_labels

    accuracy = n_correct / n_seen
    return loss_sum / len(data_loader), accuracy

In [5]:
def save_predictions(predictions, test_path):
    """Write predictions to ``<cwd>/submission/testset_<dir>.csv``.

    ``<dir>`` is the name of the directory that contains ``test_path``. The CSV
    has two columns: ``id`` (0, 1, 2, ... in prediction order) and ``pred``.

    Args:
        predictions: Sequence of predicted labels, one per test graph.
        test_path: Path of the test file; only its parent directory name is used.
    """
    # Name of the folder holding the test file, e.g. "A" for ".../A/test.json.gz".
    dataset_tag = os.path.basename(os.path.dirname(test_path))

    submission_folder = os.path.join(os.getcwd(), "submission")
    os.makedirs(submission_folder, exist_ok=True)
    output_csv_path = os.path.join(submission_folder, f"testset_{dataset_tag}.csv")

    submission = pd.DataFrame(
        {"id": list(range(len(predictions))), "pred": predictions}
    )
    submission.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")

In [6]:
def plot_training_progress(train_losses, train_accuracies, output_dir):
    """Save a two-panel figure with per-epoch loss and accuracy curves.

    The left panel shows ``train_losses`` and the right one ``train_accuracies``,
    both against the epoch number starting at 1. The figure is written to
    ``<output_dir>/training_progress.png`` (the directory is created if needed)
    and then closed.

    Args:
        train_losses: One loss value per epoch; its length defines the x axis.
        train_accuracies: One accuracy value per epoch.
        output_dir: Directory that receives ``training_progress.png``.
    """
    epochs = range(1, len(train_losses) + 1)

    # (subplot position, series, legend label, colour, y-axis label, title)
    panels = (
        (1, train_losses, "Training Loss", "blue", "Loss", "Training Loss per Epoch"),
        (
            2,
            train_accuracies,
            "Training Accuracy",
            "green",
            "Accuracy",
            "Training Accuracy per Epoch",
        ),
    )

    plt.figure(figsize=(12, 6))
    for position, series, label, color, y_label, title in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, series, label=label, color=color)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.title(title)

    # Save the figure inside output_dir, then release it.
    os.makedirs(output_dir, exist_ok=True)
    plt.tight_layout()
    target_file = os.path.join(output_dir, "training_progress.png")
    plt.savefig(target_file)
    plt.close()

In [7]:
class NoisyCrossEntropyLoss(torch.nn.Module):
    """Cross-entropy loss scaled by ``1 - p_noisy``.

    The per-sample weight used by this loss was written as
    ``(1 - p) + p * (1 - one_hot(targets).sum(dim=1))``. Every one-hot row sums
    to exactly 1, so the second term is always 0 and each sample receives the
    same weight ``1 - p``. The forward pass therefore applies that constant
    directly instead of materialising an ``(N, num_classes)`` one-hot tensor
    that cannot influence the result.

    NOTE(review): because the weight is constant, this loss does not treat
    individual (possibly mislabelled) samples differently — it is plain
    cross-entropy with a smaller scale. If per-sample down-weighting was the
    intent, a different weighting rule is needed.

    Args:
        p_noisy: Scalar ``p``; the mean cross-entropy is multiplied by ``1 - p``.
    """

    def __init__(self, p_noisy):
        super().__init__()
        self.p = p_noisy
        # Per-sample losses are needed so the weight is applied before averaging.
        self.ce = torch.nn.CrossEntropyLoss(reduction="none")

    def forward(self, logits, targets):
        """Return the mean of the per-sample cross-entropy times ``1 - p``.

        Args:
            logits: Class scores, one row per sample.
            targets: Class indices, one per sample.

        Returns:
            A scalar tensor.
        """
        losses = self.ce(logits, targets)
        return (losses * (1 - self.p)).mean()

In [None]:
def objective(
    trial,
    train_loader,
    val_loader,
    num_checkpoints,
    checkpoints_dir,
    run_name,
    best_model_path,
    logs_dir,
):
    """Optuna objective: train one GNN configuration and score it on validation data.

    The trial samples the GNN variant (``gin``/``gcn``, with or without a
    virtual node), dropout, number of layers, embedding size and number of
    epochs. The model is trained with Adam and cross-entropy, evaluated on
    ``val_loader`` after every epoch, and the best validation accuracy of the
    trial is returned.

    ``best_model_path`` is shared by every trial of a study. The weights stored
    there are only replaced when the current epoch beats both this trial's own
    best and the best value of all previously completed trials, so the file
    holds the best model of the whole study rather than of the latest trial.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        train_loader: Loader passed to ``train`` each epoch.
        val_loader: Loader passed to ``evaluate`` each epoch.
        num_checkpoints: Number of evenly spaced epochs at which ``train``
            saves a checkpoint.
        checkpoints_dir: Directory for the periodic checkpoints.
        run_name: Tag used in checkpoint file names and log messages.
        best_model_path: File receiving the best weights of the study.
        logs_dir: Directory under which ``train_plots`` and ``val_plots`` are
            written. Both figures come from ``plot_training_progress``, so the
            validation figure carries "Training ..." labels, and every trial
            overwrites the figures of the previous one.

    Returns:
        The highest validation accuracy observed during this trial.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logging.info("#" * 80)
    # Hyperparameter search space
    logging.info("Start case study with parameters:")
    gnn_type = trial.suggest_categorical(
        "gnn_type", ["gin", "gin-virtual", "gcn", "gcn-virtual"]
    )
    drop_ratio = trial.suggest_float("dropout", 0.0, 0.7)
    num_layers = trial.suggest_int("num_layers", 3, 6)
    embedding_dim = trial.suggest_categorical("embedding_dim", [64, 128, 300, 600])
    num_epochs = trial.suggest_int("num_epochs", 50, 200, step=50)

    logging.info(f"{gnn_type=}")
    logging.info(f"{drop_ratio=}")
    logging.info(f"{num_layers=}")
    logging.info(f"{embedding_dim=}")
    logging.info(f"{num_epochs=}")

    # Initialize model
    model = GNN(
        gnn_type="gin" if "gin" in gnn_type else "gcn",
        num_class=6,
        num_layer=num_layers,
        emb_dim=embedding_dim,
        drop_ratio=drop_ratio,
        virtual_node="virtual" in gnn_type,
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.CrossEntropyLoss()

    # Prepare checkpoints: 1-based epochs, evenly spread over the run.
    checkpoint_epochs = [
        int((i + 1) * num_epochs / num_checkpoints) for i in range(num_checkpoints)
    ]

    # Best value reached by earlier completed trials of the same study. Optuna
    # raises ValueError while no trial has completed; trial objects without a
    # ``study`` attribute are treated the same way.
    study = getattr(trial, "study", None)
    try:
        prior_study_best = study.best_value if study is not None else float("-inf")
    except ValueError:
        prior_study_best = float("-inf")

    best_val_accuracy = 0.0
    train_losses, train_accuracies = [], []
    val_losses, val_accuracies = [], []

    for epoch in tqdm(range(num_epochs), desc="Epoch"):
        train_loss, train_acc = train(
            train_loader,
            model,
            optimizer,
            criterion,
            device,
            save_checkpoints=(epoch + 1 in checkpoint_epochs),
            checkpoint_path=os.path.join(checkpoints_dir, f"model_{run_name}"),
            current_epoch=epoch,
        )

        val_loss, val_acc = evaluate(val_loader, model, device, calculate_accuracy=True)

        msg = (
            f"[{run_name}] Epoch {epoch + 1}/{num_epochs} | "
            f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | "
            f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}"
        )
        print(msg)
        logging.info(msg)

        train_losses.append(train_loss)
        train_accuracies.append(train_acc)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

        if val_acc > best_val_accuracy:
            best_val_accuracy = val_acc
            # Only overwrite the shared file when this is the best of the study.
            if val_acc > prior_study_best:
                torch.save(model.state_dict(), best_model_path)
                logging.info(f"[{run_name}] Best model updated at {best_model_path}")

    plot_training_progress(
        train_losses, train_accuracies, os.path.join(logs_dir, "train_plots")
    )
    plot_training_progress(
        val_losses, val_accuracies, os.path.join(logs_dir, "val_plots")
    )
    logging.info(f"Case study end, {best_val_accuracy}")
    logging.info("#" * 80)
    logging.info("\n")

    return best_val_accuracy

In [None]:
def case_study(
    dataset_name: Literal["A","B","C","D"],
    n_trials: int = 30,
    resume_if_exists: bool = True,
    num_checkpoints: int = 10,
    default_batch_size: int = 32,
    summary_csv_path: str = "optuna_summary.csv",
):
    """Run an Optuna hyperparameter search for one dataset and record the result.

    The training file ``./datasets/<dataset_name>/train.json.gz`` is split 80/20
    into train/validation parts with a fixed generator seed, ``objective`` is
    optimised for ``n_trials`` trials, and the best accuracy and parameters are
    written as one row per dataset to ``summary_csv_path``.

    Args:
        dataset_name: Dataset folder name; also used as run and study name.
        n_trials: Number of Optuna trials.
        resume_if_exists: When true and the summary CSV already has a row for
            ``dataset_name``, the function prints the summary and returns
            without training (and without touching the existing log file).
        num_checkpoints: Forwarded to ``objective``.
        default_batch_size: Batch size of both data loaders.
        summary_csv_path: CSV file holding one result row per dataset.

    Returns:
        None.
    """
    script_root = os.getcwd()
    train_path = f"./datasets/{dataset_name}/train.json.gz"
    run_name = dataset_name

    # Load or initialize results summary
    if os.path.exists(summary_csv_path):
        results_df = pd.read_csv(summary_csv_path)
    else:
        results_df = pd.DataFrame()

    # Skip if already present and resume flag is set. The comparison is made on
    # the column *values*: ``x in series`` would test the index labels instead.
    has_dataset_column = "dataset" in results_df.columns
    already_done = has_dataset_column and bool(
        (results_df["dataset"] == dataset_name).any()
    )
    if resume_if_exists and already_done:
        print(f"✅ Skipping {dataset_name}, already completed in summary.")
        display(results_df)
        return

    # Logging is configured only once we know training will run, so a skipped
    # dataset keeps its previous log. ``force=True`` drops handlers left by an
    # earlier call; without it basicConfig would be a no-op and the console
    # handler added below would be duplicated on every call.
    logs_dir = os.path.join(script_root, "logs", run_name)
    os.makedirs(logs_dir, exist_ok=True)
    logging.basicConfig(
        filename=os.path.join(logs_dir, "training.log"),
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        filemode="w",
        force=True,
    )
    logging.getLogger().addHandler(logging.StreamHandler())

    checkpoints_dir = os.path.join(script_root, "checkpoints", run_name)
    best_model_path = os.path.join(checkpoints_dir, f"model_{run_name}_best.pth")
    os.makedirs(checkpoints_dir, exist_ok=True)

    # Dataset loading
    full_dataset = GraphDataset(train_path, transform=add_zeros)
    val_size = int(0.2 * len(full_dataset))
    train_size = len(full_dataset) - val_size
    generator = torch.Generator().manual_seed(12)
    train_dataset, val_dataset = random_split(
        full_dataset, [train_size, val_size], generator=generator
    )

    train_loader = DataLoader(
        train_dataset,  # type: ignore
        batch_size=default_batch_size,
        shuffle=True,
    )
    val_loader = DataLoader(
        val_dataset,  # type: ignore
        batch_size=default_batch_size,
        shuffle=False,
    )

    # Run Optuna study
    print(f"\n--- Optimizing for dataset {dataset_name} ---")
    logging.info(f"--- Starting Optuna optimization for dataset {dataset_name} ---")

    study = optuna.create_study(study_name=run_name, direction="maximize")

    obj = partial(
        objective,
        train_loader=train_loader,
        val_loader=val_loader,
        num_checkpoints=num_checkpoints,
        checkpoints_dir=checkpoints_dir,
        run_name=run_name,
        best_model_path=best_model_path,
        logs_dir=logs_dir,
    )
    study.optimize(obj, n_trials=n_trials)

    # Record best result, replacing any earlier row of the same dataset. On a
    # first run the summary has no "dataset" column yet, so there is nothing to
    # filter out.
    row = {"dataset": dataset_name, "best_accuracy": study.best_value}
    row.update(study.best_params)
    new_row_df = pd.DataFrame([row])
    if has_dataset_column:
        kept_rows = results_df[results_df["dataset"] != dataset_name]
    else:
        kept_rows = results_df
    if kept_rows.empty:
        results_df = new_row_df
    else:
        results_df = pd.concat([kept_rows, new_row_df], ignore_index=True)
    results_df.to_csv(summary_csv_path, index=False)

    print(f"\n Best result for dataset {dataset_name}:")
    display(results_df)
    print(f"\n Best Params for {dataset_name}:")
    for k, v in study.best_params.items():
        print(f"  {k}: {v}")

    # Cleanup
    del train_loader, val_loader, full_dataset, train_dataset, val_dataset
    gc.collect()

SyntaxError: expected default value expression (170094642.py, line 2)

In [None]:
from tqdm import trange
from time import sleep

# Scratch cell: checks how per-epoch messages render together with a notebook
# progress bar. The notebook bar is imported under an alias because rebinding
# the bare name `tqdm` here would change which progress bar `objective` uses,
# depending on whether this cell was executed before it.
from tqdm.notebook import tqdm as notebook_tqdm

num_epochs = 4
progress_bar = notebook_tqdm(range(num_epochs), leave=False)

for epoch in progress_bar:
    # Placeholder metrics standing in for the real train(...) / evaluate(...) calls.
    train_loss, train_conf, train_acc, train_entropy = 1, 2, 3, 4
    val_loss, val_conf, val_acc, val_entropy = 5, 6, 7, 8

    notebook_tqdm.write(  # prints *below* the bar without disrupting it
        f"Epoch {epoch + 1}/{num_epochs}:\n"
        f"\tTrain - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}, Conf: {train_conf:.4f}, Ent: {train_entropy:.4f}\n"
        f"\tVal -   Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, Conf: {val_conf:.4f}, Ent: {val_entropy:.4f}\n"
    )
    sleep(2)  # slow the loop down so the bar can be observed

progress_bar.close()



  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 01/4 |
	Train - Loss: 1.0000, Acc: 3.0000, Conf: 2.0000, Ent: 4.0000 |
	Val - Loss: 5.0000, Acc: 7.0000, Conf: 6.0000, Ent: 8.0000

Epoch 02/4 |
	Train - Loss: 1.0000, Acc: 3.0000, Conf: 2.0000, Ent: 4.0000 |
	Val - Loss: 5.0000, Acc: 7.0000, Conf: 6.0000, Ent: 8.0000

Epoch 03/4 |
	Train - Loss: 1.0000, Acc: 3.0000, Conf: 2.0000, Ent: 4.0000 |
	Val - Loss: 5.0000, Acc: 7.0000, Conf: 6.0000, Ent: 8.0000

Epoch 04/4 |
	Train - Loss: 1.0000, Acc: 3.0000, Conf: 2.0000, Ent: 4.0000 |
	Val - Loss: 5.0000, Acc: 7.0000, Conf: 6.0000, Ent: 8.0000

[F[K

In [None]:
# Launch the hyperparameter search for dataset "A" with all default settings.
case_study(dataset_name="A")