In [25]:
import sys
import os
from datetime import datetime
import logging
import pstats

import wandb

import torch
from torch import nn
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

In [2]:
# Show the version string of the PyTorch build imported above.
print(torch.__version__)

2.4.1+cu124


In [3]:
from DataSocket.DatasetMonoFlavourShard import DatasetMonoFlavourShard
from DataSocket.DatasetMultiFlavourShard import DatasetMultiFlavourShard
from DataSocket.DatasetMultiFlavourPart import DatasetMultiFlavourPart
from DataSocket.EnergyRange import EnergyRange
from DataSocket.MaxNDOMFinder import MaxNDOMFinder
from DataSocket.PMTfiedDataModule import PMTfiedDataModule
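# NOTE: the first run of this cell fails after ~10 s with
#   RuntimeError: operator torchvision::nms does not exist
# Re-running the cell succeeds (~1.3 s).
# NOTE(review): possibly a torch/torchvision version mismatch — confirm.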

In [4]:
from Model.FlavourClassificationTransformerEncoder import FlavourClassificationTransformerEncoder 

In [5]:
# Report the CUDA devices PyTorch can see, or state that there are none.
if not torch.cuda.is_available():
    print("No GPU detected.")
else:
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    # One line per device: index followed by the device name.
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

Number of GPUs available: 2
GPU 0: NVIDIA GeForce RTX 3090
GPU 1: NVIDIA GeForce RTX 3090


In [6]:
# Ask CUDA to expose only the device with index 1 (the second GPU in the listing above).
# NOTE(review): the previous cell has already queried CUDA through torch. CUDA_VISIBLE_DEVICES
# is normally read only when CUDA is first initialised in the process, so this assignment may
# have no effect here — confirm, and consider moving it above the first torch.cuda call.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # use "0" to select the first GPU instead

In [7]:
# Root directory of the PMTfied Snowstorm data set.
root_dir = "/lustre/hpc/project/icecube/HE_Nu_Aske_Oct2024/PMTfied/Snowstorm/"

# One sub-directory per neutrino flavour.
NuE_PeV_root = os.path.join(root_dir, "22015/")
NuMu_PeV_root = os.path.join(root_dir, "22012/")
NuTau_PeV_root = os.path.join(root_dir, "22018/")

# `truth_1.parquet` file of each flavour.
truth_NuE_PeV_1 = os.path.join(NuE_PeV_root, "truth_1.parquet")
truth_NuMu_PeV_1 = os.path.join(NuMu_PeV_root, "truth_1.parquet")
truth_NuTau_PeV_1 = os.path.join(NuTau_PeV_root, "truth_1.parquet")

# Directory "1/" of each flavour ...
PMTfied_NuE_PeV_1 = os.path.join(NuE_PeV_root, "1/")
PMTfied_NuMu_PeV_1 = os.path.join(NuMu_PeV_root, "1/")
PMTfied_NuTau_PeV_1 = os.path.join(NuTau_PeV_root, "1/")

# ... and the `PMTfied_1.parquet` file inside it.
PMTfied_NuE_PeV_1_1 = os.path.join(PMTfied_NuE_PeV_1, "PMTfied_1.parquet")
PMTfied_NuMu_PeV_1_1 = os.path.join(PMTfied_NuMu_PeV_1, "PMTfied_1.parquet")
PMTfied_NuTau_PeV_1_1 = os.path.join(PMTfied_NuTau_PeV_1, "PMTfied_1.parquet")

In [8]:
# MaxNDOMFinder for part 1, shard 1 of the ER_1_PEV_100_PEV energy band under `root_dir`.
# The instance is called in the next cell and its return value is passed on as `max_n_doms`.
# NOTE(review): presumably it determines the largest DOM count in that shard —
# confirm in DataSocket.MaxNDOMFinder.
maxNDOMFinder_PeV_1_1 = MaxNDOMFinder(
    root_dir=root_dir,
    energy_band=EnergyRange.ER_1_PEV_100_PEV,
    part=1,
    shard=1,
    )

In [9]:
# Multi-flavour dataset for part 1, shard 1 of the ER_1_PEV_100_PEV energy band.
# With verbosity=1 the constructor prints the banner shown in the cell output.
ds_PeV_1_1 = DatasetMultiFlavourShard(
    root_dir=root_dir,
    energy_band=EnergyRange.ER_1_PEV_100_PEV,
    part=1,
    shard=1,
    # Calling the finder instance yields the value used as `max_n_doms`.
    max_n_doms=maxNDOMFinder_PeV_1_1(),
    verbosity=1,
    )

------------- Multi-Flavour Shard (Energy Band: ER_1_PEV_100_PEV, Part: 1, Shard: 1) -------------


In [10]:
# Data module wrapping the shard dataset built above; this is the object meant to be
# passed to `runTraining` as `datamodule`.
# NOTE(review): batch_size / num_workers are presumably forwarded to the DataLoaders —
# confirm in DataSocket.PMTfiedDataModule.
dm_PeV_1_1 = PMTfiedDataModule(
    root_dir=root_dir,
    energy_band=EnergyRange.ER_1_PEV_100_PEV,
    dataset = ds_PeV_1_1,
    batch_size=128,
    num_workers=8,
    verbosity=1,
    )

In [11]:
# md_transformer_PeV_1_1 = FlavourClassificationTransformerEncoder(
#     d_model=128,
#     n_heads=8,
#     d_f=256,
#     num_layers=3,
#     d_input=32, # the number of PMTfied features
#     num_classes=3, # the number of flavours: NuE, NuMu, NuTau
#     dropout=0.1,
#     learning_rate=1e-5,
#     nan_logger=None,
#     train_logger=None,
#     )
    

In [12]:
# current_date = datetime.now().strftime("%Y%m%d")
# current_time = datetime.now().strftime("%H%M%S")

# base_log_dir = os.path.join("logs", current_date)
# os.makedirs(base_log_dir, exist_ok=True)

# base_checkpoint_dir = os.path.join("checkpoints", current_date)
# os.makedirs(base_checkpoint_dir, exist_ok=True)

# # Training log
# train_log_filename = os.path.join(base_log_dir, f"{current_time}_training.log")

# train_logger = logging.getLogger("training")
# train_logger.setLevel(logging.INFO)
# train_handler = logging.FileHandler(train_log_filename)
# train_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# train_handler.setFormatter(train_formatter)
# train_logger.addHandler(train_handler)

# # NaN log
# nan_log_filename = os.path.join(base_log_dir, f"{current_time}_nan_checks.log")
# nan_logger = logging.getLogger("nan_checks")
# nan_logger.setLevel(logging.INFO)
# nan_handler = logging.FileHandler(nan_log_filename)
# nan_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# nan_handler.setFormatter(nan_formatter)
# nan_logger.addHandler(nan_handler)


In [13]:
# tb_logger = TensorBoardLogger(
#     save_dir=base_log_dir,
#     name=f"{current_time}",  # Add time to the logger name
# )

# # Set up the checkpoint callback
# checkpoint_callback = ModelCheckpoint(
#     monitor="val_loss",
#     dirpath=base_checkpoint_dir, 
#     filename=f"{current_time}_transformer-epoch{{epoch:02d}}-val_loss{{val_loss:.2f}}",  # Add time to filename
#     save_top_k=3,
#     mode="min"
# )

# # Set up the early stopping callback
# early_stopping_callback = EarlyStopping(
#     monitor="val_loss",
#     patience=10,
#     verbose=True,
#     mode="min"
# )

In [14]:
def setup_logger(name: str, log_filename: str, level=logging.INFO) -> logging.Logger:
    """Return the logger ``name`` configured to write to ``log_filename``.

    The function is safe to call repeatedly (e.g. when a notebook cell is
    re-run): a file handler is only opened when the logger does not already
    have one for the same file, so no duplicate lines are produced and no
    file handle is leaked.

    File handlers attached directly to this logger that point at a *different*
    file are closed and removed, so that a new run logs to its own file rather
    than to the file of the previous run.  Handlers of other kinds, and
    handlers on ancestor loggers, are left untouched.

    Args:
        name: Name passed to ``logging.getLogger``.
        log_filename: Path of the log file to write to.
        level: Level set on the logger (default ``logging.INFO``).

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # FileHandler stores its target as an absolute path in ``baseFilename``.
    target_path = os.path.abspath(log_filename)

    already_attached = False
    # Inspect only this logger's own handlers; ``hasHandlers()`` would also
    # report handlers of ancestors (e.g. a configured root logger).
    for existing in list(logger.handlers):
        if not isinstance(existing, logging.FileHandler):
            continue
        if existing.baseFilename == target_path:
            already_attached = True
        else:
            # Stale handler from an earlier call with another file.
            logger.removeHandler(existing)
            existing.close()

    if not already_attached:
        # Open the file only when the handler is really going to be used.
        handler = logging.FileHandler(log_filename)
        handler.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        )
        logger.addHandler(handler)

    return logger

In [15]:
def setup_directories(base_dir: str, current_date: str, current_time: str):
    """Create the per-day log and checkpoint directories and return their paths.

    Directories ``<base_dir>/logs/<current_date>`` and
    ``<base_dir>/checkpoints/<current_date>`` are created if missing.  The log
    files themselves are not created here; only their paths are returned.

    Args:
        base_dir: Directory under which ``logs`` and ``checkpoints`` live.
        current_date: Name of the per-day sub-directory.
        current_time: Prefix of the two log file names.

    Returns:
        dict with keys ``"log_dir"``, ``"checkpoint_dir"``,
        ``"train_log_file"`` and ``"nan_log_file"``.
    """
    log_dir, checkpoint_dir = (
        os.path.join(base_dir, category, current_date)
        for category in ("logs", "checkpoints")
    )

    for directory in (log_dir, checkpoint_dir):
        os.makedirs(directory, exist_ok=True)

    paths = {"log_dir": log_dir, "checkpoint_dir": checkpoint_dir}
    paths["train_log_file"] = os.path.join(log_dir, f"{current_time}_training.log")
    paths["nan_log_file"] = os.path.join(log_dir, f"{current_time}_nan_checks.log")
    return paths

In [16]:
def setup_callbacks(checkpoint_dir: str, current_time: str):
    """Create the checkpointing and early-stopping callbacks for one training run.

    Both callbacks monitor ``"val_loss"`` in ``"min"`` mode.

    Args:
        checkpoint_dir: Directory passed to ``ModelCheckpoint`` as ``dirpath``.
        current_time: Run timestamp string used as prefix of checkpoint file names.

    Returns:
        ``[checkpoint_callback, early_stopping_callback]``.
    """
    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath=checkpoint_dir,
        filename=f"{current_time}_transformer-epoch{{epoch:02d}}-val_loss{{val_loss:.2f}}",
        # The template already spells out "epoch" and "val_loss".  With the
        # default (True) Lightning additionally inserts "<metric>=" in front of
        # every value, producing names like "epochepoch=03-val_lossval_loss=0.12".
        auto_insert_metric_name=False,
        save_top_k=3,
        mode="min",
    )

    early_stopping_callback = EarlyStopping(
        monitor="val_loss",
        patience=10,
        verbose=True,
        mode="min",
    )

    return [checkpoint_callback, early_stopping_callback]

In [17]:
def init_wandb(project_name: str, model_class: nn.Module):
    """Start a Weights & Biases run whose config mirrors the model's hyperparameters.

    Args:
        project_name: Value passed to ``wandb.init`` as ``project``.
        model_class: Object exposing the attributes ``d_model``, ``n_heads``,
            ``d_f``, ``num_layers``, ``d_input``, ``num_classes``, ``dropout``
            and ``learning_rate``; each is copied into the run config.

    The config additionally records the fixed entries ``"epochs": 5`` and
    ``"attention": "Scaled Dot-Product"``.
    """
    hyperparameter_names = (
        "d_model",
        "n_heads",
        "d_f",
        "num_layers",
        "d_input",
        "num_classes",
        "dropout",
        "learning_rate",
    )
    run_config = {
        attribute: getattr(model_class, attribute)
        for attribute in hyperparameter_names
    }
    # Constant entries that are not read from the model.
    run_config["epochs"] = 5
    run_config["attention"] = "Scaled Dot-Product"

    wandb.init(project=project_name, config=run_config)

In [18]:
def log_training_parameters(logger: logging.Logger, model_class: nn.Module):
    """Log a start message followed by a table of the model's hyperparameters.

    Two INFO records are emitted: ``"Starting training..."`` and a multi-line
    ``| Parameter | Value |`` table.

    Args:
        logger: Logger that receives both records.
        model_class: Object exposing the attributes ``d_model``, ``n_heads``,
            ``d_f``, ``num_layers``, ``d_input``, ``num_classes``, ``dropout``
            and ``learning_rate``.
    """
    logger.info("Starting training...")

    table_lines = [
        "| Parameter       | Value               |\n",
        "|-----------------|---------------------|\n",
        "| attention       | Scaled Dot-Product |\n",
    ]
    reported_attributes = (
        "d_model",
        "n_heads",
        "d_f",
        "num_layers",
        "d_input",
        "num_classes",
        "dropout",
        "learning_rate",
    )
    for attribute in reported_attributes:
        value = getattr(model_class, attribute)
        # Name left-aligned in 16 columns, value left-aligned in 15 columns.
        table_lines.append(f"| {attribute:<16}| {value:<15}|\n")

    logger.info("".join(table_lines))

In [19]:
def build_model(config: dict, nan_logger: logging.Logger, train_logger: logging.Logger):
    """Instantiate ``FlavourClassificationTransformerEncoder`` from a config dict.

    Args:
        config: Mapping that must contain the keys ``d_model``, ``n_heads``,
            ``d_f``, ``num_layers``, ``d_input``, ``num_classes``, ``dropout``
            and ``learning_rate``.  Other keys are ignored.
        nan_logger: Forwarded to the model as ``nan_logger``.
        train_logger: Forwarded to the model as ``train_logger``.

    Returns:
        The newly constructed model.

    Raises:
        KeyError: If one of the required keys is missing from ``config``.
    """
    required_keys = (
        "d_model",
        "n_heads",
        "d_f",
        "num_layers",
        "d_input",
        "num_classes",
        "dropout",
        "learning_rate",
    )
    # Pick out only the entries the constructor receives.
    hyperparameters = {key: config[key] for key in required_keys}

    return FlavourClassificationTransformerEncoder(
        **hyperparameters,
        nan_logger=nan_logger,
        train_logger=train_logger,
    )

In [20]:
def runTraining(base_dir: str, model_config: dict, datamodule: PMTfiedDataModule):
    """Set up logging, callbacks and a Lightning ``Trainer``, then fit the model.

    Steps: create the per-day log/checkpoint directories under ``base_dir``,
    configure the ``"training"`` and ``"nan_checks"`` file loggers, build the
    model from ``model_config``, log its hyperparameters and run
    ``Trainer.fit`` with a TensorBoard logger and the checkpoint /
    early-stopping callbacks.

    Args:
        base_dir: Directory under which ``logs/`` and ``checkpoints/`` are created.
        model_config: Hyperparameter dict passed to ``build_model``.  An optional
            ``"epochs"`` entry sets ``max_epochs`` (default 5).
        datamodule: Data module passed to ``Trainer.fit``.

    Returns:
        The ``Trainer`` after fitting.
    """
    # Take a single timestamp so that date and time always belong together,
    # even when the call happens right around midnight.
    started_at = datetime.now()
    current_date = started_at.strftime("%Y%m%d")
    current_time = started_at.strftime("%H%M%S")

    dirs = setup_directories(base_dir, current_date, current_time)

    train_logger = setup_logger("training", dirs["train_log_file"])
    nan_logger = setup_logger("nan_checks", dirs["nan_log_file"])

    model_class = build_model(model_config, nan_logger, train_logger)

    # TensorBoard logger
    tb_logger = TensorBoardLogger(
        save_dir=dirs["log_dir"],
        name=f"{current_time}",
    )

    callbacks = setup_callbacks(dirs["checkpoint_dir"], current_time)

    # W&B initialisation is currently disabled; uncomment both lines to enable it.
    # project_name = f"[{current_date}_{current_time}]Neutrino Flavour Classification"
    # init_wandb(project_name, model_class)

    # Log training parameters
    log_training_parameters(train_logger, model_class)

    # Set up Trainer
    trainer = Trainer(
        # Honour the configured epoch count instead of a second hard-coded 5.
        max_epochs=model_config.get("epochs", 5),
        accelerator="gpu" if torch.cuda.is_available() else "cpu",
        devices=1,  # train on a single device
        gradient_clip_val=1.0,
        callbacks=callbacks,
        log_every_n_steps=1,
        logger=tb_logger,
    )

    try:
        # Start training
        trainer.fit(model_class, datamodule=datamodule)
    finally:
        # Finalize WandB even when fitting raises.
        wandb.finish()
    return trainer

In [21]:
# def runTraining(model_class: nn.Module, datamodule: PMTfiedDataModule):
#     wandb.init(
#         project=f"[{current_date}_{current_time}]Neutrino Flavour Classification",
#         config={
#             "d_model": model_class.d_model,
#             "n_heads": model_class.n_heads,
#             "d_f": model_class.d_f,
#             "num_layers": model_class.num_layers,
#             "d_input": model_class.d_input,
#             "num_classes": model_class.num_classes,
#             "dropout": model_class.dropout,
#             "learning_rate": model_class.learning_rate,
#             "epochs": 5,
#             "attention": "Scaled Dot-Product",
#         },
#     )

#     train_logger.info(
#     "| Parameter       | Value               |\n"
#     "|-----------------|---------------------|\n"
#     f"| attention       | Scaled Dot-Product |\n"
#     f"| d_model         | {model_class.d_model:<15}|\n"
#     f"| n_heads         | {model_class.n_heads:<15}|\n"
#     f"| d_f             | {model_class.d_f:<15}|\n"
#     f"| num_layers      | {model_class.num_layers:<15}|\n"
#     f"| d_input         | {model_class.d_input:<15}|\n"
#     f"| num_classes     | {model_class.num_classes:<15}|\n"
#     f"| dropout         | {model_class.dropout:<15}|\n"
#     f"| learning_rate   | {model_class.learning_rate:<15}|\n\n"
#     )
#     train_logger.info("Starting training...")

#     trainer = Trainer(
#         max_epochs=5,
#         accelerator="gpu" if torch.cuda.is_available() else "cpu",
#         devices=1,
#         gradient_clip_val=1.0,
#         callbacks=[checkpoint_callback, early_stopping_callback],
#         log_every_n_steps=1,
#         logger=tb_logger
#     )
#     trainer.fit(model_class, datamodule=datamodule)
    
#     wandb.finish()
#     return trainer

# # trainer = runTraining(model_class)


In [22]:
# Hyperparameters handed to `runTraining` / `build_model`.
# Transformer encoder architecture.
config = {"d_model": 128, "n_heads": 8, "d_f": 256, "num_layers": 3}
# Input feature count and number of output classes.
config.update({"d_input": 32, "num_classes": 3})
# Regularisation and optimisation settings.
config.update({"dropout": 0.1, "learning_rate": 1e-5, "epochs": 5})
# Descriptive label only.
config["attention"] = "Scaled Dot-Product"

In [23]:
# Base directory passed as `base_dir` in the (currently commented-out) `runTraining` call
# below; `setup_directories` creates `logs/` and `checkpoints/` underneath it.
base = '/groups/icecube/cyan/factory/IceCubeTransformer'

In [24]:
# trainer = runTraining(base, config, dm_PeV_1_1)

In [27]:
# Load the profiler statistics stored in the file "profiling_results"
# (the file must exist in the current working directory).
p = pstats.Stats("profiling_results")
# Print only the 30 entries with the largest cumulative time instead of the whole
# profile; the trailing semicolon keeps the Stats repr out of the cell output.
p.strip_dirs().sort_stats("cumulative").print_stats(30);

Sat Feb  1 14:21:08 2025    profiling_results

         3781577 function calls (3698492 primitive calls) in 54.916 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    302/1    0.001    0.000   54.920   54.920 {built-in method builtins.exec}
        1    0.001    0.001   54.920   54.920 <string>:1(<module>)
        1    0.000    0.000   54.919   54.919 TrainingDebuggingYard.py:172(execute)
        1    0.000    0.000   52.129   52.129 TrainingDebuggingYard.py:127(runTraining)
        1    0.000    0.000   41.597   41.597 trainer.py:503(fit)
        1    0.000    0.000   41.597   41.597 call.py:31(_call_and_handle_interrupt)
        1    0.000    0.000   41.597   41.597 trainer.py:547(_fit_impl)
        1    0.000    0.000   41.597   41.597 trainer.py:909(_run)
        1    0.000    0.000   41.282   41.282 trainer.py:1017(_run_stage)
        1    0.001    0.001   40.951   40.951 fit_loop.py:196(run)
       50    0.001    0.

<pstats.Stats at 0x7f1047d47700>