In [1]:
# !pip install https://github.com/pyg-team/pytorch_geometric.git

In [2]:
# !git clone --branch baselineCe https://github.com/Graph-Classification-Noisy-Label/hackaton.git

In [3]:
%cd hackaton/

[Errno 2] No such file or directory: 'hackaton/'
/home/valerio/Desktop/noisy_labels/hackaton


  bkms = self.shell.db.get('bookmarks', {})


In [4]:
# !gdown --folder https://drive.google.com/drive/folders/1Z-1JkPJ6q4C6jX4brvq1VRbJH5RPUCAk -O datasets


In [5]:
!ls -lh datasets

total 4.0K
drwxrwxr-x 6 valerio valerio 4.0K May 25 13:03 data


In [6]:
import os
import torch
import pandas as pd
import matplotlib.pyplot as plt
import logging
from tqdm import tqdm
from torch_geometric.loader import DataLoader
from torch.utils.data import random_split
# Load utility functions from cloned repository
from src.loadData import GraphDataset
from src.utils import set_seed
from src.models import GNN
import argparse


# Set the random seed
set_seed()


Folder '/home/valerio/Desktop/noisy_labels/hackaton/submission' has been compressed into './submission.tar.gz'


In [7]:
def add_zeros(data):
    data.x = torch.zeros(data.num_nodes, dtype=torch.long)
    return data

In [8]:
def train(data_loader, model, optimizer, criterion, device, save_checkpoints, checkpoint_path, current_epoch):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for data in tqdm(data_loader, desc="Iterating training graphs", unit="batch"):
        data = data.to(device)
        output = model(data)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        pred = output.argmax(dim=1)
        correct += (pred == data.y).sum().item()
        total += data.y.size(0)

    # Save checkpoints if required
    if save_checkpoints:
        checkpoint_file = f"{checkpoint_path}_epoch_{current_epoch + 1}.pth"
        torch.save(model.state_dict(), checkpoint_file)
        print(f"Checkpoint saved at {checkpoint_file}")

    return total_loss / len(data_loader),  correct / total

In [9]:
from sklearn.metrics import f1_score
from tqdm import tqdm
import torch

def evaluate(data_loader, model, device, calculate_accuracy=False):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    criterion = torch.nn.CrossEntropyLoss()

    with torch.no_grad():
        for data in tqdm(data_loader, desc="Iterating eval graphs", unit="batch"):
            data = data.to(device)
            logits = model(data)
            pred = logits.argmax(dim=1)

            if calculate_accuracy:
                loss = criterion(logits, data.y)
                total_loss += loss.item()
                correct += (pred == data.y).sum().item()
                total += data.y.size(0)

                all_preds.extend(pred.cpu().tolist())
                all_labels.extend(data.y.cpu().tolist())
            else:
                all_preds.extend(pred.cpu().tolist())

    if calculate_accuracy:
        avg_loss = total_loss / len(data_loader)
        accuracy = correct / total
        macro_f1 = f1_score(all_labels, all_preds, average='macro')
        return avg_loss, accuracy, macro_f1

    return all_preds


In [10]:
def save_predictions(predictions, test_path):
    script_dir = os.getcwd() 
    submission_folder = os.path.join(script_dir, "submission")
    test_dir_name = os.path.basename(os.path.dirname(test_path))
    
    os.makedirs(submission_folder, exist_ok=True)
    
    output_csv_path = os.path.join(submission_folder, f"testset_{test_dir_name}.csv")
    
    test_graph_ids = list(range(len(predictions)))
    output_df = pd.DataFrame({
        "id": test_graph_ids,
        "pred": predictions
    })
    
    output_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")

In [11]:
def plot_training_progress(train_losses, train_accuracies, output_dir):
    epochs = range(1, len(train_losses) + 1)
    plt.figure(figsize=(12, 6))

    # Plot loss
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, label="Training Loss", color='blue')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss per Epoch')

    # Plot accuracy
    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_accuracies, label="Training Accuracy", color='green')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Training Accuracy per Epoch')

    # Save plots in the current directory
    os.makedirs(output_dir, exist_ok=True)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "training_progress.png"))
    plt.close()

In [12]:
def get_user_input(prompt, default=None, required=False, type_cast=str):

    while True:
        user_input = ""#input(f"{prompt} [{default}]: ")
        
        if user_input == "" and required:
            print("This field is required. Please enter a value.")
            continue
        
        if user_input == "" and default is not None:
            return default
        
        if user_input == "" and not required:
            return None
        
        try:
            return type_cast(user_input)
        except ValueError:
            print(f"Invalid input. Please enter a valid {type_cast.__name__}.")

In [None]:
def get_arguments():
    args = {}
    args['train_path'] = get_user_input("Path to the training dataset (optional)", default = "./datasets/data/C/train.json.gz")
    args['test_path'] = get_user_input("Path to the test dataset",default = "./datasets/data/C/test.json.gz")
    args['num_checkpoints'] = get_user_input("Number of checkpoints to save during training", type_cast=int)
    args['device'] = get_user_input("Which GPU to use if any", default=0, type_cast=int)
    args['gnn'] = get_user_input("GNN type (gin, gin-virtual, gcn, gcn-virtual)", default='gin-virtual')
    args['drop_ratio'] = get_user_input("Dropout ratio", default=0.1, type_cast=float)
    args['num_layer'] = get_user_input("Number of GNN message passing layers", default=5, type_cast=int)
    args['emb_dim'] = get_user_input("Dimensionality of hidden units in GNNs", default=300, type_cast=int)
    args['batch_size'] = get_user_input("Input batch size for training", default=32, type_cast=int)
    args['epochs'] = get_user_input("Number of epochs to train", default=50, type_cast=int)
    args['baseline_mode'] = get_user_input("Baseline mode: 1 (CE), 2 (Noisy CE)", default=2, type_cast=int)
    args['noise_prob'] = get_user_input("Noise probability p (used if baseline_mode=2)", default=0.2, type_cast=float)

    
    return argparse.Namespace(**args)


In [14]:
def populate_args(args):
    print("Arguments received:")
    for key, value in vars(args).items():
        print(f"{key}: {value}")
args = get_arguments()
populate_args(args)

Arguments received:
train_path: ./datasets/data/B/train.json.gz
test_path: ./datasets/data/B/test.json.gz
num_checkpoints: None
device: 0
gnn: gin-virtual
drop_ratio: 0.1
num_layer: 5
emb_dim: 300
batch_size: 32
epochs: 50
baseline_mode: 2
noise_prob: 0.2


In [15]:
class NoisyCrossEntropyLoss(torch.nn.Module):
    def __init__(self, p_noisy):
        super().__init__()
        self.p = p_noisy
        self.ce = torch.nn.CrossEntropyLoss(reduction='none')

    def forward(self, logits, targets):
        losses = self.ce(logits, targets)
        weights = (1 - self.p) + self.p * (1 - torch.nn.functional.one_hot(targets, num_classes=logits.size(1)).float().sum(dim=1))
        return (losses * weights).mean()

In [16]:
script_dir = os.getcwd() 
# device = torch.device(f"cuda:{args.device}" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_checkpoints = args.num_checkpoints if args.num_checkpoints else 3
    
if args.gnn == 'gin':
    model = GNN(gnn_type='gin', num_class=6, num_layer=args.num_layer, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=False).to(device)
elif args.gnn == 'gin-virtual':
    model = GNN(gnn_type='gin', num_class=6, num_layer=args.num_layer, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=True).to(device)
elif args.gnn == 'gcn':
    model = GNN(gnn_type='gcn', num_class=6, num_layer=args.num_layer, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=False).to(device)
elif args.gnn == 'gcn-virtual':
    model = GNN(gnn_type='gcn', num_class=6, num_layer=args.num_layer, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=True).to(device)
else:
    raise ValueError('Invalid GNN type')
    
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# criterion = torch.nn.CrossEntropyLoss()

if args.baseline_mode == 2:
    criterion = NoisyCrossEntropyLoss(args.noise_prob)
else:
    criterion = torch.nn.CrossEntropyLoss()

In [17]:
test_dir_name = os.path.basename(os.path.dirname(args.test_path))
logs_folder = os.path.join(script_dir, "logs", test_dir_name)
log_file = os.path.join(logs_folder, "training.log")
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(message)s')
logging.getLogger().addHandler(logging.StreamHandler())

checkpoint_path = os.path.join(script_dir, "checkpoints", f"model_{test_dir_name}_best.pth")
checkpoints_folder = os.path.join(script_dir, "checkpoints", test_dir_name)
os.makedirs(checkpoints_folder, exist_ok=True)


In [18]:
# if os.path.exists(checkpoint_path) and not args.train_path:
#     model.load_state_dict(torch.load(checkpoint_path))
#     print(f"Loaded best model from {checkpoint_path}")

In [19]:
def plot_all_metrics(train_losses, train_accuracies, val_losses, val_accuracies, save_dir):
    """
    Plots and saves a single figure containing training/validation loss and accuracy over epochs.
    """
    os.makedirs(save_dir, exist_ok=True)
    epochs = range(1, len(train_losses) + 1)
    plt.figure(figsize=(10, 6))
    # Loss curves
    plt.plot(epochs, train_losses, label='Train Loss', linestyle='-')
    plt.plot(epochs, val_losses, label='Val Loss', linestyle='--')
    # Accuracy curves
    plt.plot(epochs, train_accuracies, label='Train Acc', linestyle='-.')
    plt.plot(epochs, val_accuracies, label='Val Acc', linestyle=':')

    plt.xlabel('Epoch')
    plt.ylabel('Value')
    plt.title('Training and Validation Metrics')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()

    save_path = os.path.join(save_dir, 'all_metrics.png')
    plt.savefig(save_path)
    plt.close()
    print(f"All metrics plot saved to {save_path}")


In [20]:
from cleanlab.filter   import find_label_issues
from torch.utils.data import Subset
from torch_geometric.loader import DataLoader
import numpy as np

def pretrain(train_loader, model, optimizer, criterion, device, epochs=10):
    for ep in range(epochs):
        loss, acc = train(
            train_loader, model, optimizer, criterion,
            device, save_checkpoints=False,
            checkpoint_path=None, current_epoch=ep
        )
        print(f"[Pretrain {ep+1}/{epochs}] loss={loss:.4f}, acc={acc:.4f}")

def estimate_and_prune_noisy_labels(data_loader, dataset, device, batch_size, num_workers):
    model.eval()
    psx_list, y_list = [], []
    with torch.no_grad():
        for data in data_loader:
            data = data.to(device)
            logits = model(data)
            psx_list.append(torch.softmax(logits, dim=1).cpu().numpy())
            y_list.append(data.y.cpu().numpy())
    psx = np.vstack(psx_list)  # (N, C)
    y   = np.hstack(y_list)    # (N,)

    # Individua tutte le etichette ritenute "issue" da CleanLab
    noise_mask = find_label_issues(
        labels=y,
        pred_probs=psx,
        filter_by='both'   # filtra by_noise_rate & by_class
    )
    clean_idx = np.where(~noise_mask)[0]

    # Crea il nuovo DataLoader “pulito-like”
    clean_dataset = Subset(dataset, clean_idx)
    clean_loader  = DataLoader(
        clean_dataset,
        batch_size=batch_size,
        shuffle=(data_loader.shuffle if hasattr(data_loader, 'shuffle') else False),
        num_workers=num_workers
    )
    print(f"Filtered set: {len(clean_idx)}/{len(y)} samples kept")
    return clean_loader


  from .autonotebook import tqdm as notebook_tqdm


In [21]:
from collections import Counter

if args.train_path:
    # Carico l’intero dataset e lo splitto

    full_dataset = GraphDataset(args.train_path, transform=add_zeros)
    val_size = int(0.2 * len(full_dataset))
    train_size = len(full_dataset) - val_size

    generator = torch.Generator().manual_seed(12)
    train_dataset, val_dataset = random_split(
        full_dataset, [train_size, val_size], generator=generator
    )

    labels = [d.y.item() for d in train_dataset]
    label_counts = Counter(labels)
    total = len(labels)
    num_classes = len(label_counts) 

    # weights = torch.tensor([total / (num_classes * label_counts[c]) for c in range(num_classes)], 
    #                        dtype=torch.float,
    #                        device=device)

    # if args.baseline_mode == 1:
    #     criterion = torch.nn.CrossEntropyLoss(weight=weights)

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4)

    pretrain(train_loader, model, optimizer, criterion, device, epochs=40)
    
    train_loader = estimate_and_prune_noisy_labels(
        train_loader, train_dataset,
        device,
        batch_size=args.batch_size,
        num_workers=4
    )

    # 3) Filtra anche val_loader (riusa noise_mat dal train)
    val_loader = estimate_and_prune_noisy_labels(
        val_loader, val_dataset,
        device,
        batch_size=args.batch_size,
        num_workers=4
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    num_epochs = 20
    best_val_accuracy = 0.0   
    # best_f1 = 0.0
    # best_epoch = -1
    # patience = 0
    # wait = 0

    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []

    if num_checkpoints > 1:
        checkpoint_intervals = [int((i + 1) * num_epochs / num_checkpoints) for i in range(num_checkpoints)]
    else:
        checkpoint_intervals = [num_epochs]

    for epoch in range(num_epochs):
        train_loss, train_acc = train(
            train_loader, model, optimizer, criterion, device,
            save_checkpoints=(epoch + 1 in checkpoint_intervals),
            checkpoint_path=os.path.join(checkpoints_folder, f"model_{test_dir_name}"),
            current_epoch=epoch
        )

        val_loss, val_acc, val_f1 = evaluate(val_loader, model, device, calculate_accuracy=True)
        print(
            f"Epoch {epoch+1}/{num_epochs} | "
            f"Train Acc: {train_acc:.4f} | "
            f"Val Acc: {val_acc:.4f} | "
            f"Val Macro-F1: {val_f1:.4f} | "
            f"Val Loss: {val_loss:.4f}"
        )
        logging.info(
            f"Epoch {epoch + 1}/{num_epochs}, "
            f"Loss: {train_loss:.4f}, "
            f"Train Acc: {train_acc:.4f}, "
            f"Val Acc: {val_acc:.4f}, "
            f"Val Macro-F1: {val_f1:.4f}, "
            f"Val Loss: {val_loss:.4f}"
        )


        #print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

        train_losses.append(train_loss)
        train_accuracies.append(train_acc)
        val_losses.append(val_loss)
        val_accuracies.append(val_acc)

        # if val_f1 > best_f1 + 1e-4:
        #     best_epoch = epoch + 1
        #     best_f1 = val_f1
            
        if val_acc > best_val_accuracy:
            best_val_accuracy = val_acc
            torch.save(model.state_dict(), checkpoint_path)
            print(f"Best model updated and saved at {checkpoint_path}")

    plot_training_progress(train_losses, train_accuracies, os.path.join(logs_folder, "plots"))
    plot_training_progress(val_losses, val_accuracies, os.path.join(logs_folder, "plotsVal"))
    plot_all_metrics(train_losses, train_accuracies, val_losses, val_accuracies, os.path.join(logs_folder, "plots_all_1"))



Iterating training graphs: 100%|██████████| 140/140 [00:09<00:00, 15.50batch/s]


[Pretrain 1/60] loss=1.4219, acc=0.2687


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.26batch/s]


[Pretrain 2/60] loss=1.3920, acc=0.2868


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 15.63batch/s]


[Pretrain 3/60] loss=1.3908, acc=0.2940


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 15.67batch/s]


[Pretrain 4/60] loss=1.3742, acc=0.3134


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.84batch/s]


[Pretrain 5/60] loss=1.3647, acc=0.3208


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.77batch/s]


[Pretrain 6/60] loss=1.3629, acc=0.3259


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.92batch/s]


[Pretrain 7/60] loss=1.3511, acc=0.3440


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.90batch/s]


[Pretrain 8/60] loss=1.3259, acc=0.3621


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.77batch/s]


[Pretrain 9/60] loss=1.3248, acc=0.3634


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.45batch/s]


[Pretrain 10/60] loss=1.3051, acc=0.3795


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.33batch/s]


[Pretrain 11/60] loss=1.3011, acc=0.3882


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.64batch/s]


[Pretrain 12/60] loss=1.2856, acc=0.3958


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.30batch/s]


[Pretrain 13/60] loss=1.2759, acc=0.4118


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.65batch/s]


[Pretrain 14/60] loss=1.2858, acc=0.3940


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.77batch/s]


[Pretrain 15/60] loss=1.2703, acc=0.4150


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.67batch/s]


[Pretrain 16/60] loss=1.2530, acc=0.4328


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.81batch/s]


[Pretrain 17/60] loss=1.2613, acc=0.4214


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.80batch/s]


[Pretrain 18/60] loss=1.2538, acc=0.4306


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.55batch/s]


[Pretrain 19/60] loss=1.2419, acc=0.4429


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.37batch/s]


[Pretrain 20/60] loss=1.2404, acc=0.4404


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.57batch/s]


[Pretrain 21/60] loss=1.2364, acc=0.4437


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.52batch/s]


[Pretrain 22/60] loss=1.2271, acc=0.4574


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.50batch/s]


[Pretrain 23/60] loss=1.2313, acc=0.4480


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.82batch/s]


[Pretrain 24/60] loss=1.2230, acc=0.4538


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.78batch/s]


[Pretrain 25/60] loss=1.2203, acc=0.4509


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.67batch/s]


[Pretrain 26/60] loss=1.2169, acc=0.4643


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.68batch/s]


[Pretrain 27/60] loss=1.2135, acc=0.4665


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.54batch/s]


[Pretrain 28/60] loss=1.2112, acc=0.4692


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.64batch/s]


[Pretrain 29/60] loss=1.2049, acc=0.4665


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.78batch/s]


[Pretrain 30/60] loss=1.2002, acc=0.4694


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.83batch/s]


[Pretrain 31/60] loss=1.1969, acc=0.4692


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 15.95batch/s]


[Pretrain 32/60] loss=1.1940, acc=0.4770


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.07batch/s]


[Pretrain 33/60] loss=1.1928, acc=0.4772


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 15.95batch/s]


[Pretrain 34/60] loss=1.1902, acc=0.4752


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.24batch/s]


[Pretrain 35/60] loss=1.1819, acc=0.4824


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.10batch/s]


[Pretrain 36/60] loss=1.1750, acc=0.4933


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.20batch/s]


[Pretrain 37/60] loss=1.1733, acc=0.4873


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 15.93batch/s]


[Pretrain 38/60] loss=1.1778, acc=0.4875


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.06batch/s]


[Pretrain 39/60] loss=1.1753, acc=0.4893


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.16batch/s]


[Pretrain 40/60] loss=1.1680, acc=0.4971


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.08batch/s]


[Pretrain 41/60] loss=1.1600, acc=0.4991


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.22batch/s]


[Pretrain 42/60] loss=1.1543, acc=0.5076


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.11batch/s]


[Pretrain 43/60] loss=1.1598, acc=0.4944


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.03batch/s]


[Pretrain 44/60] loss=1.1544, acc=0.5033


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.16batch/s]


[Pretrain 45/60] loss=1.1502, acc=0.5074


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.12batch/s]


[Pretrain 46/60] loss=1.1465, acc=0.5109


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.14batch/s]


[Pretrain 47/60] loss=1.1502, acc=0.5109


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.14batch/s]


[Pretrain 48/60] loss=1.1327, acc=0.5232


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.08batch/s]


[Pretrain 49/60] loss=1.1415, acc=0.5141


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.07batch/s]


[Pretrain 50/60] loss=1.1325, acc=0.5203


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.13batch/s]


[Pretrain 51/60] loss=1.1346, acc=0.5217


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.17batch/s]


[Pretrain 52/60] loss=1.1290, acc=0.5196


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.10batch/s]


[Pretrain 53/60] loss=1.1270, acc=0.5257


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.11batch/s]


[Pretrain 54/60] loss=1.1213, acc=0.5283


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.18batch/s]


[Pretrain 55/60] loss=1.1202, acc=0.5292


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.10batch/s]


[Pretrain 56/60] loss=1.1162, acc=0.5295


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.24batch/s]


[Pretrain 57/60] loss=1.1202, acc=0.5237


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.15batch/s]


[Pretrain 58/60] loss=1.1118, acc=0.5348


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.19batch/s]


[Pretrain 59/60] loss=1.1048, acc=0.5417


Iterating training graphs: 100%|██████████| 140/140 [00:08<00:00, 16.25batch/s]

[Pretrain 60/60] loss=1.1014, acc=0.5406





Filtered set: 2821/4480 samples kept
Filtered set: 688/1120 samples kept


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.11batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.94batch/s]
Epoch 1/20, Loss: 1.1060, Train Acc: 0.5409, Val Acc: 0.8241, Val Macro-F1: 0.7381, Val Loss: 0.8575


Epoch 1/20 | Train Acc: 0.5409 | Val Acc: 0.8241 | Val Macro-F1: 0.7381 | Val Loss: 0.8575
Best model updated and saved at /home/valerio/Desktop/noisy_labels/hackaton/checkpoints/model_B_best.pth


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.26batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 13.16batch/s]
Epoch 2/20, Loss: 1.0842, Train Acc: 0.5424, Val Acc: 0.8241, Val Macro-F1: 0.7218, Val Loss: 0.8624


Epoch 2/20 | Train Acc: 0.5424 | Val Acc: 0.8241 | Val Macro-F1: 0.7218 | Val Loss: 0.8624


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 14.99batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.83batch/s]
Epoch 3/20, Loss: 1.0623, Train Acc: 0.5533, Val Acc: 0.7427, Val Macro-F1: 0.6125, Val Loss: 0.9858


Epoch 3/20 | Train Acc: 0.5533 | Val Acc: 0.7427 | Val Macro-F1: 0.6125 | Val Loss: 0.9858


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.20batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.55batch/s]
Epoch 4/20, Loss: 1.0460, Train Acc: 0.5580, Val Acc: 0.7515, Val Macro-F1: 0.6194, Val Loss: 0.9698


Epoch 4/20 | Train Acc: 0.5580 | Val Acc: 0.7515 | Val Macro-F1: 0.6194 | Val Loss: 0.9698


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 14.98batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.82batch/s]
Epoch 5/20, Loss: 1.0309, Train Acc: 0.5608, Val Acc: 0.7195, Val Macro-F1: 0.5746, Val Loss: 1.0197


Epoch 5/20 | Train Acc: 0.5608 | Val Acc: 0.7195 | Val Macro-F1: 0.5746 | Val Loss: 1.0197


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.21batch/s]


Checkpoint saved at /home/valerio/Desktop/noisy_labels/hackaton/checkpoints/B/model_B_epoch_6.pth


Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.96batch/s]
Epoch 6/20, Loss: 1.0103, Train Acc: 0.5725, Val Acc: 0.6497, Val Macro-F1: 0.4997, Val Loss: 1.1891


Epoch 6/20 | Train Acc: 0.5725 | Val Acc: 0.6497 | Val Macro-F1: 0.4997 | Val Loss: 1.1891


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.26batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.67batch/s]
Epoch 7/20, Loss: 0.9952, Train Acc: 0.5711, Val Acc: 0.6221, Val Macro-F1: 0.4851, Val Loss: 1.2235


Epoch 7/20 | Train Acc: 0.5711 | Val Acc: 0.6221 | Val Macro-F1: 0.4851 | Val Loss: 1.2235


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.19batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.81batch/s]
Epoch 8/20, Loss: 0.9856, Train Acc: 0.5689, Val Acc: 0.7253, Val Macro-F1: 0.5788, Val Loss: 0.9543


Epoch 8/20 | Train Acc: 0.5689 | Val Acc: 0.7253 | Val Macro-F1: 0.5788 | Val Loss: 0.9543


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.21batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 13.16batch/s]
Epoch 9/20, Loss: 0.9597, Train Acc: 0.5782, Val Acc: 0.6991, Val Macro-F1: 0.5835, Val Loss: 1.0139


Epoch 9/20 | Train Acc: 0.5782 | Val Acc: 0.6991 | Val Macro-F1: 0.5835 | Val Loss: 1.0139


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.33batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 13.05batch/s]
Epoch 10/20, Loss: 0.9524, Train Acc: 0.5842, Val Acc: 0.6948, Val Macro-F1: 0.5802, Val Loss: 1.0035


Epoch 10/20 | Train Acc: 0.5842 | Val Acc: 0.6948 | Val Macro-F1: 0.5802 | Val Loss: 1.0035


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.38batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 13.28batch/s]
Epoch 11/20, Loss: 0.9149, Train Acc: 0.5991, Val Acc: 0.6439, Val Macro-F1: 0.5296, Val Loss: 1.1116


Epoch 11/20 | Train Acc: 0.5991 | Val Acc: 0.6439 | Val Macro-F1: 0.5296 | Val Loss: 1.1116


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.36batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.59batch/s]
Epoch 12/20, Loss: 0.9271, Train Acc: 0.5863, Val Acc: 0.5785, Val Macro-F1: 0.4755, Val Loss: 1.2870


Epoch 12/20 | Train Acc: 0.5863 | Val Acc: 0.5785 | Val Macro-F1: 0.4755 | Val Loss: 1.2870


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.32batch/s]


Checkpoint saved at /home/valerio/Desktop/noisy_labels/hackaton/checkpoints/B/model_B_epoch_13.pth


Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.41batch/s]
Epoch 13/20, Loss: 0.9128, Train Acc: 0.6048, Val Acc: 0.6642, Val Macro-F1: 0.5594, Val Loss: 1.1332


Epoch 13/20 | Train Acc: 0.6048 | Val Acc: 0.6642 | Val Macro-F1: 0.5594 | Val Loss: 1.1332


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.30batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.68batch/s]
Epoch 14/20, Loss: 0.8882, Train Acc: 0.6076, Val Acc: 0.6294, Val Macro-F1: 0.5143, Val Loss: 1.2658


Epoch 14/20 | Train Acc: 0.6076 | Val Acc: 0.6294 | Val Macro-F1: 0.5143 | Val Loss: 1.2658


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.29batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.62batch/s]
Epoch 15/20, Loss: 0.8587, Train Acc: 0.6228, Val Acc: 0.6744, Val Macro-F1: 0.5626, Val Loss: 1.1411


Epoch 15/20 | Train Acc: 0.6228 | Val Acc: 0.6744 | Val Macro-F1: 0.5626 | Val Loss: 1.1411


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.36batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.82batch/s]
Epoch 16/20, Loss: 0.8242, Train Acc: 0.6274, Val Acc: 0.6308, Val Macro-F1: 0.5450, Val Loss: 1.3413


Epoch 16/20 | Train Acc: 0.6274 | Val Acc: 0.6308 | Val Macro-F1: 0.5450 | Val Loss: 1.3413


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.25batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.92batch/s]
Epoch 17/20, Loss: 0.8145, Train Acc: 0.6338, Val Acc: 0.5799, Val Macro-F1: 0.4834, Val Loss: 1.5087


Epoch 17/20 | Train Acc: 0.6338 | Val Acc: 0.5799 | Val Macro-F1: 0.4834 | Val Loss: 1.5087


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.30batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.47batch/s]
Epoch 18/20, Loss: 0.7930, Train Acc: 0.6420, Val Acc: 0.5799, Val Macro-F1: 0.5250, Val Loss: 1.5360


Epoch 18/20 | Train Acc: 0.6420 | Val Acc: 0.5799 | Val Macro-F1: 0.5250 | Val Loss: 1.5360


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.31batch/s]
Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 13.28batch/s]
Epoch 19/20, Loss: 0.7775, Train Acc: 0.6562, Val Acc: 0.5392, Val Macro-F1: 0.4848, Val Loss: 1.5593


Epoch 19/20 | Train Acc: 0.6562 | Val Acc: 0.5392 | Val Macro-F1: 0.4848 | Val Loss: 1.5593


Iterating training graphs: 100%|██████████| 89/89 [00:05<00:00, 15.35batch/s]


Checkpoint saved at /home/valerio/Desktop/noisy_labels/hackaton/checkpoints/B/model_B_epoch_20.pth


Iterating eval graphs: 100%|██████████| 22/22 [00:01<00:00, 12.75batch/s]
Epoch 20/20, Loss: 0.7518, Train Acc: 0.6647, Val Acc: 0.5887, Val Macro-F1: 0.4984, Val Loss: 1.3667


Epoch 20/20 | Train Acc: 0.6647 | Val Acc: 0.5887 | Val Macro-F1: 0.4984 | Val Loss: 1.3667
All metrics plot saved to /home/valerio/Desktop/noisy_labels/hackaton/logs/B/plots_all_1/all_metrics.png


In [22]:
import gc
del train_dataset
del train_loader
del full_dataset
del val_dataset
del val_loader
gc.collect()

19512

In [23]:
test_dataset = GraphDataset(args.test_path, transform=add_zeros)
test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)
    

In [24]:
print(f"Checkpoint path: {checkpoint_path}")
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
predictions = evaluate(test_loader, model, device, calculate_accuracy=False)
save_predictions(predictions, args.test_path)

Checkpoint path: /home/valerio/Desktop/noisy_labels/hackaton/checkpoints/model_B_best.pth


Iterating eval graphs: 100%|██████████| 49/49 [00:05<00:00,  9.55batch/s]

Predictions saved to /home/valerio/Desktop/noisy_labels/hackaton/submission/testset_B.csv





In [25]:
# from collections import Counter

# train_paths = ["./datasets/data/A/train.json.gz", 
#               "./datasets/data/B/train.json.gz", 
#               "./datasets/data/C/train.json.gz", 
#               "./datasets/data/D/train.json.gz"]

# test_paths = ["./datasets/data/A/test.json.gz", 
#               "./datasets/data/B/test.json.gz", 
#               "./datasets/data/C/test.json.gz", 
#               "./datasets/data/D/test.json.gz"]

# for train_path, test_path in zip(train_paths, test_paths):

#     args.train_path = train_path
#     args.test_path = test_path

#     model = GNN(gnn_type='gin', num_class=6, num_layer=args.num_layer, emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=True).to(device)

#     test_dir_name = os.path.basename(os.path.dirname(args.test_path))
#     logs_folder = os.path.join(script_dir, "logs", test_dir_name)
#     log_file = os.path.join(logs_folder, "training.log")
#     os.makedirs(os.path.dirname(log_file), exist_ok=True)
#     logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(message)s')
#     logging.getLogger().addHandler(logging.StreamHandler())
#     test_dir_name = os.path.basename(os.path.dirname(args.test_path))
#     logs_folder = os.path.join(script_dir, "logs", test_dir_name)

#     checkpoint_path = os.path.join(script_dir, "checkpoints", f"model_{test_dir_name}_best.pth")
#     checkpoints_folder = os.path.join(script_dir, "checkpoints", test_dir_name)
#     os.makedirs(checkpoints_folder, exist_ok=True)

#     if args.train_path:
#         # Carico l’intero dataset e lo splitto

#         full_dataset = GraphDataset(args.train_path, transform=add_zeros)
#         val_size = int(0.2 * len(full_dataset))
#         train_size = len(full_dataset) - val_size

#         generator = torch.Generator().manual_seed(12)
#         train_dataset, val_dataset = random_split(
#             full_dataset, [train_size, val_size], generator=generator
#         )

#         labels = [d.y.item() for d in train_dataset]
#         label_counts = Counter(labels)
#         total = len(labels)
#         num_classes = len(label_counts) 

#         # weights = torch.tensor([total / (num_classes * label_counts[c]) for c in range(num_classes)], 
#         #                        dtype=torch.float,
#         #                        device=device)

#         # if args.baseline_mode == 1:
#         #     criterion = torch.nn.CrossEntropyLoss(weight=weights)

#         train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4)
#         val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4)

#         num_epochs = 100
#         best_val_accuracy = 0.0   
#         # best_f1 = 0.0
#         # best_epoch = -1
#         # patience = 0
#         # wait = 0

#         train_losses = []
#         train_accuracies = []
#         val_losses = []
#         val_accuracies = []

#         if num_checkpoints > 1:
#             checkpoint_intervals = [int((i + 1) * num_epochs / num_checkpoints) for i in range(num_checkpoints)]
#         else:
#             checkpoint_intervals = [num_epochs]

#         for epoch in range(num_epochs):
#             train_loss, train_acc = train(
#                 train_loader, model, optimizer, criterion, device,
#                 save_checkpoints=(epoch + 1 in checkpoint_intervals),
#                 checkpoint_path=os.path.join(checkpoints_folder, f"model_{test_dir_name}"),
#                 current_epoch=epoch
#             )

#             val_loss, val_acc, val_f1 = evaluate(val_loader, model, device, calculate_accuracy=True)
#             print(
#                 f"Epoch {epoch+1}/{num_epochs} | "
#                 f"Train Acc: {train_acc:.4f} | "
#                 f"Val Acc: {val_acc:.4f} | "
#                 f"Val Macro-F1: {val_f1:.4f} | "
#                 f"Val Loss: {val_loss:.4f}"
#             )
#             logging.info(
#                 f"Epoch {epoch + 1}/{num_epochs}, "
#                 f"Loss: {train_loss:.4f}, "
#                 f"Train Acc: {train_acc:.4f}, "
#                 f"Val Acc: {val_acc:.4f}, "
#                 f"Val Macro-F1: {val_f1:.4f}, "
#                 f"Val Loss: {val_loss:.4f}"
#             )


#             #print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

#             train_losses.append(train_loss)
#             train_accuracies.append(train_acc)
#             val_losses.append(val_loss)
#             val_accuracies.append(val_acc)

#             # if val_f1 > best_f1 + 1e-4:
#             #     best_epoch = epoch + 1
#             #     best_f1 = val_f1
                
#             if val_acc > best_val_accuracy:
#                 best_val_accuracy = val_acc
#                 torch.save(model.state_dict(), checkpoint_path)
#                 print(f"Best model updated and saved at {checkpoint_path}")

#         plot_training_progress(train_losses, train_accuracies, os.path.join(logs_folder, "plots"))
#         plot_training_progress(val_losses, val_accuracies, os.path.join(logs_folder, "plotsVal"))
#         plot_all_metrics(train_losses, train_accuracies, val_losses, val_accuracies, os.path.join(logs_folder, "plots_all_1"))


#     import gc
#     del train_dataset
#     del train_loader
#     del full_dataset
#     del val_dataset
#     del val_loader
#     gc.collect()

#     test_dataset = GraphDataset(args.test_path, transform=add_zeros)
#     test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)

#     model.load_state_dict(torch.load(checkpoint_path, map_location=device))
#     predictions = evaluate(test_loader, model, device, calculate_accuracy=False)
#     save_predictions(predictions, args.test_path)

In [26]:
# import os
# from collections import Counter
# import torch
# from torch.utils.data import random_split
# import numpy as np

# # Imposta root_dir alla cartella che contiene A/, B/, C/, D/
# root_dir = '/home/valerio/Desktop/noisy_labels/hackaton/datasets/data'
# # oppure, se passi --train_path uguale a questo path:
# # root_dir = args.train_path

# log_file = "stats.log"
# # Pulisci il log all’avvio
# with open(log_file, "w") as f:
#     f.write("")

# for subname in sorted(os.listdir(root_dir)):
#     subdir = os.path.join(root_dir, subname)
#     train_path = os.path.join(subdir, "train.json.gz")

#     if not os.path.isdir(subdir) or not os.path.isfile(train_path):
#         print(f"Skipping {subdir}, not a valid dataset directory or missing train.json.gz")
#         continue

#     print(f"Processing dataset {subname}")

#     # Carico e splitto
#     full_dataset = GraphDataset(train_path, transform=add_zeros)
#     val_size = int(0.2 * len(full_dataset))
#     train_size = len(full_dataset) - val_size
#     generator = torch.Generator().manual_seed(12)
#     train_dataset, val_dataset = random_split(
#         full_dataset, [train_size, val_size], generator=generator
#     )

#     # Calcolo distribuzione
#     labels = [d.y.item() for d in train_dataset]
#     counts = Counter(labels)
#     total = len(labels)

#     with open(log_file, "a") as f:
#         f.write(f"=== Dataset {os.path.basename(subdir)} ===\n")
#         for cls in sorted(counts):
#             cnt = counts[cls]
#             pct = cnt / total * 100
#             f.write(f"Label {cls}: {cnt} esempi ({pct:.1f}%)\n")

#         f.write("\n--- Esempi struttura grafo ---\n")
#         for cls in sorted(counts):
#             sample = next(d for d in train_dataset if d.y.item() == cls)
#             f.write(
#                 f"Classe {cls}: num_nodes={sample.num_nodes}, "
#                 f"num_edges={sample.num_edges}\n"
#             )

#         # Statistiche aggregate
#         f.write("\n--- Statistiche aggregate per classe ---\n")
#         f.write("cls\tn_samp\tnodes_mean\tnodes_std\tedges_mean\tedges_std\n")
#         for cls in sorted(counts):
#             samples = [d for d in train_dataset if d.y.item() == cls]
#             nodes = [d.num_nodes for d in samples]
#             edges = [d.num_edges for d in samples]
#             f.write(
#                 f"{cls}\t{counts[cls]}\t"
#                 f"{np.mean(nodes):.1f}\t{np.std(nodes):.1f}\t"
#                 f"{np.mean(edges):.1f}\t{np.std(edges):.1f}\n"
#             )

#         f.write("\n\n")

#     # Una volta processata questa cartella, non serve cercare altri train.json.gz in subdirectory
#     # se sai che i dataset sono solo A–D direttamente sotto root_dir. Altrimenti commenta questa linea.
#     # break  


In [27]:
# import torch.nn.functional as F
# class SCELoss(torch.nn.Module):
#     def __init__(self, num_classes, alpha=0.1, beta=1.0):
#         super().__init__()
#         self.alpha, self.beta = alpha, beta
#         self.num_classes = num_classes

#     def forward(self, logits, targets):
#         #CCE
#         ce  = F.cross_entropy(logits, targets, reduction='none')

#         #RCE
#         pred = F.softmax(logits, dim=1).clamp(min=1e-6, max=1-1e-6)
#         one_hot = F.one_hot(targets, self.num_classes).float()
#         rce = -(1 - one_hot) * torch.log(1 - pred)
#         rce = rce.sum(dim=1)
#         return (self.alpha * ce + self.beta * rce).mean()