In [1]:
 !pip install torch_geometric


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Colab-only Google Drive mount, disabled by wrapping the code in a string literal.
# The bare string is still evaluated as an expression, which is why the cell echoes it as output.
"""from google.colab import drive
drive.mount('/content/drive')"""

"from google.colab import drive\ndrive.mount('/content/drive')"

In [3]:
# Colab-only working-directory change, disabled by wrapping the magic in a string literal.
"""%cd /content/drive/MyDrive/DeepLearning/hackaton"""

'%cd /content/drive/MyDrive/DeepLearning/hackaton'

In [4]:
# Imports
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
# PyG
from torch_geometric.loader import DataLoader
from sklearn.model_selection import StratifiedShuffleSplit
from torch.utils.data import Subset

# Project-local modules
from src.loadData    import GraphDataset
from src.models      import GNN
from src.transforms import EdgeDropout, NodeDropout, Compose, GraphMixUp
from src.losses import (
    GeneralizedCELoss,
    SymmetricCELoss,
    estimate_transition_matrix,
    ForwardCorrectionLoss,
    BootstrappingLoss
)
from src.pretraining import GraphCLTrainer, add_zeros
from src.divide_mix_def import DivideMixTrainer

# Fix the random seeds of PyTorch (CPU and all CUDA devices), NumPy and Python's
# `random` module so that repeated runs start from the same random state.
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
np.random.seed(42)
random.seed(42)



In [5]:
def add_zeros(data):
    """Give a graph without node features an all-zero integer feature vector.

    When ``data.x`` is missing or ``None``, it is set to a 1-D ``torch.long``
    tensor of zeros with ``data.num_nodes`` entries. Graphs that already carry
    an ``x`` attribute are left untouched.

    NOTE(review): this definition shadows the ``add_zeros`` imported from
    ``src.pretraining`` earlier in the notebook.

    Args:
        data: Graph object exposing ``num_nodes`` and, optionally, ``x``.

    Returns:
        The same ``data`` object (modified in place when features were missing).
    """
    # getattr with a default covers both the "attribute absent" and "attribute is None" cases.
    if getattr(data, 'x', None) is None:
        data.x = torch.zeros(data.num_nodes, dtype=torch.long)
    return data

In [6]:
def train(data_loader, model, optimizer, criterion, device, save_checkpoints, checkpoint_path, current_epoch):
    """Run one training epoch over ``data_loader`` and optionally save a checkpoint.

    Args:
        data_loader: Iterable of batches; each batch must support ``.to(device)``
            and expose the targets as ``.y``.
        model: Module called as ``model(batch)`` and returning per-class scores.
        optimizer: Optimizer stepping the parameters of ``model``.
        criterion: Callable ``criterion(output, targets)`` returning a scalar loss.
        device: Device the model and the batches are moved to.
        save_checkpoints: When truthy, the model ``state_dict`` is saved after the epoch.
        checkpoint_path: Path prefix; the file written is
            ``f"{checkpoint_path}_epoch_{current_epoch + 1}.pth"``.
        current_epoch: Zero-based epoch index (only used for the checkpoint name).

    Returns:
        Tuple ``(mean_batch_loss, accuracy)``. Both values are ``0.0`` when the
        loader yields no batches instead of raising ``ZeroDivisionError``.
    """
    model = model.to(device)
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0  # counted explicitly so loaders without __len__ also work

    for data in tqdm(data_loader, desc="Iterating training graphs", unit="batch"):
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        pred = output.argmax(dim=1)
        correct += (pred == data.y).sum().item()
        total += data.y.size(0)
        num_batches += 1

    # Save checkpoints if required
    if save_checkpoints:
        checkpoint_file = f"{checkpoint_path}_epoch_{current_epoch + 1}.pth"
        # torch.save does not create missing parent directories.
        checkpoint_dir = os.path.dirname(checkpoint_file)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(model.state_dict(), checkpoint_file)
        print(f"Checkpoint saved at {checkpoint_file}")

    # Guard the averages so an empty loader does not crash the epoch.
    avg_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = correct / total if total else 0.0

    print(f"Train loss/acc: {avg_loss:.4f}/{accuracy:.4f}")
    return avg_loss, accuracy

In [7]:
def evaluate(data_loader, model, device, calculate_accuracy=False):
    """Run ``model`` over ``data_loader`` without gradients.

    Args:
        data_loader: Iterable of batches; each batch must support ``.to(device)``.
            Targets are read from ``.y`` only when ``calculate_accuracy`` is true.
        model: Module called as ``model(batch)`` and returning per-class scores.
        device: Device the model and the batches are moved to.
        calculate_accuracy: Selects the return value (see below).

    Returns:
        * ``calculate_accuracy=True``: tuple ``(mean_batch_loss, accuracy)`` where
          the loss is ``torch.nn.CrossEntropyLoss`` averaged over batches. Both
          values are ``0.0`` for an empty loader instead of raising
          ``ZeroDivisionError``.
        * ``calculate_accuracy=False``: flat list with the predicted class of
          every sample, in loader order.
    """
    model = model.to(device)
    model.eval()
    correct = 0
    total = 0
    num_batches = 0  # counted explicitly so loaders without __len__ also work
    predictions = []
    total_loss = 0.0
    criterion = torch.nn.CrossEntropyLoss()
    with torch.no_grad():
        for data in tqdm(data_loader, desc="Iterating eval graphs", unit="batch"):
            data = data.to(device)
            output = model(data)
            pred = output.argmax(dim=1)

            if calculate_accuracy:
                correct += (pred == data.y).sum().item()
                total += data.y.size(0)
                total_loss += criterion(output, data.y).item()
                num_batches += 1
            else:
                predictions.extend(pred.cpu().numpy())

    if calculate_accuracy:
        # Guard the averages so an empty loader does not crash the evaluation.
        avg_loss = total_loss / num_batches if num_batches else 0.0
        accuracy = correct / total if total else 0.0
        return avg_loss, accuracy
    return predictions

In [8]:
def save_predictions(predictions, test_path):
    """Write predictions to ``<cwd>/submission/testset_<name>.csv``.

    ``<name>`` is the name of the directory that directly contains
    ``test_path``. The CSV has two columns: ``id`` (0..len(predictions)-1) and
    ``pred``. The ``submission`` folder is created when missing.

    Args:
        predictions: Sequence of predicted labels, one per test graph.
        test_path: Path of the test file; only its parent directory name is used.
    """
    dataset_tag = os.path.basename(os.path.dirname(test_path))

    out_dir = os.path.join(os.getcwd(), "submission")
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, f"testset_{dataset_tag}.csv")

    # Ids are simply the positions of the predictions.
    frame = pd.DataFrame({
        "id": list(range(len(predictions))),
        "pred": predictions,
    })
    frame.to_csv(csv_path, index=False)

    print(f"Predictions saved to {csv_path}")

In [9]:
def plot_training_progress(train_losses, train_accuracies, save_plot, output_dir):
    """Draw training loss and accuracy per epoch as two side-by-side panels.

    Args:
        train_losses: Per-epoch loss values; its length defines the epoch axis.
        train_accuracies: Per-epoch accuracy values.
        save_plot: When truthy, the figure is written to
            ``<output_dir>/training_progress.png`` and then closed.
        output_dir: Target directory (created when missing); only used when saving.
    """
    epochs = range(1, len(train_losses) + 1)
    plt.figure(figsize=(12, 6))

    # (series, legend label, line colour, y-axis label, panel title) for each panel.
    panels = (
        (train_losses, "Training Loss", 'blue', 'Loss', 'Training Loss per Epoch'),
        (train_accuracies, "Training Accuracy", 'green', 'Accuracy', 'Training Accuracy per Epoch'),
    )
    for position, (series, label, color, y_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, series, label=label, color=color)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.title(title)

    if not save_plot:
        return

    # Save the figure inside output_dir, then release it.
    os.makedirs(output_dir, exist_ok=True)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "training_progress.png"))
    plt.close()

In [None]:
# Hyper-parameters
# Run on the GPU when one is available, otherwise on the CPU.
device                =  "cuda" if torch.cuda.is_available() else "cpu"

# Dataset & I/O
num_classes           = 6
batch_size            = 32

# Model
gnn_type              = 'gin-virtual'    # choose among 'gin','gin-virtual','gcn','gcn-virtual'
num_layer             = 5
emb_dim               = 256
drop_ratio            = 0.4
lr                    = 5e-4
weight_decay          = 5e-4
epochs                = 50

# NOTE(review): `epochs` above and the six names below are not referenced by any
# cell visible in this notebook; they look like leftovers from an earlier
# training loop - confirm before removing.
num_epochs_initial = 5
num_epochs_remaining = 50
tau = 0.5
patience = 10
best_val_acc = 0.0
trigger_times = 0


# Transforms
edge_p  = 0.2   # EdgeDropout
node_p  = 0.2   # NodeDropout (NOTE(review): unused - no NodeDropout is added to the pipeline below)

# Transform pipeline handed to GraphDataset: EdgeDropout first, then add_zeros
# (which fills in missing node features).
transforms = Compose([
    EdgeDropout(p=edge_p),
    add_zeros,
])

# Co-Teaching (no settings are defined under this heading)

In [11]:
def create_gnn_model(gnn_type, num_classes, num_layer, emb_dim, drop_ratio, device):
    """Build a ``GNN`` with residual connections and move it to ``device``.

    Args:
        gnn_type: Architecture name. A ``"-virtual"`` suffix is stripped before
            it is passed on, and any name containing ``"virtual"`` enables the
            ``virtual_node`` option.
        num_classes: Forwarded to ``GNN`` as ``num_class``.
        num_layer: Forwarded to ``GNN`` as ``num_layer``.
        emb_dim: Forwarded to ``GNN`` as ``emb_dim``.
        drop_ratio: Forwarded to ``GNN`` as ``drop_ratio``.
        device: Device the model is moved to.

    Returns:
        The result of ``GNN(...).to(device)``.
    """
    use_virtual_node = "virtual" in gnn_type
    base_type = gnn_type.replace("-virtual", "")

    network = GNN(
        gnn_type=base_type,
        num_class=num_classes,
        num_layer=num_layer,
        emb_dim=emb_dim,
        drop_ratio=drop_ratio,
        virtual_node=use_virtual_node,
        residual=True,
    )
    return network.to(device)


def create_optimizer_and_scheduler(model, lr=1e-3, weight_decay=1e-4, t_max=50, eta_min=1e-5):
    """Create an Adam optimizer with a cosine-annealing learning-rate schedule.

    Args:
        model: Module whose ``parameters()`` are optimized.
        lr: Learning rate passed to Adam.
        weight_decay: Weight decay passed to Adam.
        t_max: ``T_max`` of ``CosineAnnealingLR``.
        eta_min: ``eta_min`` of ``CosineAnnealingLR``.

    Returns:
        Tuple ``(optimizer, scheduler)``.
    """
    adam = torch.optim.Adam(params=model.parameters(), lr=lr, weight_decay=weight_decay)
    cosine = torch.optim.lr_scheduler.CosineAnnealingLR(adam, T_max=t_max, eta_min=eta_min)
    return adam, cosine


In [12]:
from copy import deepcopy
import torch

class CurriculumTrainer:
    """Training loop with early stopping, loss-based sample filtering and forward correction.

    The loop in :meth:`train` trains on the full training set, then

    * after the epoch with zero-based index ``curriculum_epoch`` it replaces the
      training loader with one that keeps only the lowest-loss samples, and
    * after the epoch with zero-based index ``forward_corr_epoch`` it replaces
      the criterion with a ``ForwardCorrectionLoss`` built from the model's own
      confusion matrix.

    The weights with the best validation accuracy are restored at the end.

    Attributes set by ``__init__``: ``model``, ``full_train_dataset``,
    ``val_dataset``, ``criterion``, ``optimizer``, ``scheduler``, ``device``,
    ``num_classes``, ``best_val_acc`` (starts at 0) and ``best_model``
    (a ``state_dict`` snapshot, ``None`` until the first validation).
    """

    def __init__(self, model, train_dataset, val_dataset, criterion,
                 optimizer, scheduler, device, num_classes):
        self.model = model
        self.full_train_dataset = train_dataset  # Keep original dataset
        self.val_dataset = val_dataset
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        self.num_classes = num_classes
        self.best_val_acc = 0
        self.best_model = None

    def _create_filtered_loader(self, model, loader, keep_frac=0.85):
        """Return a new shuffled loader restricted to the lowest-loss samples.

        Every sample of ``loader`` is scored with
        ``self.criterion(outputs, batch.y, reduction='none')``, so the current
        criterion must accept a ``reduction`` keyword. Samples whose loss is at
        most the ``keep_frac`` quantile are kept.

        Each batch must expose ``idx``; the kept values are used as positions
        into ``self.full_train_dataset``.

        Args:
            model: Model used for scoring (put into eval mode here).
            loader: Loader to score; its ``batch_size`` and ``num_workers`` are reused.
            keep_frac: Quantile of the per-sample loss used as the keep threshold.
        """
        model.eval()
        indices = []
        losses = []

        with torch.no_grad():
            for batch in loader:
                batch = batch.to(self.device)
                outputs = model(batch)
                loss = self.criterion(outputs, batch.y, reduction='none')
                losses.append(loss.cpu())
                indices.extend(batch.idx.tolist())  # Assuming each graph has an idx

        losses = torch.cat(losses)
        threshold = losses.quantile(keep_frac)
        # One vectorized comparison instead of one tensor comparison per sample.
        keep_mask = (losses <= threshold).tolist()
        clean_indices = [idx for idx, keep in zip(indices, keep_mask) if keep]

        return DataLoader(
            Subset(self.full_train_dataset, clean_indices),
            batch_size=loader.batch_size,
            shuffle=True,
            num_workers=loader.num_workers
        )

    def train(self, num_epochs=100, patience=10,
              curriculum_epoch=5, forward_corr_epoch=10,
              batch_size=32, num_workers=4):
        """Train the model and return it with the best validation weights loaded.

        Args:
            num_epochs: Maximum number of epochs.
            patience: Number of consecutive epochs without validation
                improvement after which training stops.
            curriculum_epoch: Zero-based epoch after which low-loss filtering
                of the training set is activated.
            forward_corr_epoch: Zero-based epoch after which the criterion is
                replaced by the estimated forward-correction loss.
            batch_size: Batch size of the initial training loader.
            num_workers: Worker count of the initial training loader.

        Returns:
            ``self.model``. The best snapshot is loaded when one was recorded
            (i.e. when at least one epoch ran).
        """
        # Initial loader uses full dataset
        current_train_loader = DataLoader(
            self.full_train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers
        )

        # Must exist before the first comparison: without it a first epoch that
        # does not improve on best_val_acc raised UnboundLocalError.
        patience_counter = 0

        for epoch in range(num_epochs):
            self.model.train()
            total_loss, total_correct, total_samples = 0, 0, 0

            for batch in tqdm(current_train_loader, desc=f"Epoch {epoch+1}"):
                batch = batch.to(self.device)

                # Forward pass
                outputs = self.model(batch)
                loss = self.criterion(outputs, batch.y)

                # Backward pass
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                # Metrics (loss is weighted by batch size to get a per-sample mean)
                total_loss += loss.item() * batch.y.size(0)
                total_correct += (outputs.argmax(1) == batch.y).sum().item()
                total_samples += batch.y.size(0)

            # Validation
            val_acc = self.validate()
            avg_loss = total_loss / total_samples if total_samples else 0.0
            train_acc = total_correct / total_samples if total_samples else 0.0

            print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, Train Acc={train_acc:.4f}, Val Acc={val_acc:.4f}")

            # Early stopping check. The first validated epoch always records a
            # snapshot so that there is something to restore at the end.
            if self.best_model is None or val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                self.best_model = deepcopy(self.model.state_dict())
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered!")
                    break

            # Curriculum learning (create new loader)
            if epoch == curriculum_epoch:
                print("Activating curriculum learning...")
                current_train_loader = self._create_filtered_loader(
                    self.model, current_train_loader
                )

            # Forward correction
            if epoch == forward_corr_epoch:
                print("Activating forward correction...")
                self.criterion = self._estimate_forward_correction(current_train_loader)

            self.scheduler.step()

        # Load best model (nothing to restore when no epoch ran)
        if self.best_model is not None:
            self.model.load_state_dict(self.best_model)
        return self.model

    def _estimate_forward_correction(self, loader):
        """Estimate a transition matrix from the model's predictions and wrap it in a loss.

        A ``num_classes x num_classes`` count matrix is accumulated over
        ``loader`` with rows indexed by the given label ``batch.y`` and columns
        by the predicted class. Rows are normalized and then blended with the
        identity (``(T + 0.1 * I) / 1.1``).

        NOTE(review): confirm that this row/column orientation is the one
        ``ForwardCorrectionLoss`` expects.

        Returns:
            ``ForwardCorrectionLoss(T)`` moved to ``self.device``.
        """
        self.model.eval()
        conf_matrix = torch.zeros(self.num_classes, self.num_classes)

        with torch.no_grad():
            for batch in loader:
                batch = batch.to(self.device)
                outputs = self.model(batch)
                preds = outputs.argmax(1)
                # Convert once per batch to Python ints instead of indexing with
                # one (possibly GPU-resident) tensor element at a time.
                label_list = batch.y.reshape(-1).tolist()
                pred_list = preds.tolist()
                for given_label, predicted in zip(label_list, pred_list):
                    conf_matrix[given_label, predicted] += 1

        # Normalize and smooth
        T = conf_matrix / (conf_matrix.sum(1, keepdim=True) + 1e-6)
        T = (T + torch.eye(self.num_classes, device=T.device)*0.1) / 1.1  # Smoothing

        return ForwardCorrectionLoss(T).to(self.device)

    def validate(self):
        """Return the accuracy of ``self.model`` on ``self.val_dataset``.

        Returns ``0.0`` for an empty validation set instead of raising
        ``ZeroDivisionError``.
        """
        self.model.eval()
        val_loader = DataLoader(self.val_dataset, batch_size=32, shuffle=False)
        correct, total = 0, 0

        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(self.device)
                outputs = self.model(batch)
                preds = outputs.argmax(1)
                correct += (preds == batch.y).sum().item()
                total += batch.y.size(0)

        return correct / total if total else 0.0


In [13]:
def split_dataset(dataset: GraphDataset, val_ratio=0.1, seed=42):
    """Split ``dataset`` into stratified train and validation lists.

    Labels are read from ``data.y.item()`` of every element and used to
    stratify ``train_test_split``. The selected graphs are fetched from
    ``dataset`` one by one and each gets an ``idx`` attribute equal to its
    position inside the list it ends up in.

    Args:
        dataset: Indexable, iterable collection of graphs with a scalar ``y``.
        val_ratio: Fraction of the samples assigned to the validation list.
        seed: ``random_state`` passed to ``train_test_split``.

    Returns:
        Tuple ``(train_subset, val_subset)`` of plain Python lists.
    """
    labels = torch.tensor([data.y.item() for data in dataset])

    train_idx, val_idx = train_test_split(
        list(range(len(dataset))),
        test_size=val_ratio,
        stratify=labels,
        random_state=seed
    )

    def _collect(positions):
        # Fetch each graph and renumber it with its position in the new list.
        graphs = []
        for new_idx, original_idx in enumerate(positions):
            graph = dataset[original_idx]
            graph.idx = new_idx  # Normalize index
            graphs.append(graph)
        return graphs

    train_subset = _collect(train_idx)
    val_subset = _collect(val_idx)
    return train_subset, val_subset


In [14]:
import gc
# Main training loop: for every training file, train a model with CurriculumTrainer,
# predict on the matching test file and write the submission CSV.
train_datasets_path = ["datasets/D/train.json.gz"]

for ds in train_datasets_path:
    # Derive the test path by renaming only the file name, so a directory that
    # happens to contain "train" in its name is left untouched.
    test_path = os.path.join(os.path.dirname(ds), os.path.basename(ds).replace("train", "test"))

    print("Generating dataset")
    # A loading failure is allowed to propagate: the previous bare `except` ran
    # `del full_dataset` (a NameError when the very first load failed) and then
    # carried on with a missing or stale dataset.
    full_dataset = GraphDataset(ds, transform=transforms).shuffle()

    print("Splitting dataset")
    train_set, val_set = split_dataset(full_dataset, 0.2, 42)

    # Model and optimisation settings come from the hyper-parameter cell.
    model = create_gnn_model(gnn_type, num_classes=num_classes, num_layer=num_layer, emb_dim=emb_dim, drop_ratio=drop_ratio, device=device)
    optimizer, scheduler = create_optimizer_and_scheduler(model, lr=lr, weight_decay=weight_decay)

    criterion = SymmetricCELoss(alpha=0.4, beta=0.6).to(device)

    trainer = CurriculumTrainer(
        model=model,
        train_dataset=train_set,
        val_dataset=val_set,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        num_classes=num_classes
    )

    # Train
    best_model = trainer.train(
        num_epochs=100,
        patience=20,
        curriculum_epoch=8,
        forward_corr_epoch=15
    )

    # Free the training data before the test set is loaded.
    del val_set, train_set, model, trainer, full_dataset
    gc.collect()

    # Test and predictions. The test graphs are loaded without the random
    # EdgeDropout augmentation (only add_zeros) so that predictions are made on
    # the unmodified graphs and are reproducible.
    test_loader = DataLoader(GraphDataset(test_path, transform=add_zeros), batch_size=batch_size)
    predictions = evaluate(test_loader, best_model, device, False)
    save_predictions(predictions=predictions, test_path=test_path)

    del test_loader, criterion, optimizer, scheduler, best_model, predictions
    gc.collect()
    torch.cuda.empty_cache()

Generating dataset
Splitting dataset


Epoch 1: 100%|██████████| 257/257 [00:08<00:00, 30.15it/s]


Epoch 1: Loss=9.2879, Train Acc=0.3000, Val Acc=0.3487


Epoch 2: 100%|██████████| 257/257 [00:08<00:00, 30.61it/s]


Epoch 2: Loss=8.4548, Train Acc=0.3684, Val Acc=0.3750


Epoch 3: 100%|██████████| 257/257 [00:08<00:00, 30.81it/s]


Epoch 3: Loss=8.0364, Train Acc=0.3940, Val Acc=0.3755


Epoch 4: 100%|██████████| 257/257 [00:08<00:00, 30.82it/s]


Epoch 4: Loss=7.6564, Train Acc=0.4190, Val Acc=0.4008


Epoch 5: 100%|██████████| 257/257 [00:08<00:00, 31.40it/s]


Epoch 5: Loss=7.3039, Train Acc=0.4475, Val Acc=0.4694


Epoch 6: 100%|██████████| 257/257 [00:08<00:00, 30.71it/s]


Epoch 6: Loss=7.0688, Train Acc=0.4615, Val Acc=0.2738


Epoch 7: 100%|██████████| 257/257 [00:08<00:00, 30.64it/s]


Epoch 7: Loss=6.8667, Train Acc=0.4785, Val Acc=0.4723


Epoch 8: 100%|██████████| 257/257 [00:08<00:00, 30.71it/s]


Epoch 8: Loss=6.6115, Train Acc=0.4977, Val Acc=0.3536


Epoch 9: 100%|██████████| 257/257 [00:08<00:00, 30.84it/s]


Epoch 9: Loss=6.4873, Train Acc=0.5039, Val Acc=0.4027
Activating curriculum learning...


Epoch 10: 100%|██████████| 219/219 [00:07<00:00, 31.24it/s]


Epoch 10: Loss=6.4127, Train Acc=0.5004, Val Acc=0.4105


Epoch 11: 100%|██████████| 219/219 [00:07<00:00, 31.22it/s]


Epoch 11: Loss=6.2984, Train Acc=0.5149, Val Acc=0.4737


Epoch 12: 100%|██████████| 219/219 [00:07<00:00, 31.17it/s]


Epoch 12: Loss=6.1505, Train Acc=0.5240, Val Acc=0.3658


Epoch 13: 100%|██████████| 219/219 [00:07<00:00, 29.67it/s]


Epoch 13: Loss=6.1227, Train Acc=0.5243, Val Acc=0.4611


Epoch 14: 100%|██████████| 219/219 [00:07<00:00, 30.20it/s]


Epoch 14: Loss=5.9858, Train Acc=0.5372, Val Acc=0.5545


Epoch 15: 100%|██████████| 219/219 [00:07<00:00, 29.55it/s]


Epoch 15: Loss=5.9783, Train Acc=0.5371, Val Acc=0.5676


Epoch 16: 100%|██████████| 219/219 [00:06<00:00, 31.42it/s]


Epoch 16: Loss=5.7578, Train Acc=0.5542, Val Acc=0.3463
Activating forward correction...


Epoch 17: 100%|██████████| 219/219 [00:07<00:00, 30.08it/s]


Epoch 17: Loss=1.6695, Train Acc=0.4700, Val Acc=0.3502


Epoch 18: 100%|██████████| 219/219 [00:07<00:00, 30.83it/s]


Epoch 18: Loss=1.6078, Train Acc=0.4601, Val Acc=0.4523


Epoch 19: 100%|██████████| 219/219 [00:07<00:00, 30.49it/s]


Epoch 19: Loss=1.5884, Train Acc=0.4805, Val Acc=0.4859


Epoch 20: 100%|██████████| 219/219 [00:07<00:00, 30.50it/s]


Epoch 20: Loss=1.6026, Train Acc=0.4629, Val Acc=0.4509


Epoch 21: 100%|██████████| 219/219 [00:07<00:00, 31.13it/s]


Epoch 21: Loss=1.5833, Train Acc=0.4738, Val Acc=0.5389


Epoch 22: 100%|██████████| 219/219 [00:07<00:00, 30.79it/s]


Epoch 22: Loss=1.5756, Train Acc=0.4795, Val Acc=0.4139


Epoch 23: 100%|██████████| 219/219 [00:07<00:00, 30.51it/s]


Epoch 23: Loss=1.5812, Train Acc=0.4715, Val Acc=0.4912


Epoch 24: 100%|██████████| 219/219 [00:07<00:00, 30.48it/s]


Epoch 24: Loss=1.5793, Train Acc=0.4734, Val Acc=0.4236


Epoch 25: 100%|██████████| 219/219 [00:06<00:00, 31.72it/s]


Epoch 25: Loss=1.5821, Train Acc=0.4712, Val Acc=0.4640


Epoch 26: 100%|██████████| 219/219 [00:07<00:00, 30.51it/s]


Epoch 26: Loss=1.5665, Train Acc=0.4790, Val Acc=0.5131


Epoch 27: 100%|██████████| 219/219 [00:07<00:00, 30.45it/s]


Epoch 27: Loss=1.5681, Train Acc=0.4735, Val Acc=0.3624


Epoch 28: 100%|██████████| 219/219 [00:07<00:00, 30.87it/s]


Epoch 28: Loss=1.5678, Train Acc=0.4757, Val Acc=0.4689


Epoch 29: 100%|██████████| 219/219 [00:07<00:00, 30.99it/s]


Epoch 29: Loss=1.5748, Train Acc=0.4765, Val Acc=0.5063


Epoch 30: 100%|██████████| 219/219 [00:06<00:00, 31.81it/s]


Epoch 30: Loss=1.5600, Train Acc=0.4824, Val Acc=0.5151


Epoch 31: 100%|██████████| 219/219 [00:06<00:00, 31.30it/s]


Epoch 31: Loss=1.5647, Train Acc=0.4794, Val Acc=0.5000


Epoch 32: 100%|██████████| 219/219 [00:07<00:00, 30.28it/s]


Epoch 32: Loss=1.5538, Train Acc=0.4838, Val Acc=0.5457


Epoch 33: 100%|██████████| 219/219 [00:07<00:00, 31.07it/s]


Epoch 33: Loss=1.5518, Train Acc=0.4910, Val Acc=0.4567


Epoch 34: 100%|██████████| 219/219 [00:07<00:00, 29.59it/s]


Epoch 34: Loss=1.5538, Train Acc=0.4861, Val Acc=0.5302


Epoch 35: 100%|██████████| 219/219 [00:07<00:00, 30.86it/s]


Epoch 35: Loss=1.5525, Train Acc=0.4835, Val Acc=0.4981
Early stopping triggered!


Iterating eval graphs: 100%|██████████| 71/71 [00:05<00:00, 12.40batch/s]


Predictions saved to /home/flaviolinux/uni/deep_learning/hackaton/submission/testset_D.csv


In [15]:
# Final cleanup: ask PyTorch to release its cached CUDA memory, then run a
# garbage-collection pass (the cell output is the return value of gc.collect()).
torch.cuda.empty_cache()
gc.collect()

0