In [1]:
%load_ext autoreload
%autoreload 2

In [30]:
import torch
import random

import pandas as pd

from torch.utils.data import Dataset
import torch.nn.functional as F
import torch.nn as nn

from torch.optim import AdamW

import torch_geometric.transforms as T

from torch_geometric.data import Batch

from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.nn import global_add_pool
from torch_geometric.nn import GraphConv
from torch.utils.data import DataLoader

from pathlib import Path

from tqdm import tqdm

In [31]:
import sys
import os
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
parent_parent_dir = os.path.dirname(parent_dir)

sys.path.append(parent_dir)
sys.path.append(parent_parent_dir)

from DataPipeline.dataset import ZincSubgraphDatasetStep, custom_collate_GNN3
from Model.GNN3 import ModelWithEdgeFeatures
from Model.metrics import pseudo_accuracy_metric, pseudo_recall_for_each_class, pseudo_precision_for_each_class, MaskedCrossEntropyLoss

In [32]:
datapath = Path('..') / '../DataPipeline/data/preprocessed_graph_no_I_Br_P.pt'
dataset = ZincSubgraphDatasetStep(data_path = datapath, GNN_type=3)

Dataset encoded with size 7


In [33]:
loader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=custom_collate_GNN3)

In [34]:
encoding_size = 7

model = ModelWithEdgeFeatures(in_channels=encoding_size, hidden_channels_list=[64, 128, 128, 64, 32, 5], edge_channels=4, use_dropout=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


optimizer = AdamW(model.parameters(), lr=0.0001)

# criterion = MaskedCrossEntropyLoss() pas besoin y a plus simple
criterion = nn.CrossEntropyLoss()

name = 'GNN3'



In [37]:
from tqdm.notebook import tqdm as tqdm_notebook
import numpy as np

def train(loader, epoch):
    model.train()
    total_loss = 0
    
    progress_bar = tqdm_notebook(loader, desc="Training", unit="batch")

    num_output = torch.zeros(5)  # Already on CPU
    num_labels = torch.zeros(5)  # Already on CPU
    total_graphs_processed = 0
    
    for batch_idx, batch in enumerate(progress_bar):
        data = batch[0].to(device)
        node_labels = batch[1].to(device)
        mask = batch[2].to(device)
        
        optimizer.zero_grad()
        out = model(data)

        # Convert node_labels to class indices
        
        node_labels = node_labels.to(device)
        mask = mask.to(device)
        

        # Use node_labels_indices with CrossEntropyLoss
        #loss = criterion(out, node_labels, mask)
        loss = criterion(out[mask], node_labels[mask])

        print(out[mask].shape)
        print(data)
        break
        # Add softmax to out
        softmax_out = F.softmax(out, dim=1)
        
        # Calculate metrics and move tensors to CPU
        num_output += torch.sum(softmax_out[mask], dim=0).detach().cpu()
        num_labels += torch.sum(node_labels[mask], dim=0).detach().cpu()
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
        loss_value = total_loss / (data.num_graphs * (progress_bar.last_print_n + 1))
        total_graphs_processed += data.num_graphs
        
        progress_bar.set_postfix(loss=loss_value, avg_num_output=num_output / total_graphs_processed, avg_num_labels=num_labels / total_graphs_processed)

    return (
        total_loss / len(loader.dataset),
        num_output / total_graphs_processed,
        num_labels / total_graphs_processed,
    )

In [38]:
# Create a dataframe to save the training history
training_history = pd.DataFrame(columns=['epoch', 'loss', 'avg_output_vector', 'avg_label_vector'])

n_epochs = 1000
for epoch in range(1, n_epochs+1):
    loss, avg_output_vector, avg_label_vector = train(loader, epoch)
    training_history = training_history.append({'epoch': epoch, 'loss': loss, 'avg_output_vector': avg_output_vector, 'avg_label_vector': avg_label_vector}, ignore_index=True)
    #save the model(all with optimizer step, the loss ) every 5 epochs

    save_every_n_epochs = 5
    if (epoch) % save_every_n_epochs == 0:
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # Add any other relevant information you want to save here
        }
        torch.save(checkpoint, f'checkpoint_epoch_{epoch+1}_{name}.pt')
        
    #save the training history every 10 epochs
    if epoch % 1 == 0:
        training_history.to_csv(f"training_history_{name}.csv", index=False)
    print(f'Epoch: {epoch}, Loss: {loss:.8f}')

Training:   0%|          | 0/1845 [00:00<?, ?batch/s]

torch.Size([320, 5])
DataBatch(x=[1608, 7], edge_index=[2, 3208], edge_attr=[3208, 4], y=[896], cycle_label=[1608, 5], mask=[1608], terminal_node_info=[128], batch=[1608], ptr=[129])


AttributeError: 'DataFrame' object has no attribute 'append'