In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import random

import pandas as pd

from torch.utils.data import Dataset
import torch.nn.functional as F
import torch.nn as nn

from torch.optim import AdamW

import torch_geometric.transforms as T

from torch_geometric.data import Batch

from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.nn import global_add_pool
from torch_geometric.nn import GraphConv
from torch.utils.data import DataLoader

from pathlib import Path

from tqdm import tqdm


import concurrent.futures

In [3]:
import sys
import os
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
sys.path.append(parent_dir)
from DataPipeline.dataset import ZincPreloadDataset, custom_collate_passive_add_feature
from Model.GNN1 import ModelWithEdgeFeatures
from Model.metrics import pseudo_accuracy_metric, pseudo_recall_for_each_class, pseudo_precision_for_each_class


In [4]:
def create_data_loader(number_reference_dict, dir_path, custom_collate, batch_size, shuffle, num_workers):
    """Build a ``DataLoader`` over a freshly constructed ``ZincPreloadDataset``.

    Args:
        number_reference_dict: Forwarded to ``ZincPreloadDataset`` as
            ``number_reference_dict``.
        dir_path: Forwarded to ``ZincPreloadDataset`` as ``data_dir``.
        custom_collate: Callable used as the loader's ``collate_fn``.
        batch_size: Number of dataset items per batch.
        shuffle: Whether the loader reshuffles the dataset on each pass.
        num_workers: Number of worker processes used for loading.

    Returns:
        A ``torch.utils.data.DataLoader`` configured with the given arguments.
    """
    dataset = ZincPreloadDataset(number_reference_dict=number_reference_dict, data_dir=dir_path)
    # Honour the caller's settings; previously every argument after
    # ``dir_path`` was ignored in favour of hard-coded values.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=custom_collate,
    )
    

In [5]:
import numpy as np 
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm as tqdm_notebook

# Width of the model output / label vectors used throughout this notebook.
encoding_size = 7

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the network and move it to the selected device in one step.
model = ModelWithEdgeFeatures(
    num_classes=encoding_size,
    in_channels=encoding_size + 1,
    hidden_channels_list=[64, 128, 256, 512, 512],
    mlp_hidden_channels=512,
    edge_channels=4,
    use_dropout=False,
    size_info=True,
).to(device)

from sklearn.utils import class_weight
import numpy as np

# Optimizer over all model parameters.
optimizer = AdamW(params=model.parameters(), lr=1e-4)

# Multiclass loss.
criterion = nn.CrossEntropyLoss()

# Tag embedded in the checkpoint and history file names.
name = 'new_balanced_10-4'

# Training function

from tqdm.notebook import tqdm as tqdm_notebook

def train(loader, epoch):
    """Run one training pass over ``loader`` and track running metrics.

    Relies on the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``device`` and ``encoding_size``. Each batch is expected to be indexable,
    with ``batch[0]`` the graph batch (exposing ``.to(device)`` and
    ``.num_graphs``) and ``batch[1]`` the target tensor.

    Args:
        loader: Iterable of batches; must also expose ``loader.dataset`` with a
            length, which is used to normalise the returned loss.
        epoch: Current epoch number. Accepted for interface compatibility; it
            is not used inside the function.

    Returns:
        A 4-tuple ``(loss, avg_label_vector, avg_output_vector, avg_correct)``:
        the summed loss divided by ``len(loader.dataset)``, the graph-weighted
        mean of the targets, the graph-weighted mean of the raw model outputs,
        and the accumulated ``pseudo_accuracy_metric`` divided by the number of
        graphs processed.
    """
    model.train()
    total_loss = 0
    mse_sum = 0
    num_correct = 0
    num_correct_recall = torch.zeros(encoding_size)
    num_correct_precision = torch.zeros(encoding_size)
    count_per_class_recall = torch.zeros(encoding_size)
    count_per_class_precision = torch.zeros(encoding_size)
    progress_bar = tqdm_notebook(loader, desc="Training", unit="batch")

    avg_output_vector = np.zeros(encoding_size)  # graph-weighted sum of model outputs
    avg_label_vector = np.zeros(encoding_size)  # graph-weighted sum of targets
    total_graphs_processed = 0

    # Defined up front so the return statement is valid even if the loader
    # yields no batches.
    current_avg_output_vector = avg_output_vector.copy()
    current_avg_label_vector = avg_label_vector.copy()
    avg_correct = 0

    for batch in progress_bar:
        data = batch[0].to(device)
        terminal_node_infos = batch[1].to(device)

        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, terminal_node_infos)

        # Move predictions and targets to the CPU once and reuse them for every
        # metric instead of repeating the transfer per call.
        out_cpu = out.detach().cpu()
        target_cpu = terminal_node_infos.detach().cpu()

        num_correct += pseudo_accuracy_metric(out_cpu, target_cpu, random=True)
        recall_output = pseudo_recall_for_each_class(out_cpu, target_cpu, random=True)
        precision_output = pseudo_precision_for_each_class(out_cpu, target_cpu, random=True)
        num_correct_recall += recall_output[0]
        num_correct_precision += precision_output[0]
        count_per_class_recall += recall_output[1]
        count_per_class_precision += precision_output[1]

        loss.backward()
        optimizer.step()

        batch_graphs = data.num_graphs
        total_graphs_processed += batch_graphs

        # Running averages are normalised by the number of graphs seen so far.
        # The previous formula used tqdm's ``last_print_n`` (a throttled
        # display counter) times the current batch size, which does not match
        # the amount of data actually accumulated.
        total_loss += loss.item() * batch_graphs
        loss_value = total_loss / total_graphs_processed

        # MSE between targets and the raw model outputs for this batch.
        mse = mean_squared_error(target_cpu, out_cpu)
        mse_sum += mse * batch_graphs
        mse_value = mse_sum / total_graphs_processed

        # Accumulate graph-weighted sums, then derive the running means.
        avg_output_vector += out_cpu.numpy().mean(axis=0) * batch_graphs
        avg_label_vector += target_cpu.numpy().mean(axis=0) * batch_graphs
        current_avg_output_vector = avg_output_vector / total_graphs_processed
        current_avg_label_vector = avg_label_vector / total_graphs_processed
        avg_correct = num_correct / total_graphs_processed

        # Per-class ratios; a class with a zero count yields NaN here.
        avg_correct_recall = num_correct_recall / count_per_class_recall
        avg_correct_precision = num_correct_precision / count_per_class_precision
        avg_f1 = 2 * (avg_correct_recall * avg_correct_precision) / (avg_correct_recall + avg_correct_precision)

        progress_bar.set_postfix(loss=loss_value, mse=mse_value, avg_output_vector=current_avg_output_vector,
                                 avg_label_vector=current_avg_label_vector,
                                 avg_correct=avg_correct, num_correct=num_correct,
                                 total_graphs_processed=total_graphs_processed,
                                 avg_correct_precision=avg_correct_precision,
                                 avg_correct_recall=avg_correct_recall,
                                 avg_f1=avg_f1,
                                 count_per_class_precision=count_per_class_precision,
                                 count_per_class_recall=count_per_class_recall)

    return total_loss / len(loader.dataset), current_avg_label_vector, current_avg_output_vector, avg_correct

# Train the model

# Per-epoch records are collected in a list and turned into a DataFrame once
# per epoch. ``DataFrame.append`` no longer exists in pandas >= 2.0 and copied
# the whole frame on every call.
history_columns = ['epoch', 'loss', 'mse', 'avg_output_vector', 'avg_label_vector', 'avg_correct']
history_records = []
training_history = pd.DataFrame(columns=history_columns)


n_epochs = 100
save_every_n_epochs = 5
dir_path = Path('..') / 'DataPipeline' / 'data' / 'prepared_new_dataset'

# Per-key counter passed to ZincPreloadDataset (presumably the index of the
# data file to load for each key -- confirm against DataPipeline.dataset).
number_reference_dict = {'C' : '0', 'N' : '0', 'O' : '0', 'S' : '0', 'F' : '0', 'Cl' : '0', 'stop' : '0'}
# Exclusive upper bound for each counter; counters wrap back to 0 via modulo.
max_reference_dict = {'C' : 300, 'N' : 100, 'O' : 100, 'S' : 18, 'F' : 15, 'Cl' : 8, 'stop' : 500}


for epoch in range(1, n_epochs+1):

    # A new dataset/loader is built every epoch from the current counters.
    dataset = ZincPreloadDataset(number_reference_dict=number_reference_dict, data_dir=dir_path)
    loader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=custom_collate_passive_add_feature)

    # Advance every counter by one (with wrap-around) for the next epoch.
    # Only existing keys are reassigned, so iterating over items() is safe.
    for key, value in number_reference_dict.items():
        number_reference_dict[key] = str((int(value) + 1) % max_reference_dict[key])

    loss, avg_label_vector, avg_output_vector, avg_correct = train(loader, epoch)
    history_records.append({
        'epoch': epoch,
        'loss': loss,
        'mse': mean_squared_error(avg_label_vector, avg_output_vector),
        'avg_output_vector': avg_output_vector,
        'avg_label_vector': avg_label_vector,
        'avg_correct': avg_correct,
    })
    training_history = pd.DataFrame(history_records, columns=history_columns)

    # Save the model, optimizer state and loss every ``save_every_n_epochs`` epochs.
    if epoch % save_every_n_epochs == 0:
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            # Add any other relevant information you want to save here
        }
        # The file name carries the same epoch number that is stored inside
        # the checkpoint (it was previously off by one: ``epoch+1``).
        torch.save(checkpoint, f'checkpoint_epoch_{epoch}_{name}.pt')

    # Persist the training history after every epoch so an interrupted run
    # still leaves an up-to-date CSV behind.
    training_history.to_csv(f"training_history_{name}.csv", index=False)
    print(f'Epoch: {epoch}, Loss: {loss:.8f}')

  0%|          | 0/7 [00:00<?, ?it/s]

loader the batch named ..\DataPipeline\data\prepared_new_dataset\C\C100000_0.zst


 14%|█▍        | 1/7 [00:26<02:37, 26.18s/it]

loader the batch named ..\DataPipeline\data\prepared_new_dataset\N\N100000_0.zst


 29%|██▊       | 2/7 [00:50<02:04, 24.94s/it]

loader the batch named ..\DataPipeline\data\prepared_new_dataset\O\O100000_0.zst


 29%|██▊       | 2/7 [01:10<02:56, 35.23s/it]


KeyboardInterrupt: 