In [1]:
import os
import torch
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt
import logging
from tqdm import tqdm
from torch_geometric.loader import DataLoader
from torch.utils.data import random_split
from functools import partial
import optuna
import gc
from typing import Literal
import torch.nn.functional as F

# Load utility functions from cloned repository
from src.loadData import GraphDataset
from src.utils import set_seed
from src.models import GNN


# Set the random seed
set_seed()


In [2]:
def add_zeros(data):
    data.x = torch.zeros(data.num_nodes, dtype=torch.long)
    return data

In [3]:
def train(
    data_loader,
    model,
    optimizer,
    criterion,
    device,
    save_checkpoints,
    checkpoint_path,
    current_epoch,
):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for data in data_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        pred = output.argmax(dim=1)
        correct += (pred == data.y).sum().item()
        total += data.y.size(0)

    # Save checkpoints if required
    if save_checkpoints:
        checkpoint_file = f"{checkpoint_path}_epoch_{current_epoch + 1}.pth"
        torch.save(model.state_dict(), checkpoint_file)
        print(f"Checkpoint saved at {checkpoint_file}")

    return total_loss / len(data_loader), correct / total

In [4]:
def evaluate(data_loader, model, device, calculate_accuracy=False):
    model.eval()
    correct = 0
    total = 0
    predictions = []
    total_loss = 0
    criterion = torch.nn.CrossEntropyLoss()
    with torch.no_grad():
        for data in data_loader:
            data = data.to(device)
            output = model(data)
            pred = output.argmax(dim=1)

            if calculate_accuracy:
                correct += (pred == data.y).sum().item()
                total += data.y.size(0)
                total_loss += criterion(output, data.y).item()
            else:
                predictions.extend(pred.cpu().numpy())
    if calculate_accuracy:
        accuracy = correct / total
        return total_loss / len(data_loader), accuracy
    return predictions

In [5]:
def save_predictions(predictions, test_path):
    script_dir = os.getcwd()
    submission_folder = os.path.join(script_dir, "submission")
    test_dir_name = os.path.basename(os.path.dirname(test_path))

    os.makedirs(submission_folder, exist_ok=True)

    output_csv_path = os.path.join(submission_folder, f"testset_{test_dir_name}.csv")

    test_graph_ids = list(range(len(predictions)))
    output_df = pd.DataFrame({"id": test_graph_ids, "pred": predictions})

    output_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")

In [6]:
def plot_training_progress(train_losses, train_accuracies, output_dir):
    epochs = range(1, len(train_losses) + 1)
    plt.figure(figsize=(12, 6))

    # Plot loss
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, label="Training Loss", color="blue")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training Loss per Epoch")

    # Plot accuracy
    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_accuracies, label="Training Accuracy", color="green")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title("Training Accuracy per Epoch")

    # Save plots in the current directory
    os.makedirs(output_dir, exist_ok=True)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "training_progress.png"))
    plt.close()

In [7]:
import gzip
import json
from collections import defaultdict

def collect_label_votes():
    label_votes = defaultdict(list)
    dataset_names = ["A","B","C","D"]
    for dataset_name in dataset_names:
        filename = f"./datasets/{dataset_name}/train.json.gz"
        print(f"Reading from: {filename}")
        with gzip.open(filename, "rt", encoding="utf-8") as f:
            graphs = json.load(f)
            for i, graph in enumerate(graphs):
                y = graph["y"][0]
                label_votes[i].append(y)  # assumes same ordering across datasets

    return label_votes

In [8]:
label_votes = collect_label_votes()
label_votes = {k: [v[0] for v in vs] for k,vs in label_votes.items()}
label_votes

Reading from: ./datasets/A/train.json.gz
Reading from: ./datasets/B/train.json.gz
Reading from: ./datasets/C/train.json.gz
Reading from: ./datasets/D/train.json.gz


{0: [4, 3, 1, 2],
 1: [3, 1, 2, 4],
 2: [0, 1, 1, 2],
 3: [4, 3, 3, 0],
 4: [1, 2, 3, 0],
 5: [3, 4, 1, 4],
 6: [0, 1, 2, 3],
 7: [0, 3, 1, 2],
 8: [2, 2, 2, 3],
 9: [1, 2, 0, 3],
 10: [4, 2, 3, 4],
 11: [2, 1, 2, 4],
 12: [2, 3, 1, 2],
 13: [4, 3, 3, 3],
 14: [2, 4, 2, 0],
 15: [2, 4, 5, 2],
 16: [3, 3, 5, 4],
 17: [1, 5, 2, 0],
 18: [4, 5, 3, 4],
 19: [2, 4, 2, 4],
 20: [5, 1, 4, 4],
 21: [2, 3, 2, 4],
 22: [1, 4, 2, 1],
 23: [5, 3, 3, 0],
 24: [0, 3, 2, 2],
 25: [3, 0, 2, 2],
 26: [2, 1, 1, 3],
 27: [2, 1, 4, 4],
 28: [2, 4, 4, 3],
 29: [2, 4, 2, 2],
 30: [4, 3, 2, 4],
 31: [2, 0, 0, 4],
 32: [4, 0, 2, 0],
 33: [2, 4, 3, 2],
 34: [1, 4, 1, 0],
 35: [3, 1, 4, 4],
 36: [2, 1, 4, 4],
 37: [1, 1, 4, 3],
 38: [3, 5, 4, 4],
 39: [3, 1, 2, 0],
 40: [2, 4, 4, 2],
 41: [5, 5, 3, 2],
 42: [4, 3, 1, 0],
 43: [0, 0, 1, 0],
 44: [4, 5, 1, 1],
 45: [2, 0, 5, 4],
 46: [0, 2, 3, 2],
 47: [2, 5, 5, 2],
 48: [4, 3, 1, 4],
 49: [1, 0, 5, 3],
 50: [1, 4, 1, 2],
 51: [3, 4, 0, 3],
 52: [3, 0, 2, 2],
 53

In [9]:
from collections import Counter

def compute_consensus_labels(label_votes):
    consensus_labels = {}
    label_confidence = {}

    for graph_id, votes in label_votes.items():
        vote_counts = Counter(votes)
        top_label, top_count = vote_counts.most_common(1)[0]
        consensus_labels[graph_id] = top_label
        label_confidence[graph_id] = top_count / len(votes)

    return consensus_labels, label_confidence


In [10]:
consensus_labels, label_confidence= compute_consensus_labels(label_votes)


In [None]:
import gzip
import json
from typing import Dict, List

def clean_dataset(version_number: int,
                  dataset_parts: List[str],
                  consensus_labels: Dict[int, int],
                  label_confidence: Dict[int, float],
                  confidence_threshold: float = 0.75):
    
    output_filename = f"./datasets/filtered_aggregated/filtered_aggregated_{version_number}.json.gz"
    os.makedirs(os.path.dirname(output_filename))
    all_clean_graphs = []
    global_idx = 0

    for part_path in dataset_parts:
        print(f"Processing: {part_path}")
        with gzip.open(part_path, "rt", encoding="utf-8") as f:
            graphs = json.load(f)

        for graph in graphs:
            conf = label_confidence.get(global_idx, 0.0)
            if conf >= confidence_threshold:
                graph["y"] = [consensus_labels[global_idx]]
                graph["confidence"] = conf
                all_clean_graphs.append(graph)
            global_idx += 1

    print(f"Final dataset size: {len(all_clean_graphs)} graphs.")
    print(f"Writing to: {output_filename}")
    with gzip.open(output_filename, "wt", encoding="utf-8") as f:
        json.dump(all_clean_graphs, f)
    print("🎉 Done.")


In [16]:
clean_dataset(
    version_number=1,
    dataset_parts=[
        "./datasets/A/train.json.gz",
        "./datasets/B/train.json.gz",
        "./datasets/C/train.json.gz",
        "./datasets/D/train.json.gz"
    ],
    consensus_labels=consensus_labels,
    label_confidence=label_confidence,
    confidence_threshold=0.75
)


Processing: ./datasets/A/train.json.gz
Processing: ./datasets/B/train.json.gz
Processing: ./datasets/C/train.json.gz
Processing: ./datasets/D/train.json.gz
Final dataset size: 2149 graphs.
Writing to: ./datasets/unfiltered_aggregated/unfiltered_aggregated_1.json.gz
🎉 Done.
