In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
import random
import numpy as np

import os
import pandas as pd
import sys
import torch

sys.path.append('/home/mrahma56/cs519/SSL_LLM_Node_Classification')
from TAGLAS import get_dataset



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from src.models import *

In [3]:
# Set random seed
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False





In [4]:
# SEED = 1234
# SEED = 4567
SEED = 7890
set_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DATASET = 'cora'
# DATASET = 'wikics'
LLM_ID = "Llama-3"

In [5]:
dataset_key_dict = {
    'cora': 'cora_node',
    'wikics': 'wikics'
}


root_dir = "/home/mrahma56/cs519/SSL_LLM_Node_Classification/"
taglas_dir = root_dir + "TAGLAS/"
llm_gen_dir = root_dir + "llm_gen_data/"
saved_model_dir = root_dir + "saved_models/"
saved_embedding_dir = root_dir + "saved_embeddings/"
embedding_model = "nvidia/NV-Embed-v2"
embedding_path = saved_embedding_dir + f"{dataset_key_dict[DATASET]}_{embedding_model.split('/')[-1]}.pt"
print(embedding_path)

/home/mrahma56/cs519/SSL_LLM_Node_Classification/saved_embeddings/cora_node_NV-Embed-v2.pt


In [6]:
def load_taglas_dataset(dataset_key="cora_node", unlabel_ratio=None,embedding_path=None, print_info=True):
    # Load the dataset from TAGLAS
    dataset = get_dataset(dataset_key, root=taglas_dir)
    data = dataset._data

    # Set train, validation, and test masks based on the dataset key
    if dataset_key == "cora_node":
        data.train_lb_mask = dataset.side_data['node_split']['train'][0].clone()
        data.val_mask = dataset.side_data['node_split']['val'][0].clone()
        data.test_mask = dataset.side_data['node_split']['test'][0].clone()
    elif dataset_key == "wikics":
        data.train_lb_mask = dataset.side_data['node_split']['train'][:, 0].clone()
        data.val_mask = dataset.side_data['node_split']['val'][:, 0].clone()
        data.test_mask = dataset.side_data['node_split']['test'].clone()
    
    # Map labels and features
    data.y = data.label_map
    data.x_text = data.x
    data.x = data.x_original
    if embedding_path is not None and os.path.exists(embedding_path):
        print("Loading embedding from: ", embedding_path)
        data.x = torch.load(embedding_path)
    
    
    # Add num_classes to data
    data.num_classes = dataset.num_classes
    data.train_ulb_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    

    if unlabel_ratio is not None and unlabel_ratio > 0:
        # Get indices of training nodes from the labeled training mask
        train_indices = data.train_lb_mask.nonzero(as_tuple=True)[0]
        
        # Get labels of training nodes
        train_labels = data.y[train_indices]
        
        # Initialize the mask for unlabeled training nodes
        data.train_ulb_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        
        class_label_counts = []  # Store labeled/unlabeled counts per class

        for cls in range(data.num_classes):
            # Get indices of training nodes belonging to the current class
            class_indices = train_indices[train_labels == cls]
            num_class_nodes = len(class_indices)
            
            # Calculate the number of nodes to unlabel (70%) and label (30%) for this class
            nodes_to_unlabel = int(unlabel_ratio * num_class_nodes)
            nodes_to_label = num_class_nodes - nodes_to_unlabel
            
            # Randomly select nodes to unlabel for this class
            unlabeled_indices = class_indices[torch.randperm(num_class_nodes)[:nodes_to_unlabel]]
            
            # Update the unlabeled mask
            data.train_ulb_mask[unlabeled_indices] = True
            
            # Count labeled and unlabeled samples for the class
            class_label_counts.append((cls, nodes_to_label, nodes_to_unlabel))
        
        # Update the labeled training mask
        data.train_lb_mask[data.train_ulb_mask] = False

    if print_info and unlabel_ratio:
        # Print the information about the unlabeled and labeled nodes
        print(f"Unlabeled ratio: {unlabel_ratio}")
        print(f"Labeled training nodes: {data.train_lb_mask.sum().item()}")
        print(f"Unlabeled training nodes: {data.train_ulb_mask.sum().item()}")
        
        # Print class-wise statistics
        print("\nClass-wise labeled and unlabeled counts:")
        for cls, num_labeled, num_unlabeled in class_label_counts:
            print(f"Class {cls}: Labeled = {num_labeled}, Unlabeled = {num_unlabeled}")
    
    # Retain only the required keys in the data object
    required_keys = [
        'x', 'y', 'train_lb_mask', 'train_ulb_mask', 
        'val_mask', 'test_mask', 'num_classes', 
        'num_features', 'x_text', 'edge_index', 'edge_attr'
    ]
    for k in list(data.keys()):
        if k not in required_keys:
            data.pop(k)

    return data

In [7]:
# Function to relabel low-confidence samples
def relabel_samples(low_conf_indices, data):
    num_classes = data.num_classes
    gold_label_prob = 0.7  # Probability of assigning the gold label

    # Generate random probabilities for each low-confidence sample
    random_probs = torch.rand(len(low_conf_indices))

    # Initialize new labels with random class labels
    random_labels = torch.randint(0, num_classes, (len(low_conf_indices),))

    # Assign gold labels with probability `gold_label_prob`
    new_labels = torch.where(
        random_probs < gold_label_prob,
        data.y[low_conf_indices],  # Gold labels
        random_labels  # Random class labels
    )

    return new_labels



In [8]:
# Function to relabel low-confidence samples
def llm_label_samples(low_conf_indices, data, dataset="cora", llm_id="Llama-3"):
    # print(f"Dataset: {dataset}")
    num_classes = data.num_classes
    llm_gen_file = os.path.join(llm_gen_dir, f"{dataset}_{llm_id}.tsv")
    df = pd.read_csv(llm_gen_file, sep='\t')
    y_gen = torch.tensor(df['llm_label'].values)
    y_gen = torch.where((y_gen >= 0) & (y_gen < num_classes) , y_gen, torch.zeros_like(y_gen))
    return y_gen[low_conf_indices]


In [9]:
import time
def train_step_semi_supervised(model, data, optimizer, x, y, alpha=0.1, th=0.5, temp=1, llm_label=False):
    model.train()
    optimizer.zero_grad()

    logits = model(x, data.edge_index)
    logits = logits / temp
    out_prob = F.softmax(logits, dim=1)

    labeled_loss = torch.tensor(0.0, device=x.device)
    consistency_loss = torch.tensor(0.0, device=x.device)
    
    num_low_conf_samples = 0

    # Labeled loss
    if data.train_lb_mask.sum() > 0:
        labeled_loss = F.cross_entropy(logits[data.train_lb_mask], y[data.train_lb_mask])

    # Consistency loss and relabeling
    if data.train_ulb_mask.sum() > 0 and llm_label:
        pseudo_labels = out_prob[data.train_ulb_mask].argmax(dim=1)
        confidence_scores = out_prob[data.train_ulb_mask].max(dim=1).values
        confident_mask = confidence_scores > th
        low_conf_mask = ~confident_mask
        
        print(f"Confidence Scores: {confidence_scores}")
        # time.sleep(10)
        # print(f"Confident Mask: {confident_mask}")
        # print(f"Low Confidence Mask: {low_conf_mask}")

        confident_indices = data.train_ulb_mask.nonzero(as_tuple=True)[0][confident_mask]
        low_conf_indices = data.train_ulb_mask.nonzero(as_tuple=True)[0][low_conf_mask]
        num_low_conf_samples = len(low_conf_indices)
        # print(f"Num low Confident Indices: {num_low_conf_samples}")

        # Consistency loss for high-confidence samples
        if len(confident_indices) > 0:
            consistency_loss = F.cross_entropy(logits[confident_indices], pseudo_labels[confident_mask])

        # Relabel low-confidence samples
        if len(low_conf_indices) > 0:
            # new_labels = relabel_samples(low_conf_indices, data)
            new_labels = llm_label_samples(low_conf_indices, data, dataset=DATASET, llm_id=LLM_ID)

            # Create new masks instead of modifying in-place
            new_train_lb_mask = data.train_lb_mask.clone()
            new_train_ulb_mask = data.train_ulb_mask.clone()

            # Update the masks with relabeled samples
            new_train_lb_mask[low_conf_indices] = True
            new_train_ulb_mask[low_conf_indices] = False

            # Assign the new labels and masks
            y[low_conf_indices] = new_labels
            data.train_lb_mask = new_train_lb_mask
            data.train_ulb_mask = new_train_ulb_mask

    # Total loss
    loss = labeled_loss + alpha * consistency_loss
    loss.backward()
    optimizer.step()

    return num_low_conf_samples, loss.item(), labeled_loss.item(), consistency_loss.item(), y, data.train_lb_mask, data.train_ulb_mask


In [10]:
# Validation step
def validation_step(model, data, x, y):
    model.eval()
    with torch.no_grad():
        out = model(x, data.edge_index)
        loss = F.cross_entropy(out[data.val_mask], y[data.val_mask])
        pred = out[data.val_mask].argmax(dim=1)
        acc = (pred == y[data.val_mask]).sum() / data.val_mask.sum()
    return loss.item(), acc.item()


In [11]:
# Test step
def test_step(model, data, x, y):
    model.eval()
    with torch.no_grad():
        out = model(x, data.edge_index)
        pred = out[data.test_mask].argmax(dim=1)
        acc = (pred == y[data.test_mask]).sum() / data.test_mask.sum()
    return acc.item()


In [12]:
# Main training loop
def train_model_semi_supervised(data, num_epochs=250, lr=0.01, hidden_channels=16, alpha=0.1, th=0.5, temp=1, print_logs=True):
    model = GCC(num_features=data.num_features, hidden_channels=hidden_channels, num_classes=data.num_classes)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

    x, y = data.x, data.y
    # y = get_noisy_labels(data, noise_ratio=0.5)
    
    best_val_acc = 0
    best_model_path = os.path.join(saved_model_dir, 'best_model_ssl.pth')

    for epoch in range(1, num_epochs + 1):
        llm_label = True if (epoch % 50) == 0 else False
        num_low_conf_samples, total_loss, labeled_loss, consistency_loss, y, data.train_lb_mask, data.train_ulb_mask = train_step_semi_supervised(model, data, optimizer, x, y, alpha, th, temp=temp, llm_label=llm_label)
        val_loss, val_acc = validation_step(model, data, x, y)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), best_model_path)

        if (epoch % 10 == 0 or epoch == num_epochs) and print_logs:
            print(f'Epoch: {epoch:03d},Low Conf Samples: {num_low_conf_samples}')
            # print(f'Epoch: {epoch:03d},Low Conf Samples: {num_low_conf_samples}, '
            #       f'Total Loss: {total_loss:.4f}, Labeled Loss: {labeled_loss:.4f}, Consistency Loss: {consistency_loss:.4f}, '
            #       f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')

    model.load_state_dict(torch.load(best_model_path, weights_only=True))
    test_acc = test_step(model, data, x, y)
    return best_val_acc, test_acc



In [13]:
def train_model_supervised(data, num_epochs=200, learning_rate=0.01, hidden_channels=16, print_logs=True):
    # Initialize model and optimizer
    model = GCC(num_features=data.num_features, hidden_channels=hidden_channels, num_classes=data.num_classes)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Extract features and labels
    x = data.x
    y = data.y

    # Training, validation, and test masks
    train_mask = data.train_lb_mask
    val_mask = data.val_mask
    test_mask = data.test_mask

    # Track the best model and accuracy
    best_val_acc = 0.0
    # best_model_state = None
    best_model_path = os.path.join(saved_model_dir, 'best_model_sup.pth')

    # Training loop
    for epoch in range(num_epochs):
        # Set model to training mode
        model.train()
        optimizer.zero_grad()

        # Forward pass and compute loss
        out = model(x, data.edge_index)
        # print(f"out[train_mask] shape: {out[train_mask].shape}")
        # print(f"y[train_mask] shape: {y[train_mask].shape}")
        # import time
        # time.sleep(5)
        loss = F.cross_entropy(out[train_mask], y[train_mask])
        
        # Backward pass and update
        loss.backward()
        optimizer.step()

        # Validation step
        val_loss, val_acc = validation_step(model, data, x, y)

        # Update best model if validation accuracy improves
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # best_model_state = model.state_dict()  # Save best model parameters
            torch.save(model.state_dict(), best_model_path)

        # Print training and validation results
        if (epoch + 1) % 10 == 0 and print_logs:
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Load the best model parameters before testing
    model.load_state_dict(torch.load(best_model_path, weights_only=True))
    # model.load_state_dict(best_model_state)
    test_acc = test_step(model, data, x, y)
    # print(f"Best Validation Accuracy: {best_val_acc:.4f}")
    # print(f"Test Accuracy (Best Model): {test_acc:.4f}")
    
    return best_val_acc, test_acc


In [14]:
from train_eval import run_ssl, run_supervised

In [15]:
dataset_key = dataset_key_dict[DATASET]

# Experiment with varying unlabeled ratios
results = []
for ulb_ratio in [0, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]:

    #set embedding path to None to use the original features
    data_ssl = load_taglas_dataset(dataset_key=dataset_key, unlabel_ratio=ulb_ratio,embedding_path=None, print_info=False)
    
    # run these two lines for basic GCC
    _, test_acc_sup = train_model_supervised(data_ssl, print_logs=False)
    _, test_acc_ssl = train_model_semi_supervised(data_ssl, alpha=0.1, th=0.7, temp=1, print_logs=False)
    
    #run these two for SOTA GCN
    # test_acc_sup = run_supervised(data_ssl, device, dataset=DATASET, print_logs=False) 
    # test_acc_ssl = run_ssl(data_ssl, device, dataset=DATASET, alpha=0.1, th=0.7, print_logs=False)
    
    
    results.append({'ulb_ratio': ulb_ratio, 'supervised_acc': test_acc_sup, 'ssl_acc': test_acc_ssl})


  self.data, self.slices = torch.load(self.processed_paths[0])
  self.side_data = torch.load(self.processed_paths[1])


Confidence Scores: tensor([0.3745, 0.7594, 0.9412, 0.4616, 0.5552, 0.3798, 0.4260, 0.3753, 0.7723,
        0.7475, 0.8288, 0.3107, 0.2877, 0.3678, 0.9804, 0.6857, 0.5938, 0.3392,
        0.5292, 0.4389, 0.2801, 0.2762, 0.4772, 0.2541, 0.3294, 0.3758, 0.9052,
        0.3703, 0.9454, 0.2252, 0.8216, 0.3439, 0.8062, 0.3941, 0.3547, 0.2681,
        0.2586, 0.9535, 0.2355, 0.4924, 0.3315, 0.5188, 0.2913, 0.7205, 0.7047,
        0.6513, 0.4364, 0.3716, 0.5266, 0.4829, 0.3336, 0.9225, 0.4868, 0.3118,
        0.4258, 0.4228, 0.3439, 0.6168, 0.6870, 0.4410, 0.7277, 0.6600, 0.5509,
        0.4208, 0.2373, 0.4286, 0.7700, 0.7752, 0.5182, 0.7305, 0.7457, 0.5443,
        0.6979, 0.6832, 0.2303, 0.4374, 0.7843, 0.5522, 0.7202, 0.4434, 0.5631,
        0.2324, 0.3537, 0.2647, 0.3812, 0.3763, 0.3574, 0.3419, 0.2407, 0.5647,
        0.3182, 0.7711, 0.7943, 0.3772, 0.6566, 0.3925, 0.4487, 0.2784],
       grad_fn=<MaxBackward0>)
Confidence Scores: tensor([0.5546, 0.9293, 0.6389, 0.5961, 0.9194, 0.9432, 0.

  self.data, self.slices = torch.load(self.processed_paths[0])
  self.side_data = torch.load(self.processed_paths[1])


Confidence Scores: tensor([0.4056, 0.8073, 0.9380, 0.4502, 0.5963, 0.2857, 0.4361, 0.4045, 0.8208,
        0.7435, 0.8158, 0.7945, 0.3073, 0.2596, 0.3154, 0.9789, 0.7100, 0.5483,
        0.2576, 0.3953, 0.4979, 0.2644, 0.6144, 0.2772, 0.4736, 0.2794, 0.2785,
        0.3897, 0.8406, 0.6637, 0.4092, 0.9361, 0.2611, 0.5231, 0.3481, 0.4936,
        0.3700, 0.3535, 0.2247, 0.2475, 0.9506, 0.2286, 0.4917, 0.3879, 0.4739,
        0.2782, 0.6834, 0.7148, 0.6073, 0.3365, 0.3445, 0.5042, 0.4124, 0.3843,
        0.4012, 0.9265, 0.5031, 0.3337, 0.4314, 0.5011, 0.2799, 0.5703, 0.7238,
        0.3704, 0.6888, 0.5870, 0.5603, 0.3875, 0.5614, 0.4646, 0.2829, 0.3872,
        0.7524, 0.8017, 0.3694, 0.6872, 0.6864, 0.4918, 0.6572, 0.4904, 0.2175,
        0.4481, 0.7635, 0.9358, 0.5238, 0.4820, 0.4123, 0.5955, 0.2578, 0.4054,
        0.3392, 0.4137, 0.2248, 0.2897, 0.3401, 0.4394, 0.5217, 0.3355, 0.7379,
        0.6948, 0.3674, 0.4594, 0.3715, 0.3883, 0.2697],
       grad_fn=<MaxBackward0>)
Confidence Sc

  self.data, self.slices = torch.load(self.processed_paths[0])
  self.side_data = torch.load(self.processed_paths[1])


Confidence Scores: tensor([0.4365, 0.6148, 0.9442, 0.4323, 0.2566, 0.3533, 0.4575, 0.4762, 0.7314,
        0.6677, 0.7934, 0.7660, 0.2889, 0.2812, 0.2920, 0.9761, 0.7232, 0.5864,
        0.7257, 0.2706, 0.4120, 0.5057, 0.2593, 0.5235, 0.2176, 0.4796, 0.2345,
        0.3178, 0.3782, 0.8358, 0.5864, 0.4913, 0.9291, 0.2748, 0.4744, 0.2551,
        0.3660, 0.3291, 0.3373, 0.2205, 0.3774, 0.7739, 0.2850, 0.6315, 0.3403,
        0.6002, 0.2259, 0.4354, 0.2847, 0.4847, 0.6478, 0.6636, 0.6470, 0.2275,
        0.3308, 0.4763, 0.3054, 0.3296, 0.4382, 0.8475, 0.5162, 0.3639, 0.2757,
        0.7601, 0.6868, 0.2592, 0.3517, 0.7045, 0.2454, 0.4801, 0.5699, 0.4558,
        0.4090, 0.4116, 0.5273, 0.2958, 0.3791, 0.6945, 0.7552, 0.7888, 0.4365,
        0.6684, 0.6546, 0.6123, 0.5318, 0.4037, 0.2142, 0.3281, 0.7468, 0.8930,
        0.5012, 0.3657, 0.3287, 0.5660, 0.4668, 0.6201, 0.3621, 0.3792, 0.2295,
        0.2451, 0.5514, 0.3106, 0.5165, 0.2567, 0.3581, 0.4755, 0.5717, 0.3755,
        0.7728, 0.392

  self.data, self.slices = torch.load(self.processed_paths[0])
  self.side_data = torch.load(self.processed_paths[1])


Confidence Scores: tensor([0.3398, 0.5542, 0.6911, 0.9338, 0.5154, 0.2521, 0.4009, 0.3629, 0.5431,
        0.3567, 0.7331, 0.6385, 0.3479, 0.2949, 0.7848, 0.3125, 0.3398, 0.2913,
        0.9654, 0.8442, 0.5537, 0.4371, 0.2714, 0.4373, 0.5400, 0.2549, 0.5479,
        0.3033, 0.5471, 0.8144, 0.2426, 0.3148, 0.3859, 0.7961, 0.3747, 0.3652,
        0.5597, 0.8743, 0.3408, 0.3777, 0.5083, 0.2492, 0.2737, 0.3708, 0.3358,
        0.3306, 0.4033, 0.7442, 0.2304, 0.5448, 0.3911, 0.6126, 0.2214, 0.5339,
        0.2881, 0.2525, 0.7105, 0.4144, 0.7324, 0.2746, 0.3159, 0.4861, 0.2820,
        0.3208, 0.4977, 0.8406, 0.3495, 0.4311, 0.3489, 0.8113, 0.7340, 0.2558,
        0.2150, 0.6198, 0.2249, 0.2617, 0.6194, 0.5259, 0.4574, 0.4393, 0.3017,
        0.3226, 0.3698, 0.4767, 0.3894, 0.4217, 0.4923, 0.3306, 0.4140, 0.6778,
        0.5234, 0.3992, 0.2772, 0.3810, 0.3038, 0.5765, 0.3954, 0.3073, 0.3236,
        0.4062, 0.5374, 0.4643, 0.6396, 0.5079, 0.4601, 0.2989, 0.2545, 0.5089,
        0.2937, 0.370

  self.data, self.slices = torch.load(self.processed_paths[0])
  self.side_data = torch.load(self.processed_paths[1])


Confidence Scores: tensor([0.3320, 0.5733, 0.3636, 0.8371, 0.4579, 0.2577, 0.3738, 0.3495, 0.4565,
        0.4031, 0.6666, 0.6113, 0.4633, 0.2780, 0.4290, 0.3901, 0.3133, 0.3167,
        0.9271, 0.2200, 0.5078, 0.4835, 0.2941, 0.4491, 0.4380, 0.2637, 0.4948,
        0.2274, 0.3030, 0.6381, 0.2670, 0.2222, 0.3899, 0.3482, 0.6358, 0.2674,
        0.4091, 0.4962, 0.8821, 0.3178, 0.3836, 0.4177, 0.2606, 0.3091, 0.2909,
        0.3853, 0.3191, 0.3579, 0.6231, 0.2565, 0.5201, 0.4173, 0.6628, 0.2385,
        0.4855, 0.2534, 0.2919, 0.3085, 0.4950, 0.3894, 0.7444, 0.3995, 0.3824,
        0.4013, 0.2716, 0.2530, 0.2619, 0.5084, 0.6891, 0.3819, 0.3720, 0.3080,
        0.8615, 0.7596, 0.2274, 0.2330, 0.4928, 0.2703, 0.2933, 0.5754, 0.3658,
        0.4595, 0.4406, 0.3012, 0.3210, 0.3422, 0.2718, 0.4269, 0.4567, 0.4430,
        0.5806, 0.4048, 0.4443, 0.6209, 0.7165, 0.3623, 0.3057, 0.2428, 0.2766,
        0.3546, 0.3835, 0.5341, 0.3786, 0.3188, 0.2229, 0.5265, 0.3665, 0.2431,
        0.6516, 0.370

  self.data, self.slices = torch.load(self.processed_paths[0])
  self.side_data = torch.load(self.processed_paths[1])


Confidence Scores: tensor([0.3181, 0.3674, 0.3072, 0.1753, 0.2762, 0.2918, 0.2944, 0.2492, 0.4428,
        0.2419, 0.3999, 0.3807, 0.5434, 0.4089, 0.2080, 0.3823, 0.2593, 0.3760,
        0.3136, 0.2505, 0.3059, 0.1906, 0.3652, 0.3085, 0.2152, 0.3751, 0.5506,
        0.3205, 0.4227, 0.2985, 0.3431, 0.5610, 0.2847, 0.2118, 0.4081, 0.5018,
        0.6303, 0.2265, 0.3093, 0.4009, 0.8506, 0.3034, 0.2926, 0.4432, 0.2043,
        0.2260, 0.2953, 0.3346, 0.2165, 0.4163, 0.4260, 0.3427, 0.2736, 0.3071,
        0.3432, 0.6410, 0.2322, 0.3129, 0.2254, 0.3212, 0.2517, 0.3392, 0.2706,
        0.3557, 0.2374, 0.2693, 0.2027, 0.4050, 0.2948, 0.2820, 0.5884, 0.2816,
        0.6164, 0.3162, 0.3157, 0.3647, 0.8386, 0.7200, 0.3661, 0.3463, 0.2626,
        0.2323, 0.2828, 0.3200, 0.3538, 0.3843, 0.3441, 0.4366, 0.4123, 0.5005,
        0.4096, 0.3933, 0.4438, 0.3930, 0.2747, 0.2503, 0.5377, 0.2954, 0.3877,
        0.5797, 0.2959, 0.3161, 0.3962, 0.1845, 0.3461, 0.2976, 0.4033, 0.3818,
        0.2722, 0.379

In [16]:
# Save the results to a CSV file
results_dir = "/home/mrahma56/cs519/SSL_LLM_Node_Classification/results"
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(results_dir, f"results_{DATASET}_{LLM_ID}_{SEED}.csv"), index=False)

In [17]:
for r in results:
    print(f"Unlabeled Ratio: {r['ulb_ratio']:.2f}, Supervised Acc: {r['supervised_acc']:.4f}, SSL Acc: {r['ssl_acc']:.4f}")

Unlabeled Ratio: 0.00, Supervised Acc: 0.7984, SSL Acc: 0.8230
Unlabeled Ratio: 0.70, Supervised Acc: 0.6847, SSL Acc: 0.7142
Unlabeled Ratio: 0.75, Supervised Acc: 0.7036, SSL Acc: 0.6721
Unlabeled Ratio: 0.80, Supervised Acc: 0.7113, SSL Acc: 0.6572
Unlabeled Ratio: 0.85, Supervised Acc: 0.5638, SSL Acc: 0.6567
Unlabeled Ratio: 0.90, Supervised Acc: 0.5242, SSL Acc: 0.6456
Unlabeled Ratio: 0.95, Supervised Acc: 0.3380, SSL Acc: 0.6069
