In [1]:
import torch
from torch_geometric import seed_everything
seed_everything(42)
import torch_geometric.transforms as T
from src.transforms.per_user_neg_sampling import add_negative_test_edges_per_user
from src.evaluation.ranking_metrics import evaluate_ranking_metrics, evaluate_ranking_metrics_PyG

# Let's start by loading the data.
# NOTE: weights_only=False makes torch.load fall back to full unpickling, which can
# execute arbitrary code -- only load this file if it comes from a trusted source.
# (Presumably required because the file stores a non-tensor object -- TODO confirm.)
data = torch.load("data/hetero_data_no_coauthor.pt", weights_only=False)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
transform = T.RandomLinkSplit(
    num_val=0.1, # Fraction of edges held out for validation
    num_test=0.1, # Fraction of edges held out for testing
    disjoint_train_ratio=0.3, # Fraction of training edges used for supervision; these will not be used for message passing
    neg_sampling_ratio=2.0, # Ratio of negative to positive edges for validation and testing. TODO: check how this relates to `add_negative_train_samples`
    add_negative_train_samples=False, # TODO: investigate why False works better than True here -- possibly because a LinkNeighborLoader samples the negative training edges for us
    edge_types=("author", "writes", "paper"), # The edge type we want to predict
    rev_edge_types=("paper", "rev_writes", "author"), # Reverse edge type, so we don't accidentally leak information into the validation/test sets
)

# Split once, then post-process only the test split.
train_data, val_data, test_data = transform(data)
# Introduces a different type of negative edges into the test split
# (per user, judging by the helper's name -- see src/transforms/per_user_neg_sampling.py).
test_data = add_negative_test_edges_per_user(test_data)

In [3]:
# Sanity check: mean supervision label of the training split. A value of 1.0 means
# every training label is positive, which is consistent with
# add_negative_train_samples=False in the RandomLinkSplit above.
train_data[("author", "writes", "paper")].edge_label.mean()

tensor(1.)

In [4]:
# Models to evaluate
from src.models.HeteroGCNModel import HeteroGCNModel
from src.models.TBBaselineModel import TBBaselineModel

import torch

# The random seed is already fixed in the first cell via seed_everything(42).

# Checkpoint file holding the saved weights (state dict) of each model.
model_checkpoints = {
    "HGCN": "checkpoints/modelGNN_weights.pkl",
    "TB": "checkpoints/baseline_weights.pkl"
}

# Constructor keyword arguments for each model. Both constructors receive the
# test split as `data` (presumably to read graph metadata -- TODO confirm).
model_settings = {
    "HGCN": {
        "hidden_channels": 256,
        "data": test_data
    },
    "TB": {
        "hidden_channels": 256,
        "data": test_data
    }
}

# Model class to instantiate for each model name.
model_classes = {
    "HGCN": HeteroGCNModel,
    "TB": TBBaselineModel
}

# The three tables are joined by model name; fail early with a clear message
# instead of a KeyError further down if they drift apart.
assert model_checkpoints.keys() == model_settings.keys() == model_classes.keys(), \
    "model_checkpoints, model_settings and model_classes must define the same model names"

models = {key: model_classes[key](**model_settings[key]) for key in model_checkpoints}

# Load the trained weights onto the CPU.
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state dict needs; it avoids executing arbitrary pickled code and silences
# the torch.load FutureWarning about the implicit weights_only default.
for key, checkpoint_path in model_checkpoints.items():
    state_dict = torch.load(
        checkpoint_path,
        map_location=torch.device("cpu"),
        weights_only=True,
    )
    models[key].load_state_dict(state_dict)


  models[key].load_state_dict(torch.load(model_checkpoints[key], map_location=torch.device('cpu')))


In [5]:
def evaluate_model(model, data, threshold=0.5):
    """Compute binary classification metrics for ("author", "writes", "paper") edges.

    The model is put in eval mode and called once on ``data`` without gradient
    tracking. Its output is binarised with ``score >= threshold`` and compared
    against ``data["author", "writes", "paper"].edge_label``.

    Args:
        model: Callable model exposing ``eval()``; ``model(data)`` must return a
            tensor with one score per supervision edge.
        data: Graph object indexable by the edge-type triple, whose store has an
            ``edge_label`` tensor containing 0/1 labels.
        threshold: Decision threshold applied to the scores (default 0.5).
            NOTE(review): 0.5 assumes the model outputs probabilities rather
            than raw logits -- confirm against the model definitions.

    Returns:
        Tuple ``(precision, recall, f1_score, accuracy)`` of floats. A small
        epsilon (1e-8) in every denominator avoids division by zero, so an
        empty class yields 0.0 rather than an exception.

    Raises:
        ValueError: If the number of predictions differs from the number of labels.
    """
    model.eval()
    with torch.no_grad():
        y_pred = model(data)

    # Flatten both arrays: a (N, 1) model output compared against (N,) labels
    # would otherwise broadcast to an N x N matrix and silently corrupt the counts.
    y_pred = y_pred.cpu().numpy().reshape(-1)
    y_true = data["author", "writes", "paper"].edge_label.cpu().numpy().reshape(-1)
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f"Got {y_pred.shape[0]} predictions for {y_true.shape[0]} edge labels"
        )

    # Binary thresholding of the scores.
    predicted_pos = y_pred >= threshold
    actual_pos = y_true == 1
    actual_neg = y_true == 0

    # Confusion-matrix counts.
    FP = int((actual_neg & predicted_pos).sum())
    TP = int((actual_pos & predicted_pos).sum())
    FN = int((actual_pos & ~predicted_pos).sum())
    TN = int((actual_neg & ~predicted_pos).sum())

    precision = TP / (TP + FP + 1e-8)
    recall = TP / (TP + FN + 1e-8)
    f1_score = 2 * (precision * recall) / (precision + recall + 1e-8)
    accuracy = (TP + TN) / (TP + TN + FP + FN + 1e-8)

    return precision, recall, f1_score, accuracy

def dump_quick_model_metrics(model):
    """Print precision, recall, F1 and accuracy of ``model`` on every data split.

    Quick safety check meant for comparing against the outputs of
    example_training.ipynb. The notebook-level ``test_data``, ``val_data`` and
    ``train_data`` are scored (in that order) with ``evaluate_model``; the
    results are printed and nothing is returned.
    """
    def _report(banner, split):
        # Score first, print afterwards: nothing is printed for a split
        # whose evaluation fails.
        precision, recall, f1_score, accuracy = evaluate_model(model, split)
        print(banner)
        named_scores = (
            ("Precision", precision),
            ("Recall", recall),
            ("F1 Score", f1_score),
            ("Accuracy", accuracy),
        )
        for label, score in named_scores:
            print(f"{label}: {score:.4f}")

    _report("Evaluating on Test set...", test_data)
    print("--------------------------------------------------")
    _report("Evaluating on validation set...", val_data)
    _report("Evaluating on TRAINING set...", train_data)

In [6]:

#for model_name, model in models.items():
#    print(f"Evaluating model: {model_name}")
#    dump_quick_model_metrics(model)


In [None]:
# ranking metrics
# Ks: the rank cutoffs K handed to the ranking evaluation via `ks=`
# (the results below contain Hits@K, Precision@K, Recall@K, ... for each K).
Ks = (4, 12)
# metrics: model name -> dict of ranking metrics computed on the test split.
metrics = {}
for model_name, model in models.items():
    # Make sure the model is in evaluation mode before scoring.
    model.eval()
    metrics[model_name] = evaluate_ranking_metrics(model, test_data, ks=Ks)

In [8]:
metrics.keys()

dict_keys(['HGCN', 'TB'])

In [9]:
metrics["TB"]

{'num_heads': 21286,
 'MRR': 0.38521655602445115,
 'MAP': 0.3483839149593717,
 'Hits@4': 0.4803157004603965,
 'Precision@4': 0.14590575965423283,
 'Recall@4': 0.4044147831025749,
 'F1@4': 0.2006485313770315,
 'MAP@4': 0.297873471869878,
 'NDCG@4': 0.33975640091403914,
 'Hits@12': 0.658507939490745,
 'Precision@12': 0.07570312255316484,
 'Recall@12': 0.6015065024808071,
 'F1@12': 0.12807076795413488,
 'MAP@12': 0.32930342986081457,
 'NDCG@12': 0.407864963123223}

In [10]:
metrics["HGCN"]

{'num_heads': 21286,
 'MRR': 0.36007932924305486,
 'MAP': 0.3293273081550687,
 'Hits@4': 0.47275204359673023,
 'Precision@4': 0.14474302358357605,
 'Recall@4': 0.39924797822115615,
 'F1@4': 0.19853775869227155,
 'MAP@4': 0.27358324981469295,
 'NDCG@4': 0.3194805192211574,
 'Hits@12': 0.6780982805599924,
 'Precision@12': 0.07881158194744589,
 'Recall@12': 0.6250369472362758,
 'F1@12': 0.13321712178066542,
 'MAP@12': 0.3105351989435243,
 'NDCG@12': 0.39823914369554286}

In [11]:
def make_latex_table(metrics: dict, ks=(4, 12),
                     metric_names=("MAP", "Precision", "Recall", "F1"),
                     missing: str = "--") -> str:
    """Render per-model ranking metrics as a LaTeX table.

    The table has one row per model and, for every metric in ``metric_names``,
    one column per cutoff in ``ks``. Values are looked up under the key
    ``"<metric>@<k>"`` (e.g. ``"MAP@4"``) and printed with three decimals.
    The generated markup relies on the LaTeX packages booktabs and multirow.

    Args:
        metrics: Mapping ``model name -> {"<metric>@<k>": value, ...}``.
            Model names are inserted verbatim (no LaTeX escaping).
        ks: Rank cutoffs to report; defaults to ``(4, 12)``.
        metric_names: Metric families to report, in column order.
        missing: Text placed in a cell whose metric is absent, so that a
            metric that was never computed is not reported as ``0.000``.

    Returns:
        The complete ``table`` environment as a newline-joined string.

    Raises:
        ValueError: If ``ks`` or ``metric_names`` is empty.
    """
    ks = tuple(ks)
    metric_names = tuple(metric_names)
    if not ks or not metric_names:
        raise ValueError("ks and metric_names must both be non-empty")

    span = len(ks)
    # One centred column for the model name, then a group of len(ks) columns per metric.
    column_spec = "c|" + "|".join("c" * span for _ in metric_names)

    lines = [
        "\\begin{table}[h]",
        "\\centering",
        "\\begin{tabular}{" + column_spec + "}",
        "\\toprule",
        "\\multirow{2}{*}{\\textbf{Model}} &",
    ]

    # First header row: one multicolumn cell per metric; the last cell drops
    # the trailing rule and terminates the row.
    last = len(metric_names) - 1
    for idx, name in enumerate(metric_names):
        align, tail = ("c", "\\\\") if idx == last else ("c|", "&")
        lines.append(
            "\\multicolumn{" + str(span) + "}{" + align + "}{\\textbf{" + name + "}} " + tail
        )

    # Second header row: the cutoffs, repeated under every metric.
    cutoffs = " & ".join(f"@{k}" for _ in metric_names for k in ks)
    lines.append("& " + cutoffs + " \\\\")
    lines.append("\\midrule")

    for model, vals in metrics.items():
        cells = []
        for name in metric_names:
            for k in ks:
                value = vals.get(f"{name}@{k}")
                cells.append(missing if value is None else f"{value:.3f}")
        lines.append(f"{model} & " + " & ".join(cells) + " \\\\")

    lines.extend([
        "\\bottomrule",
        "\\end{tabular}",
        "\\vspace{0.1in}",
        "\\caption{The specific metrics were chosen based on the lecture.}",
        "\\label{tbl:initial_metrics}",
        "\\end{table}",
    ])
    return "\n".join(lines)


In [12]:
print(make_latex_table(metrics))

\begin{table}[h]
\centering
\begin{tabular}{c|cc|cc|cc|cc}
\toprule
\multirow{2}{*}{\textbf{Model}} &
\multicolumn{2}{c|}{\textbf{MAP}} &
\multicolumn{2}{c|}{\textbf{Precision}} &
\multicolumn{2}{c|}{\textbf{Recall}} &
\multicolumn{2}{c}{\textbf{F1}} \\
& @4 & @12 & @4 & @12 & @4 & @12 & @4 & @12 \\
\midrule
HGCN & 0.274 & 0.311 & 0.145 & 0.079 & 0.399 & 0.625 & 0.199 & 0.133 \\
TB & 0.298 & 0.329 & 0.146 & 0.076 & 0.404 & 0.602 & 0.201 & 0.128 \\
\bottomrule
\end{tabular}
\vspace{0.1in}
\caption{The specific metrics were chosen based on the lecture.}
\label{tbl:initial_metrics}
\end{table}


In [13]:
# check test data edge labels - lots of negatives
test_data["author", "writes", "paper"].edge_label.mean()

tensor(0.0148)

In [14]:
# FIXME: this cell currently fails with
#   AttributeError: 'NoneType' object has no attribute 'compute'
# (see the output below). The error is presumably raised inside
# evaluate_ranking_metrics_PyG -- needs investigation before relying on this cell.
evaluate_ranking_metrics_PyG(models["HGCN"], test_data, ks=Ks)

AttributeError: 'NoneType' object has no attribute 'compute'