In [21]:
from pykeen.pipeline import pipeline
from pykeen.datasets import Nations, get_dataset
import torch
from pykeen.evaluation import evaluate, RankBasedEvaluator
from pykeen.metrics.ranking import HitsAtK
import pandas as pd


import logging
from pathlib import Path

import click
import more_click
import torch
from pykeen.evaluation import RankBasedEvaluator
from pykeen.losses import NSSALoss,CrossEntropyLoss
from pykeen.models.inductive import InductiveNodePiece, InductiveNodePieceGNN
from pykeen.trackers import ConsoleResultTracker, WANDBResultTracker
from pykeen.training import SLCWATrainingLoop
from pykeen.typing import TESTING, TRAINING, VALIDATION
from pykeen.utils import resolve_device, set_random_seed
from torch.optim import Adam


from pykeen.metrics.ranking import HitsAtK

from pathlib import Path

from pykeen.datasets.inductive.base import DisjointInductivePathDataset
from typing_extensions import Literal
import os
from pykeen.hpo import hpo_pipeline
from pykeen.triples import TriplesFactory
from pykeen.models import InductiveNodePiece
from pykeen.typing import TESTING, TRAINING, VALIDATION

import time

import platform

import sys

import cpuinfo

import psutil

import subprocess

import zipfile

seed = 1234

In [2]:
class InductiveLPDataset(DisjointInductivePathDataset):
    """An inductive link prediction dataset for the ILPC 2022 Challenge."""

    
    
    
    def __init__(self , **kwargs):
        """Initialize the inductive link prediction dataset.

        :param size: "small" or "large"
        :param kwargs: keyword arguments to forward to the base dataset class, cf. DisjointInductivePathDataset
        """
        DATA_TYPE = "_fully_inductive.tsv"
        TRAIN_PATH = "MSCallGraph_train" + DATA_TYPE
        TEST_PATH = "MSCallGraph_test" + DATA_TYPE
        VALIDATE_PATH = "MSCallGraph_validation" + DATA_TYPE
        INFERENCE_PATH = "MSCallGraph_inference" + DATA_TYPE


        super().__init__(
            transductive_training_path=os.getcwd()+"/"+TRAIN_PATH,
            inductive_inference_path=os.getcwd()+"/"+INFERENCE_PATH,
            inductive_validation_path=os.getcwd()+"/"+VALIDATE_PATH,
            inductive_testing_path=os.getcwd()+"/"+TEST_PATH,
            create_inverse_triples=True,
            eager=True,
            **kwargs
        )


In [3]:
def show_metrics(dictionary,model_name,csv_name):
    for key in dictionary.keys():
        print(key)
        df = pd.DataFrame(dictionary[key])
        df.to_csv(f"{model_name}/{model_name}_{csv_name}_{key}.csv")
        print(df)

In [4]:
dataset = InductiveLPDataset()

In [5]:
tracker = ConsoleResultTracker()

In [6]:
loss = NSSALoss() #used by RotatE and NodePiece
num_tokens = 20
embedding_dim = 200

In [7]:
model_name = 'nodepiece_inductive'
model = InductiveNodePiece(
        triples_factory=dataset.transductive_training,
        inference_factory=dataset.inductive_inference,
        random_seed = seed,
        loss = loss,
        num_tokens = num_tokens,
        embedding_dim = embedding_dim
    ).to(resolve_device())
print(f"Number of parameters: {sum(p.numel() for p in model.parameters())}")
print(f"Space occupied: {model.num_parameter_bytes} bytes")

sampling:   0%|          | 0.00/9.06k [00:00<?, ?it/s]

No symbolic computation of output shape.


sampling:   0%|          | 0.00/3.79k [00:00<?, ?it/s]

No symbolic computation of output shape.
No cuda devices were available. The model runs on CPU


Number of parameters: 2600
Space occupied: 10400 bytes


In [8]:
directory = model_name

if not os.path.exists(directory):
    os.makedirs(directory)
    print(f'Directory {directory} created successfully!')
else:
    print(f'Directory {directory} already exists.')

Directory nodepiece_inductive created successfully!


In [9]:
learning_rate = 1e-4
optimizer = Adam(params=model.parameters(), lr=learning_rate)
num_epochs = 200
patience = 20

In [10]:
metrics = ['meanreciprocalrank', HitsAtK(1),
                 HitsAtK(3), HitsAtK(5), HitsAtK(10)]

train_evaluator = RankBasedEvaluator(
        mode=TRAINING,
        metrics=metrics,
        add_defaults=False,
    )
valid_evaluator = RankBasedEvaluator(
        mode=VALIDATION,
        metrics=metrics,
        add_defaults=False,
    )
test_evaluator = RankBasedEvaluator(
        mode=TESTING,
        metrics = metrics,
        add_defaults=False
    )

In [11]:
from pykeen.stoppers import EarlyStopper

stopper = EarlyStopper(
    model = model,
    metric='meanreciprocalrank',
    patience=patience,
    frequency=1,
    evaluator = valid_evaluator,
    training_triples_factory = dataset.inductive_inference,
    evaluation_triples_factory = dataset.inductive_validation,
    result_tracker = tracker

)



In [12]:
# default training regime is negative sampling (SLCWA)
# you can also use the 1-N regime with the LCWATrainingLoop
# the LCWA loop does not need negative sampling kwargs, but accepts label_smoothing in the .train() method
training_loop = SLCWATrainingLoop(
        triples_factory=dataset.transductive_training,
        model=model,
        mode=TRAINING,  # must be specified for the inductive setup
        result_tracker=tracker,
        optimizer=optimizer
)

In [13]:
training_start = time.time()
train_epoch =  training_loop.train(
        triples_factory=dataset.transductive_training,
        num_epochs=num_epochs,
        callbacks="evaluation",
        callback_kwargs=dict(
            evaluator=valid_evaluator,
            evaluation_triples=dataset.inductive_validation.mapped_triples,
            prefix="validation",
            frequency=1,
            additional_filter_triples=dataset.inductive_inference.mapped_triples,
        ),
        stopper = stopper
        
    )
training_duration = time.time() - training_start

Training epochs on cpu:   0%|          | 0/1 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/184 [00:00<?, ?batch/s]

Evaluating on cpu:   0%|          | 0.00/3.13k [00:00<?, ?triple/s]

Step: 1
Metric: validation.head.optimistic.inverse_harmonic_mean_rank = 0.012906042211660406
Metric: validation.tail.optimistic.inverse_harmonic_mean_rank = 0.03473410601969134
Metric: validation.both.optimistic.inverse_harmonic_mean_rank = 0.023820074115675872
Metric: validation.head.realistic.inverse_harmonic_mean_rank = 0.01227752584964037
Metric: validation.tail.realistic.inverse_harmonic_mean_rank = 0.031580857932567596
Metric: validation.both.realistic.inverse_harmonic_mean_rank = 0.021929193288087845
Metric: validation.head.pessimistic.inverse_harmonic_mean_rank = 0.011888409472556621
Metric: validation.tail.pessimistic.inverse_harmonic_mean_rank = 0.02987064980162328
Metric: validation.both.pessimistic.inverse_harmonic_mean_rank = 0.020879529637089955
Metric: validation.head.optimistic.hits_at_1 = 0.0063959066197633516
Metric: validation.tail.optimistic.hits_at_1 = 0.016309561880396548
Metric: validation.both.optimistic.hits_at_1 = 0.01135273425007995
Metric: validation.head.re

In [14]:
print("Train error per epoch:")
df = pd.DataFrame(train_epoch)
print(df)
df.to_csv(f"{model_name}/{model_name}_train_error_per_epoch.csv")

Train error per epoch:
          0
0  2.726568


In [15]:
training_evaluation_start = time.time()
# train
print("Train error")
show_metrics(train_evaluator.evaluate(
        model=model,
        mapped_triples=dataset.transductive_training.mapped_triples,
        additional_filter_triples=[
        dataset.transductive_training.mapped_triples,
    ]
    ).to_dict(),model_name,'train_metrics')
training_evaluation_duration = time.time() - training_evaluation_start

Train error


Evaluating on cpu:   0%|          | 0.00/23.5k [00:00<?, ?triple/s]

head
                            optimistic  realistic  pessimistic
inverse_harmonic_mean_rank    0.003790   0.003236     0.002936
hits_at_1                     0.000681   0.000681     0.000681
hits_at_3                     0.002811   0.001022     0.001022
hits_at_5                     0.004642   0.002555     0.002044
hits_at_10                    0.006516   0.005579     0.005452
tail
                            optimistic  realistic  pessimistic
inverse_harmonic_mean_rank    0.009647   0.008171     0.007412
hits_at_1                     0.002385   0.001533     0.001533
hits_at_3                     0.003663   0.003663     0.003663
hits_at_5                     0.008731   0.004131     0.004131
hits_at_10                    0.014311   0.011926     0.007453
both
                            optimistic  realistic  pessimistic
inverse_harmonic_mean_rank    0.006718   0.005703     0.005174
hits_at_1                     0.001533   0.001107     0.001107
hits_at_3                     0.003237  

In [16]:
validation_evaluation_start = time.time()
# validation
print("Validation error")
show_metrics(valid_evaluator.evaluate(
        model=model,
        mapped_triples=dataset.inductive_validation.mapped_triples,
        additional_filter_triples=[
            # filtering of other positive triples
            dataset.inductive_inference.mapped_triples
        ],
    ).to_dict(),model_name,'validation_metrics')
validation_evaluation_duration = time.time() - validation_evaluation_start

Validation error


Evaluating on cpu:   0%|          | 0.00/3.13k [00:00<?, ?triple/s]

head
                            optimistic  realistic  pessimistic
inverse_harmonic_mean_rank    0.012906   0.012278     0.011888
hits_at_1                     0.006396   0.006396     0.006396
hits_at_3                     0.011832   0.011513     0.011513
hits_at_5                     0.013431   0.013112     0.013112
hits_at_10                    0.016310   0.016310     0.015670
tail
                            optimistic  realistic  pessimistic
inverse_harmonic_mean_rank    0.034734   0.031581     0.029871
hits_at_1                     0.016310   0.015990     0.015990
hits_at_3                     0.028142   0.019827     0.019827
hits_at_5                     0.031340   0.029421     0.028142
hits_at_10                    0.078989   0.071634     0.034538
both
                            optimistic  realistic  pessimistic
inverse_harmonic_mean_rank    0.023820   0.021929     0.020880
hits_at_1                     0.011353   0.011193     0.011193
hits_at_3                     0.019987  

In [17]:
testing_evaluation_start = time.time()
# result on the test set
print("Test error")
show_metrics(test_evaluator.evaluate(
        model=model,
        mapped_triples=dataset.inductive_testing.mapped_triples,
        additional_filter_triples=[
            # filtering of other positive triples
            dataset.inductive_inference.mapped_triples,
            dataset.inductive_validation.mapped_triples,
        ],
    ).to_dict(),model_name,'test_metrics')
testing_evaluation_duration = time.time() - testing_evaluation_start

Test error


Evaluating on cpu:   0%|          | 0.00/3.98k [00:00<?, ?triple/s]

head
                            optimistic  realistic  pessimistic
inverse_harmonic_mean_rank    0.013470   0.012746     0.012298
hits_at_1                     0.006533   0.006533     0.006533
hits_at_3                     0.013065   0.013065     0.013065
hits_at_5                     0.014573   0.014322     0.014322
hits_at_10                    0.017337   0.017337     0.017337
tail
                            optimistic  realistic  pessimistic
inverse_harmonic_mean_rank    0.029931   0.026884     0.025214
hits_at_1                     0.013065   0.012060     0.012060
hits_at_3                     0.022362   0.016583     0.016332
hits_at_5                     0.024372   0.023116     0.022111
hits_at_10                    0.071357   0.064070     0.025377
both
                            optimistic  realistic  pessimistic
inverse_harmonic_mean_rank    0.021701   0.019815     0.018756
hits_at_1                     0.009799   0.009296     0.009296
hits_at_3                     0.017714  

In [18]:
infodict = {}
infodict['device'] = model.device
infodict['parameters bytes'] = model.num_parameter_bytes
infodict['number parameters'] = model.num_parameters
infodict['training duration'] = training_duration
infodict['training evaluation duration'] = training_evaluation_duration
infodict['validation evaluation duration'] = validation_evaluation_duration
infodict['testing evaluation duration'] = testing_evaluation_duration
infodict["Operating system name"] = platform.system()
infodict["Operating system version"] = platform.release()
infodict["Processor architecture"] = platform.machine()
infodict["Python version"] = sys.version
infodict["Processor model name"] = cpuinfo.get_cpu_info()['brand_raw']
infodict['Number cpu cores'] = os.cpu_count()
infodict["Total physical memory"] = psutil.virtual_memory().total

In [19]:

output = subprocess.check_output(['nvidia-smi', '--query-gpu=name', '--format=csv'])
output = output.decode('utf-8')  # convert byte string to regular string

# split output into rows and remove header row
rows = output.strip().split('\n')[1:]

# extract GPU names from each row
gpu_names = []
for row in rows:
    name = row.strip()
    gpu_names.append(name)

infodict['GPU'] = gpu_names[0]


In [20]:
info_df = pd.DataFrame(columns=['name','value'], data = infodict.items())
info_df.to_csv(f"{model_name}/{model_name}_information.csv")
print(info_df)

                              name  \
0                           device   
1                 parameters bytes   
2                number parameters   
3                training duration   
4     training evaluation duration   
5   validation evaluation duration   
6      testing evaluation duration   
7            Operating system name   
8         Operating system version   
9           Processor architecture   
10                  Python version   
11            Processor model name   
12                Number cpu cores   
13           Total physical memory   

                                                value  
0                                                 cpu  
1                                               10400  
2                                                2600  
3                                           42.500668  
4                                           318.45924  
5                                           15.159921  
6                                    

In [22]:
def zip_folder(folder_path, output_path):
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                zipf.write(os.path.join(root, file))

folder_path = model_name
output_path = f'{model_name}.zip'

zip_folder(folder_path, output_path)