# Knowledge Graph Embedding with Literals

## modules

In [1]:
import pandas as pd
import numpy as np
from pykeen.pipeline import pipeline
from pykeen.datasets import get_dataset
from pykeen.models import DistMultLiteral
from util.databinder import DataBinder

  from tqdm.autonotebook import tqdm


In [2]:
def convert_dtype(value):
    """Turn a NumPy floating/integer scalar into the matching Python built-in.

    NumPy floating scalars become ``float`` and NumPy integer scalars become
    ``int``; every other value is handed back unchanged.
    """
    for numpy_kind, builtin in ((np.floating, float), (np.integer, int)):
        if isinstance(value, numpy_kind):
            return builtin(value)
    return value

## parameters

In [3]:
# for data
# directory of the processed dataset that DataBinder loads
dir_data = './data/processed/20240627_fb15k237lit'

# for model
## model with literals
## should be either distmultliteral or complexliteral
name_model_with_lit = 'distmultliteral'

## model without literals
name_model_without_lit = 'distmult'

# for training
# NOTE(review): num_epochs is not passed to the pipeline() calls visible in
# this notebook (they use dict_args['training_kwargs']) - confirm it is needed
num_epochs = 100

# for output
# directory where the trained models and run metadata are written
dir_model = './models/20240628_distmultliteral'

## main

### load data

In [4]:
# Open the processed dataset directory with the project's DataBinder helper;
# the triples factories are fetched from it later via db.get(...).
db = DataBinder(target_dir=dir_data)

INFO:root:Loaded info from ./data/processed/20240627_fb15k237lit/info.json


### best parameters

In [5]:
# Table of tuned hyperparameters produced by the benchmarking step.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that were produced by this project.
f_params = '../benchmarking/df_best_param.pkl'
# reset_index() moves the stored index into an ordinary 'index' column.
df_best_params_org = pd.read_pickle(f_params).reset_index()

In [6]:
# Show which models have tuned parameters in the table.
df_best_params_org['model'].unique()

array(['transe', 'complex', 'conve'], dtype=object)

In [7]:
# Keep only the rows tuned for ComplEx.
# NOTE(review): the models trained below are DistMult variants
# (name_model_with_lit / name_model_without_lit), so these hyperparameters are
# borrowed from ComplEx - confirm this is intended.
df_best_params = df_best_params_org[df_best_params_org['model'].isin(['complex'])]

In [8]:
# Further restrict the selection to the rows tuned on the fb15k237 dataset.
df_best_params = df_best_params[df_best_params['dataset'].isin(['fb15k237'])]

In [9]:
# Transpose so the selected row is displayed as parameter/value pairs.
df_best_params.T

Unnamed: 0,3
index,0
dataset,fb15k237
evaluator,rankbased
loss,crossentropy
metric,hits@10
...,...
hpo.pipeline.model_kwargs.input_channels,
hpo.pipeline.model_kwargs.embedding_height,
hpo.pipeline.model_kwargs.embedding_width,
hpo.pipeline.model_kwargs.kernel_height,


In [10]:
def _collect_kwargs(frame, row_label, prefix, *, skip_columns_containing=(),
                    skip_keys=(), int_keys=()):
    """Gather the non-missing ``<prefix>.<name>`` cells of one row into a dict.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table whose column names are dot-separated parameter paths.
    row_label
        Index label of the row to read.
    prefix : str
        Literal text a column name must contain to be considered.
    skip_columns_containing : iterable of str
        A column is ignored when its full name contains any of these strings.
    skip_keys : iterable of str
        Short names (last dot-separated segment) that are ignored.
    int_keys : iterable of str
        Short names whose values are cast to ``int``.

    Returns
    -------
    dict
        Short parameter name -> value, normalised with ``convert_dtype``.
    """
    collected = {}
    # ``like=`` is a literal substring match, so the dots in ``prefix`` are not
    # interpreted as regex wildcards.
    for column, value in frame.filter(like=prefix).loc[row_label].items():
        # pd.isna also accepts strings and None, for which np.isnan raises
        # TypeError.
        if pd.isna(value):
            continue
        if any(fragment in column for fragment in skip_columns_containing):
            continue
        key = column.split('.')[-1]
        if key in skip_keys:
            continue
        if key in int_keys:
            value = int(value)
        collected[key] = convert_dtype(value)
    return collected


if df_best_params.empty:
    raise ValueError(
        'df_best_params is empty: no tuned parameters match the selected '
        'model/dataset'
    )

# Use the first (best) row of the selection as the source of all settings.
idx = df_best_params.index[0]
_prefix = 'pipeline_config.pipeline'

# Keyword arguments for pykeen's pipeline(), rebuilt from the flattened
# '<prefix>.<group>.<name>' columns of the tuned-parameter table.
dict_args = {}

dict_args['dataset'] = df_best_params.loc[idx, 'dataset']
dict_args['dataset_kwargs'] = _collect_kwargs(
    df_best_params, idx, f'{_prefix}.dataset_kwargs')

dict_args['evaluator'] = df_best_params.loc[idx, 'evaluator']
dict_args['evaluator_kwargs'] = _collect_kwargs(
    df_best_params, idx, f'{_prefix}.evaluator_kwargs')

dict_args['model'] = df_best_params.loc[idx, 'model']

dict_args['loss'] = df_best_params.loc[idx, 'loss']

dict_args['regularizer'] = df_best_params.loc[idx, 'regularizer']

dict_args['optimizer'] = df_best_params.loc[idx, 'optimizer']
dict_args['optimizer_kwargs'] = _collect_kwargs(
    df_best_params, idx, f'{_prefix}.optimizer_kwargs',
    skip_columns_containing=('automatic_memory_optimization',))

dict_args['model_kwargs'] = _collect_kwargs(
    df_best_params, idx, f'{_prefix}.model_kwargs',
    skip_columns_containing=('automatic_memory_optimization',),
    int_keys=('output_channels', 'kernel_height', 'kernel_width'))

dict_args['training_loop'] = df_best_params.loc[idx, 'training_loop']
# label_smoothing is deliberately left out of the training arguments.
dict_args['training_kwargs'] = _collect_kwargs(
    df_best_params, idx, f'{_prefix}.training_kwargs',
    skip_keys=('label_smoothing',),
    int_keys=('batch_size', 'num_epochs'))

In [11]:
# Inspect the optimizer settings extracted from the tuned parameters.
dict_args['optimizer_kwargs']

{'lr': 0.0075250677442329, 'weight_decay': 0.0}

### calculate embeddings

In [13]:
# For a quick smoke test, set dict_args['training_kwargs']['num_epochs'] to a
# small value (e.g. 5) before calling pipeline().

# Replace the tuned learning rate with a fixed 0.001.
# NOTE: this mutates the shared dict_args in place, so every later cell that
# reads dict_args['optimizer_kwargs'] also sees lr=0.001.
dict_args['optimizer_kwargs']['lr'] = 0.001

# Train and evaluate the literal-aware model on the triples factories with
# literals ('tlf_*'). Early stopping evaluates every 10 epochs and stops after
# 2 evaluations without a relative improvement of at least 0.002.
# (Passing only training/testing/validation/model instead would run the model
# with PyKEEN's default hyperparameters.)
pipeline_result_with_lit = pipeline(
    training=db.get('tlf_train'),
    testing=db.get('tlf_test'),
    validation=db.get('tlf_valid'),
    model=name_model_with_lit,
    evaluator=dict_args['evaluator'],
    evaluator_kwargs=dict_args['evaluator_kwargs'],
    loss=dict_args['loss'],
    model_kwargs=dict_args['model_kwargs'],
    training_kwargs=dict_args['training_kwargs'],
    optimizer=dict_args['optimizer'],
    optimizer_kwargs=dict_args['optimizer_kwargs'],
    stopper='early',
    stopper_kwargs={'frequency': 10, 'patience': 2, 'relative_delta': 0.002},
)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /home/acg16558pn/.data/pykeen/checkpoints/best-model-weights-5bdfde6b-2d85-47d2-9e48-edea8763aa94.pt


Training epochs on cuda:0:   0%|          | 0/151 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=256.
INFO:pykeen.evaluation.evaluator:Evaluation took 3.44s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.265491270112975. Saved model weights to /home/acg16558pn/.data/pykeen/checkpoints/best-model-weights-5bdfde6b-2d85-47d2-9e48-edea8763aa94.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.40s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 20: 0.2890277302293735. Saved model weights to /home/acg16558pn/.data/pykeen/checkpoints/best-model-weights-5bdfde6b-2d85-47d2-9e48-edea8763aa94.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 20.


Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.26s seconds


Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.26s seconds
INFO:pykeen.stoppers.early_stopping:Stopping early at epoch 40. The best result 0.2890277302293735 occurred at epoch 20.
INFO:pykeen.stoppers.early_stopping:Re-loading weights from best epoch from /home/acg16558pn/.data/pykeen/checkpoints/best-model-weights-5bdfde6b-2d85-47d2-9e48-edea8763aa94.pt
INFO:root:When evaluating the test dataset after running the pipeline with early stopping, the validation triples are added to the set of known positive triples which are filtered out when performing filtered evaluation following the approach described by (Bordes et al., 2013).


Evaluating on cuda:0:   0%|          | 0.00/20.4k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.81s seconds


"\npipeline_result_with_lit = pipeline(\n    training=db.get('tlf_train'),\n    testing=db.get('tlf_test'),\n    validation=db.get('tlf_valid'),\n    model=name_model_with_lit)\n"

In [15]:
# Train and evaluate the baseline model without literals on the plain triples
# factories ('tf_*'), using the same dict_args settings (as they stand when
# this cell runs) and the same early-stopping configuration as the
# literal-aware run, so the two results are comparable.
# (An alternative baseline is PyKEEN's built-in dataset via
# get_dataset(dataset='fb15k237') with default hyperparameters.)
pipeline_result_without_lit = pipeline(
    training=db.get('tf_train'),
    testing=db.get('tf_test'),
    validation=db.get('tf_valid'),
    model=name_model_without_lit,
    evaluator=dict_args['evaluator'],
    evaluator_kwargs=dict_args['evaluator_kwargs'],
    loss=dict_args['loss'],
    model_kwargs=dict_args['model_kwargs'],
    training_kwargs=dict_args['training_kwargs'],
    optimizer=dict_args['optimizer'],
    optimizer_kwargs=dict_args['optimizer_kwargs'],
    stopper='early',
    stopper_kwargs={'frequency': 10, 'patience': 2, 'relative_delta': 0.002},
)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /home/acg16558pn/.data/pykeen/checkpoints/best-model-weights-8ad80b58-7246-4a4c-9ecc-bd7fca6b8547.pt


Training epochs on cuda:0:   0%|          | 0/151 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=256.
INFO:pykeen.evaluation.evaluator:Evaluation took 3.17s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.23199817414127583. Saved model weights to /home/acg16558pn/.data/pykeen/checkpoints/best-model-weights-8ad80b58-7246-4a4c-9ecc-bd7fca6b8547.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.18s seconds


Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.18s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 30: 0.24669063106242153. Saved model weights to /home/acg16558pn/.data/pykeen/checkpoints/best-model-weights-8ad80b58-7246-4a4c-9ecc-bd7fca6b8547.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 30.


Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.18s seconds


Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/1063 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.17s seconds
INFO:pykeen.stoppers.early_stopping:Stopping early at epoch 50. The best result 0.24669063106242153 occurred at epoch 30.
INFO:pykeen.stoppers.early_stopping:Re-loading weights from best epoch from /home/acg16558pn/.data/pykeen/checkpoints/best-model-weights-8ad80b58-7246-4a4c-9ecc-bd7fca6b8547.pt
INFO:root:When evaluating the test dataset after running the pipeline with early stopping, the validation triples are added to the set of known positive triples which are filtered out when performing filtered evaluation following the approach described by (Bordes et al., 2013).


Evaluating on cuda:0:   0%|          | 0.00/20.4k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.73s seconds


"\nkg = get_dataset(dataset='fb15k237')\n\npipeline_result_without_lit = pipeline(\n    training=kg.training,\n    testing=kg.testing,\n    validation=kg.validation,\n    model=name_model_without_lit,\n    )\n"

In [14]:
# Hits@10 reported by the final evaluation of the literal-aware model.
pipeline_result_with_lit.get_metric('hits_at_10')

0.3000293570799491

In [16]:
# Hits@10 reported by the final evaluation of the model without literals.
pipeline_result_without_lit.get_metric('hits_at_10')

0.2566542714551326

In [17]:
# Persist the run: model names, data location and both trained models go into
# a DataBinder rooted at dir_model, then each full pipeline result is written
# to its own sub-directory.
db_model = DataBinder(target_dir=dir_model)

artifacts = {
    'name_model_with_lit': name_model_with_lit,
    'name_model_without_lit': name_model_without_lit,
    'dir_data': dir_data,
    'model_with_lit': pipeline_result_with_lit.model,
    'model_without_lit': pipeline_result_without_lit.model,
}
for label, payload in artifacts.items():
    db_model.add(label, payload)

results_by_subdir = (
    ('model_with_lit', pipeline_result_with_lit),
    ('model_without_lit', pipeline_result_without_lit),
)
for subdir, pipeline_result in results_by_subdir:
    pipeline_result.save_to_directory(f'{dir_model}/{subdir}')

INFO:root:Create ./models/20240628_distmultliteral
INFO:root:Saved info at 2024-06-27 05:36:30
INFO:root:Saved info at 2024-06-27 05:36:30
INFO:root:Saved info at 2024-06-27 05:36:30
INFO:root:Saved info at 2024-06-27 05:36:30
INFO:root:Saved info at 2024-06-27 05:36:30
INFO:pykeen.triples.triples_factory:Stored TriplesNumericLiteralsFactory(num_entities=14505, num_relations=237, create_inverse_triples=False, num_triples=272115, num_literals=14505) to file:///home/acg16558pn/programs/Analysis-of-conventional-refinement-method-for-knowledge-graph/models/20240628_distmultliteral/model_with_lit/training_triples
INFO:pykeen.pipeline.api:Saved to directory: file:///home/acg16558pn/programs/Analysis-of-conventional-refinement-method-for-knowledge-graph/models/20240628_distmultliteral/model_with_lit
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=14505, num_relations=237, create_inverse_triples=False, num_triples=272115, path=Release/train.txt) to file:///home/acg16558p