# Write optimized parameters for learning knowledge graph embeddings
For the optimized parameters, please refer to [the PyKEEN benchmarking repository on GitHub](https://github.com/pykeen/benchmarking).


## modules

In [12]:
from pykeen.pipeline import pipeline
from pykeen.datasets import get_dataset
import pandas as pd
import numpy as np
import yaml
import os
from util.databinder import DataBinder 

## functions

In [2]:
def convert_dtype(value):
    """Convert a NumPy scalar to the equivalent built-in Python type.

    Values read out of a pandas/NumPy container are often ``np.float64``,
    ``np.int64`` or ``np.bool_``.  Serializers such as YAML do not write those
    as plain scalars, so they are mapped to ``float``, ``int`` and ``bool``.

    Args:
        value: Any object.

    Returns:
        ``bool``/``float``/``int`` for NumPy boolean/floating/integer scalars;
        every other value is returned unchanged.
    """
    # np.bool_ is neither np.floating nor np.integer, so it needs its own branch.
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, np.floating):
        return float(value)
    if isinstance(value, np.integer):
        return int(value)
    return value

## parameters

In [3]:
# Output directory for the generated parameter sets.
dir_output = './tmp/20240627_params_for_learn_kge'

# Random seeds 1..10 (inclusive).
random_seeds = list(range(1, 11))

## main

In [4]:
# Pickled DataFrame holding the best hyper-parameter configurations
# (presumably derived from the PyKEEN benchmarking results -- confirm the provenance).
# NOTE(review): unpickling executes code embedded in the file; only load it from a trusted source.
f_params = '../benchmarking/df_best_param.pkl'
# reset_index() gives every row a fresh 0..n-1 label and keeps the stored
# index as a regular 'index' column.
df_best_params_org = pd.read_pickle(f_params).reset_index()

In [5]:
# Keep only the rows whose model is TransE.
is_target_model = df_best_params_org['model'].isin(['transe'])
df_best_params = df_best_params_org.loc[is_target_model]

In [6]:
# Keep only the rows for the three datasets of interest.
target_datasets = ['fb15k237', 'kinships', 'wn18rr']
is_target_dataset = df_best_params['dataset'].isin(target_datasets)
df_best_params = df_best_params.loc[is_target_dataset]

In [7]:
# Display the filtered table for a visual sanity check.
df_best_params 

Unnamed: 0,index,dataset,evaluator,loss,metric,model,optimizer,pykeen_git_hash,pykeen_version,regularizer,...,pipeline_config.pipeline.model_kwargs.embedding_height,pipeline_config.pipeline.model_kwargs.embedding_width,pipeline_config.pipeline.model_kwargs.input_channels,pipeline_config.pipeline.model_kwargs.kernel_height,pipeline_config.pipeline.model_kwargs.kernel_width,hpo.pipeline.model_kwargs.input_channels,hpo.pipeline.model_kwargs.embedding_height,hpo.pipeline.model_kwargs.embedding_width,hpo.pipeline.model_kwargs.kernel_height,hpo.pipeline.model_kwargs.kernel_width
0,0,fb15k237,rankbased,crossentropy,hits@10,transe,adam,ec6b0751,0.1.2-dev,no,...,,,,,,,,,,
1,0,kinships,rankbased,crossentropy,hits@10,transe,adadelta,c414b0a6,0.1.2-dev,no,...,,,,,,,,,,
2,0,wn18rr,rankbased,softplus,hits@10,transe,adam,56e46b3e,0.1.2-dev,no,...,,,,,,,,,,


In [8]:
# Display the columns whose name matches 'hpo.pipeline.stopper', transposed so
# that each setting is a row and each selected configuration is a column.
df_best_params.filter(regex='hpo.pipeline.stopper').T

Unnamed: 0,0,1,2
hpo.pipeline.stopper,early,early,early
hpo.pipeline.stopper_kwargs.frequency,50,50,50
hpo.pipeline.stopper_kwargs.patience,2,2,2
hpo.pipeline.stopper_kwargs.delta,0.002,0.002,0.002


In [9]:
def _is_missing(value):
    """Return True when *value* is a scalar missing-value marker (NaN/None/NaT).

    ``pd.isna`` is used instead of ``np.isnan`` because the latter raises
    ``TypeError`` for non-numeric values such as strings or ``None``.
    """
    missing = pd.isna(value)
    # pd.isna returns an array for list-like input; a container is not "missing".
    if isinstance(missing, (bool, np.bool_)):
        return bool(missing)
    return False


def _collect_kwargs(frame, idx, prefix, skip_keys=(), int_keys=()):
    """Gather the non-missing ``<prefix>.<name>`` columns of one row as a dict.

    Args:
        frame: DataFrame holding one configuration per row.
        idx: Index label of the row to read.
        prefix: Literal text a column name must contain to be selected.
        skip_keys: Keyword names (last dotted segment) to leave out.
        int_keys: Keyword names whose value is coerced to ``int``.

    Returns:
        Dict mapping the last dotted segment of each selected column name to
        its value, converted to a built-in Python type where possible.
    """
    collected = {}
    # like= matches the prefix literally; as a regex its dots would match any character.
    for column, value in frame.filter(like=prefix).loc[idx].items():
        if _is_missing(value):
            continue
        key = column.split('.')[-1]
        if key in skip_keys:
            continue
        if key in int_keys:
            # Row extraction upcasts integers to float when the selected
            # columns have mixed numeric dtypes or contain NaN.
            value = int(value)
        collected[key] = convert_dtype(value)
    return collected


# Common prefix of the columns that store the pipeline's keyword arguments.
_PIPELINE_PREFIX = 'pipeline_config.pipeline.'

# Model settings that must be integers.
# NOTE(review): the embedding_*/input_channels names were added because they
# appear as integer-like model columns in the table -- confirm against the models used.
_MODEL_INT_KEYS = (
    'output_channels', 'kernel_height', 'kernel_width',
    'embedding_height', 'embedding_width', 'input_channels',
)

# One argument dict per (configuration row, random seed) pair.
list_args = []
for idx in df_best_params.index:
    for random_seed in random_seeds:

        # Built from scratch for every seed so that no nested dict is shared
        # between two entries of list_args.
        dict_args = {}

        dict_args['dataset'] = df_best_params.loc[idx, 'dataset']
        dict_args['dataset_kwargs'] = _collect_kwargs(
            df_best_params, idx, _PIPELINE_PREFIX + 'dataset_kwargs')

        dict_args['evaluator'] = df_best_params.loc[idx, 'evaluator']
        dict_args['evaluator_kwargs'] = _collect_kwargs(
            df_best_params, idx, _PIPELINE_PREFIX + 'evaluator_kwargs')

        dict_args['model'] = df_best_params.loc[idx, 'model']
        dict_args['loss'] = df_best_params.loc[idx, 'loss']
        dict_args['regularizer'] = df_best_params.loc[idx, 'regularizer']

        dict_args['optimizer'] = df_best_params.loc[idx, 'optimizer']
        dict_args['optimizer_kwargs'] = _collect_kwargs(
            df_best_params, idx, _PIPELINE_PREFIX + 'optimizer_kwargs',
            skip_keys=('automatic_memory_optimization',))

        dict_args['model_kwargs'] = _collect_kwargs(
            df_best_params, idx, _PIPELINE_PREFIX + 'model_kwargs',
            skip_keys=('automatic_memory_optimization',),
            int_keys=_MODEL_INT_KEYS)

        dict_args['training_loop'] = df_best_params.loc[idx, 'training_loop']
        dict_args['training_kwargs'] = _collect_kwargs(
            df_best_params, idx, _PIPELINE_PREFIX + 'training_kwargs',
            skip_keys=('label_smoothing',),
            int_keys=('batch_size', 'num_epochs'))

        dict_args['random_seed'] = random_seed

        list_args.append(dict_args)

In [10]:
# Store the generated argument list under dir_output through the project's
# DataBinder helper (imported from util.databinder).
db = DataBinder(target_dir=dir_output)
# Kept as the cell's last expression so the notebook echoes add()'s return
# value (presumably the path of the saved file -- confirm in DataBinder).
db.add('params',list_args)

INFO:root:Loaded info from ./tmp/20240627_params_for_learn_kge/info.json
INFO:root:Saved info at 2024-06-27 05:45:24


'./tmp/20240627_params_for_learn_kge/params.pt'

In [13]:
# Write every argument dict to its own YAML file, named by its position in list_args.
# open(..., 'w') does not create missing parent directories, so make sure
# ./params exists before the first file is written.
os.makedirs('./params', exist_ok=True)
for i, dict_args in enumerate(list_args):
    with open(f'./params/{i}.yaml', 'w') as fout:
        yaml.dump(dict_args, fout)