# Create Knowledge Graph and Its Features Table for Analysis

## modules

In [1]:
import numpy as np
import torch
import random
import json
import pandas as pd
from tqdm import tqdm
from pykeen.pipeline import pipeline
from pykeen.datasets import get_dataset
from pykeen.models import TransE, ComplEx, ConvE
from pykeen.losses import Loss
from pykeen.models.base import Model
from pykeen.triples import TriplesFactory

from util.databinder import DataBinder

  from tqdm.autonotebook import tqdm


## Variables, functions, classes

In [2]:
def create_false_triples(tf: "TriplesFactory", ratio: float = 0.1, random_seed: int = 0) -> "tuple[TriplesFactory, np.ndarray]":
    """Return a copy of ``tf`` in which a fraction of the triples is corrupted.

    For each selected triple either the head or the tail (chosen with equal
    probability) is replaced by a different entity ID. Relations are never
    touched, and ``tf`` itself is not modified (its triples are cloned first).

    Args:
        tf: Source triples factory.
        ratio: Fraction of triples to corrupt, in ``[0, 1]``. The number of
            corrupted triples is ``int(tf.num_triples * ratio)``.
        random_seed: Seed for the private random generator used here. The
            global NumPy random state is left untouched.

    Returns:
        A pair ``(factory, false_indices)``: the new factory holding the
        corrupted triples (same ID mappings and inverse-triple setting as
        ``tf``) and a NumPy array with the distinct row indices that were
        corrupted.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]``, or if triples must be
            corrupted but fewer than two entities exist.

    Note:
        A corrupted triple is guaranteed to differ from the row it replaces,
        but it is not checked against the other triples in ``tf``; it may
        coincide with another existing triple.
    """
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(f"ratio must be within [0, 1], got {ratio}")

    num_false = int(tf.num_triples * ratio)
    num_entities = tf.num_entities
    if num_false > 0 and num_entities < 2:
        raise ValueError("at least two entities are required to corrupt a triple")

    # A local generator keeps the result reproducible without reseeding
    # NumPy's global state for the rest of the notebook.
    rng = np.random.default_rng(random_seed)

    false_triples = tf.mapped_triples.clone()
    # Sample WITHOUT replacement: otherwise the same row can be drawn twice and
    # fewer than `num_false` triples would actually be corrupted.
    false_indices = rng.choice(tf.num_triples, size=num_false, replace=False)
    for i in false_indices:
        i = int(i)
        column = 0 if rng.random() < 0.5 else 2  # 0 = head, 2 = tail
        old_entity = int(false_triples[i, column])
        # Draw from the other `num_entities - 1` IDs and skip over the current
        # one, so the replacement is always a different entity.
        new_entity = int(rng.integers(num_entities - 1))
        if new_entity >= old_entity:
            new_entity += 1
        false_triples[i, column] = new_entity
    return TriplesFactory(false_triples, tf.entity_to_id, tf.relation_to_id, create_inverse_triples=tf.create_inverse_triples), false_indices

def create_triples_feature_table(tf: "TriplesFactory") -> pd.DataFrame:
    """Build a per-triple table of labels and entity occurrence counts.

    Args:
        tf: Triples factory whose ``mapped_triples`` hold one
            (head ID, relation ID, tail ID) row per triple.

    Returns:
        A DataFrame with one row per triple, in the same order as
        ``tf.mapped_triples``, and the columns:

        * ``head``, ``relation``, ``tail`` - the labels of the IDs.
        * ``head_degree`` - number of triples whose head is this row's head.
        * ``tail_degree`` - number of triples whose tail is this row's tail.
    """
    triples = tf.mapped_triples.numpy()
    heads, relations, tails = triples[:, 0], triples[:, 1], triples[:, 2]

    head_labels = [tf.entity_id_to_label[h] for h in heads]
    relation_labels = [tf.relation_id_to_label[r] for r in relations]
    tail_labels = [tf.entity_id_to_label[t] for t in tails]

    # Count every ID once, then look the count up per row. This replaces a
    # full-array scan per triple (quadratic) with a single linear pass.
    head_degrees = np.bincount(heads)[heads]
    tail_degrees = np.bincount(tails)[tails]

    return pd.DataFrame({
        'head': head_labels,
        'relation': relation_labels,
        'tail': tail_labels,
        'head_degree': head_degrees,
        'tail_degree': tail_degrees,
    })

## Parameters

In [3]:
# --- Input / output locations -------------------------------------------
# Directory handed to DataBinder when the trained model is opened.
dir_model = './models/20240628_distmultliteral'
# Directory handed to DataBinder when the generated data is saved.
dir_save = './data/processed/20240628_false_fb15k237_with_lit'

# --- Corruption settings ------------------------------------------------
# Passed as `ratio` to create_false_triples.
false_ratio = 0.1
# One corrupted copy of the graph is generated per seed (1 through 10).
list_random_seed = list(range(1, 11))

## 1. Load knowledge graph

In [4]:
# Open the DataBinder located at `dir_model`.
# NOTE(review): `db_model` is not read by any later cell shown here - confirm
# whether a cell that pulled values out of it was deleted.
db_model = DataBinder(target_dir=dir_model)

INFO:root:Loaded info from ./models/20240628_distmultliteral/info.json


In [4]:
# Load the knowledge graph by name, with inverse triples enabled.
# NOTE(review): `name_kg` is not assigned in any cell shown in this notebook,
# so this line raises NameError on a fresh kernel ("Restart & Run All").
# Presumably it was defined in a cell that has since been removed - define it
# explicitly (e.g. in the Parameters cell) before this call.
dataset = get_dataset(dataset=name_kg,dataset_kwargs={'create_inverse_triples':True})

INFO:pykeen.datasets.utils:Loading cached preprocessed dataset from file:///home/acg16558pn/.data/pykeen/datasets/kinships/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
INFO:pykeen.triples.triples_factory:Loading from file:///home/acg16558pn/.data/pykeen/datasets/kinships/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/training
INFO:pykeen.triples.triples_factory:Loading from file:///home/acg16558pn/.data/pykeen/datasets/kinships/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/testing
INFO:pykeen.triples.triples_factory:Loading from file:///home/acg16558pn/.data/pykeen/datasets/kinships/cache/47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM/validation


In [5]:
# Print the per-split entity / relation / triple counts and a few sample triples.
dataset.summarize()

EagerDataset (create_inverse_triples=True)
Name        Entities    Relations      Triples
----------  ----------  -----------  ---------
Training    104         50                8544
Testing     104         25                1074
Validation  104         25                1068
Total       -           -                10686
Head     Relation    tail
-------  ----------  --------
person0  term0       person45
person0  term10      person51
person0  term10      person52
person0  term10      person57
person0  term10      person58



## 2. Create false triples and their feature tables

In [6]:
# Use the training split as the set of true triples.
# (Assign `dataset.testing` here instead to work on the testing split.)
tf_tt = dataset.training
# Feature table of the uncorrupted triples; reused for every random seed below.
df_tt_features = create_triples_feature_table(tf_tt)

In [7]:
# For every random seed, build one corrupted copy of the true triples and a
# feature table that places the original and the corrupted triple side by side.
dict_data = {}

# The original-side columns are the same for every seed, so they are suffixed
# once: 'head' -> 'head(org)', 'head_degree' -> 'head_degree(org)', and so on.
# This keeps them distinct from the corrupted-side columns after the concat.
df_tt_org = df_tt_features.add_suffix('(org)')

for random_seed in tqdm(list_random_seed):

    tf_tf, false_indices = create_false_triples(tf_tt, ratio=false_ratio, random_seed=random_seed)
    df_tf_features = create_triples_feature_table(tf_tf)

    # Both tables have one row per triple in the same order, so a column-wise
    # concat aligns each original triple with its (possibly corrupted) copy.
    df_tt_tf_features = pd.concat([df_tt_org, df_tf_features], axis=1)
    # Flag the rows that create_false_triples selected for corruption.
    # Index.isin is vectorised; a per-row `idx in false_indices` test rescans
    # the whole index array for every row.
    df_tt_tf_features['is-error'] = df_tt_tf_features.index.isin(false_indices)
    # Combined degree of the corrupted-side triple.
    df_tt_tf_features['degree'] = df_tt_tf_features['head_degree'] + df_tt_tf_features['tail_degree']

    dict_data[random_seed] = {
        'tf': tf_tf,
        'df_feature': df_tt_tf_features,
    }

100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.99it/s]


## 3. Save data

In [8]:
# Persist the run settings and every per-seed artefact under `dir_save`.
db = DataBinder(target_dir=dir_save)

# Settings needed to interpret or reproduce the generated data.
for key, value in (
    ('false_ratio', false_ratio),
    ('list_random_seed', list_random_seed),
    ('name_kg', name_kg),
):
    db.add(key, value)

# Per seed: the corrupted triples factory, then its feature table.
for seed, bundle in dict_data.items():
    db.add(f'tf_{seed}', bundle['tf'])
    db.add(f'df_tt_tf_features_{seed}', bundle['df_feature'])

INFO:root:Create ./data/processed/20240622_false_kinships_based_on_training_data
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:28:31
INFO:root:Saved info at 2024-06-22 07:2