# Create Knowledge Graph and Its Features Table for Analysis

## modules

In [1]:
import numpy as np
import torch
import random
import spacy
import pandas as pd
from pykeen.pipeline import pipeline
from pykeen.datasets import get_dataset
from pykeen.models import TransE, ComplEx, ConvE
from pykeen.losses import Loss
from pykeen.models.base import Model
from pykeen.triples import TriplesFactory
from pykeen.triples import TriplesNumericLiteralsFactory
from tqdm import tqdm
from util.databinder import DataBinder

  from tqdm.autonotebook import tqdm


## variable, functions, classes

In [2]:
def sample_triples(tf:TriplesFactory, ratio:float=0.01) -> TriplesFactory:
    num_samples = int(tf.num_triples * ratio)
    sampled_indices = np.random.choice(tf.num_triples, size=num_samples, replace=False)
    sampled_tf = TriplesFactory.from_labeled_triples(tf.triples[sampled_indices])
    return sampled_tf

def create_false_triples(tf:TriplesFactory, ratio:float=0.1) -> TriplesFactory:
    num_false = int(tf.num_triples * ratio)
    false_triples = tf.mapped_triples.clone()
    false_indices = np.random.choice(tf.num_triples, num_false)
    for i in false_indices:
        if np.random.random() < 0.5:  # Replace head
            false_triples[i, 0] = np.random.choice(tf.num_entities)
        else:  # Replace tail
            false_triples[i, 2] = np.random.choice(tf.num_entities)
    return TriplesFactory(false_triples, tf.entity_to_id, tf.relation_to_id), false_indices

def create_triples_feature_table(tf:TriplesFactory) -> pd.DataFrame:
    triples = tf.mapped_triples.numpy()
    head_labels = [tf.entity_id_to_label[h] for h in triples[:, 0]]
    relation_labels = [tf.relation_id_to_label[r] for r in triples[:, 1]]
    tail_labels = [tf.entity_id_to_label[t] for t in triples[:, 2]]
    
    # Calculate degrees
    head_degrees = np.array([np.sum(triples[:, 0] == h) for h in triples[:, 0]])
    tail_degrees = np.array([np.sum(triples[:, 2] == t) for t in triples[:, 2]])
    
    return pd.DataFrame({
        'head': head_labels,
        'relation': relation_labels,
        'tail': tail_labels,
        'head_degree': head_degrees,
        'tail_degree': tail_degrees,
    })

## Parameters

In [3]:
name_kg = 'Wikidata5M'
sampling_ratio = 1e-3
f_sr_description = './data/raw/sr_wikidata5m_text.pkl'
dir_save = './data/processed/20240616_sampled_wikidata_5m'
name_nlp = 'en_core_web_lg'

## 1. Load knowledge graph

In [4]:
dataset = get_dataset(dataset=name_kg)

In [5]:
dataset.summarize()

EagerDataset (create_inverse_triples=False)
Name        Entities    Relations      Triples
----------  ----------  -----------  ---------
Training    4594149     822           20614279
Testing     4594149     822               4977
Validation  4594149     822               4983
Total       -           -             20624239
Head    Relation    tail
------  ----------  --------
Q1      P1343       Q602358
Q1      P1419       Q1647152
Q1      P1552       Q11412
Q1      P2184       Q136407
Q1      P2670       Q18343



Use only training data set

In [6]:
tf_org = dataset.training

## 2. Sampling Knowledge Graph

In [7]:
tf = sample_triples(tf_org, ratio=sampling_ratio)

Reconstructing all label-based triples. This is expensive and rarely needed.


## 3. Create TriplesNumericLiteralsFactory

For detail, please see [TriplesNumericLiteralsFactory](https://pykeen.readthedocs.io/en/stable/reference/triples.html#pykeen.triples.TriplesNumericLiteralsFactory)

In [8]:
n_entity = len(tf.entity_id_to_label)

nlp = spacy.load(name_nlp)
dim_embedding = nlp.vocab.vectors_length

numeric_literals = np.zeros((n_entity, dim_embedding))

sr_description = pd.read_pickle(f_sr_description)
for _id, _label in tqdm(tf.entity_id_to_label.items()):
    text  = sr_description.loc[_label]
    numeric_literals[_id, :] = nlp(text).vector

100%|█████████████████████████████████████████████████████████████████████████████| 29642/29642 [08:29<00:00, 58.20it/s]


In [9]:
tlf = TriplesNumericLiteralsFactory(mapped_triples = tf.mapped_triples, 
                                    entity_to_id = tf.entity_to_id, 
                                    relation_to_id = tf.relation_to_id,
                                    numeric_literals=numeric_literals,
                                    literals_to_id=tf.entity_to_id
                                   )

## 4. Save data

In [12]:
db = DataBinder(target_dir=dir_save)
db.add('tf', tf)
db.add('tlf', tlf)
db.add('sampling_ratio', sampling_ratio)
db.add('embedding model', name_nlp)
db.add('knowledge_graph_name', name_kg)

'Wikidata5M'