### Techniques to convert KGC triples into plain text

We will follow [From Discrimination to Generation: Knowledge Graph Completion with Generative Transformer](https://arxiv.org/pdf/2202.02113.pdf).

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext jupyter_black

In [None]:
cd ..

In [None]:
# Load data
from src.utils import load_fb15k237, load_wn18rr, get_hist

# Dataset locations (relative to the current working directory).
PATH_FB15k = "data/datasets_knowledge_embedding/FB15k"
PATH_FB15k237 = "data/datasets_knowledge_embedding/FB15k-237"
PATH_WN18 = "data/datasets_knowledge_embedding/WN18/text"
PATH_WN18RR = "data/datasets_knowledge_embedding/WN18RR/text"

# Freebase benchmarks: both go through the same loader, which returns four
# values. The fourth one is discarded for FB15k and kept as
# `entity2wikidata` for FB15k-237.
(train_fb15k, valid_fb15k, test_fb15k, _) = load_fb15k237(PATH_FB15k)
(
    train_fb15k237,
    valid_fb15k237,
    test_fb15k237,
    entity2wikidata,
) = load_fb15k237(PATH_FB15k237)

# WordNet benchmarks: both go through the same loader (three splits each).
(train_wn18, valid_wn18, test_wn18) = load_wn18rr(PATH_WN18)
(train_wn18rr, valid_wn18rr, test_wn18rr) = load_wn18rr(PATH_WN18RR)

In [None]:
# Filter data without entity description
import pandas as pd

# Column assignments made later on the filtered frames rely on copy-on-write.
pd.options.mode.copy_on_write = True

# Entities that have an entry in `entity2wikidata`; triples whose head or
# tail is not listed here are dropped from the Freebase datasets.
df_entity = pd.DataFrame(entity2wikidata.keys(), columns=["head"])
known_entities = df_entity["head"]

# FB15k
all_data_fb15k = pd.concat([train_fb15k, valid_fb15k, test_fb15k], axis=0)
fb15k_head_known = all_data_fb15k["head"].isin(known_entities)
fb15k_tail_known = all_data_fb15k["tail"].isin(known_entities)
all_data_fb15k = all_data_fb15k[fb15k_head_known & fb15k_tail_known]

# FB15k-237
all_data_fb15k237 = pd.concat([train_fb15k237, valid_fb15k237, test_fb15k237], axis=0)
fb15k237_head_known = all_data_fb15k237["head"].isin(known_entities)
fb15k237_tail_known = all_data_fb15k237["tail"].isin(known_entities)
all_data_fb15k237 = all_data_fb15k237[fb15k237_head_known & fb15k237_tail_known]

# WN18 and WN18RR are only stacked; no entity filter is applied to them.
all_data_wn18 = pd.concat([train_wn18, valid_wn18, test_wn18], axis=0)
all_data_wn18rr = pd.concat([train_wn18rr, valid_wn18rr, test_wn18rr], axis=0)

In [None]:
def map_relation_to_text(relation):
    """Turn a relation identifier into the phrase ``"has <relation> of"``.

    Only the part after the last ``/`` is used, underscores become spaces,
    and surrounding or repeated whitespace is collapsed so that identifiers
    with a leading underscore or space (e.g. ``"_hypernym"``) do not produce
    a double space in the phrase.

    Args:
        relation: Relation identifier, e.g. ``"/film/film/genre"`` or
            ``"_hypernym"``.

    Returns:
        The verbalised relation, e.g. ``"has genre of"``.
    """
    last_segment = relation.split("/")[-1]
    # split() + join normalises every run of whitespace to a single space.
    words = last_segment.replace("_", " ").split()
    return "has " + " ".join(words) + " of"

In [None]:
from nltk.corpus import wordnet as wn


def _freebase_entity_to_text(entity_id):
    """Return the ``label`` stored in ``entity2wikidata`` for an entity id."""
    return entity2wikidata[entity_id]["label"]


def _wordnet_entity_to_text(synset_name):
    """Return the first lemma name of a synset, with underscores as spaces."""
    return wn.synset(synset_name).lemmas()[0].name().replace("_", " ")


def _wordnet_relation_to_text(relation):
    """Verbalise a WordNet relation identifier.

    Leading/trailing underscores are stripped first so that identifiers such
    as ``"_hypernym"`` do not leave a stray space inside the phrase.
    """
    return map_relation_to_text(relation.strip("_"))


def _add_text_columns(frame, entity_to_text, relation_to_text):
    """Add ``head_text``, ``relation_text``, ``tail_text`` and ``text`` in place.

    ``text`` is the sentence ``"<head_text> <relation_text> <tail_text>."``.
    """
    frame["head_text"] = frame["head"].apply(entity_to_text)
    frame["relation_text"] = frame["relation"].apply(relation_to_text)
    frame["tail_text"] = frame["tail"].apply(entity_to_text)
    frame["text"] = (
        frame["head_text"]
        + " "
        + frame["relation_text"]
        + " "
        + frame["tail_text"]
        + "."
    )


# FB15k
_add_text_columns(all_data_fb15k, _freebase_entity_to_text, map_relation_to_text)

# FB15k-237
_add_text_columns(all_data_fb15k237, _freebase_entity_to_text, map_relation_to_text)

# WN18
# The sentence column is built here as well: without it the WordNet rows
# carry a missing `text` once the datasets are concatenated, and no
# demonstration can be generated from them.
_add_text_columns(all_data_wn18, _wordnet_entity_to_text, _wordnet_relation_to_text)

# WN18RR
_add_text_columns(all_data_wn18rr, _wordnet_entity_to_text, _wordnet_relation_to_text)

In [None]:
# Stack the four verbalised datasets and keep only the text columns.
all_data = pd.concat(
    [all_data_fb15k, all_data_fb15k237, all_data_wn18, all_data_wn18rr], axis=0
)
all_data = all_data[["head_text", "relation_text", "tail_text", "text"]]

# Drop repeated verbalised triples, then rebuild the index: `pd.concat` keeps
# the index labels of its inputs, so the same label can occur in several
# rows. `id` must identify exactly one row because it is used to exclude a
# row from its own demonstrations.
all_data = all_data.drop_duplicates().reset_index(drop=True)

all_data["id"] = all_data.index

In [None]:
import tqdm
from tqdm.auto import tqdm

tqdm.pandas()


def train_demonstration_generator(row, data=None, n_demonstrations=2, random_state=42):
    """Build a few-shot prompt for one triple.

    The prompt is ``n_demonstrations`` complete sentences sampled from other
    rows that share the same ``relation_text``, followed by the incomplete
    sentence ``"<head_text> <relation_text> "`` for ``row``, all joined by a
    single space.

    Args:
        row: Record exposing ``head_text``, ``relation_text`` and ``id``
            attributes (e.g. a row produced by ``DataFrame.apply(axis=1)``).
        data: Frame with ``relation_text``, ``id`` and ``text`` columns to
            draw demonstrations from. Defaults to the global ``all_data``.
        n_demonstrations: Number of example sentences placed before the
            sentence to complete.
        random_state: Seed passed to ``DataFrame.sample``.
            NOTE(review): with a fixed seed, rows sharing a relation will
            mostly receive the same demonstrations — confirm this is
            intended.

    Returns:
        The prompt string, or ``""`` when fewer than ``n_demonstrations``
        other rows with the same relation and a non-missing ``text`` exist.
    """
    if data is None:
        data = all_data

    to_fill = row.head_text + " " + row.relation_text + " "

    # Candidates: same relation, not the row itself, and with an actual
    # sentence (a missing `text` cannot be joined into the prompt).
    candidates = data[
        (data["relation_text"] == row.relation_text)
        & (data["id"] != row.id)
        & data["text"].notna()
    ]
    if len(candidates) < n_demonstrations:
        return ""

    demonstrations = candidates.sample(n_demonstrations, random_state=random_state)[
        "text"
    ].to_list()
    return " ".join(demonstrations + [to_fill])

In [None]:
%%time

import swifter

# Generate one prompt per row (row-wise apply through swifter) and store it
# in the `demonstration_input` column.
demonstration_inputs = all_data.swifter.apply(train_demonstration_generator, axis=1)
all_data["demonstration_input"] = demonstration_inputs

In [None]:
import os

# Write the processed table as CSV (without the index column), creating the
# target directory first if it does not exist yet.
output_dir = "data/data_processed/FB15k_FB15k237_WN18_WN18RR/"
os.makedirs(output_dir, exist_ok=True)

output_file = os.path.join(output_dir, "processed_data.csv")
all_data.to_csv(output_file, index=False)