### Techniques to convert knowledge graph completion (KGC) data into plain text

We will follow the approach of [From Discrimination to Generation: Knowledge Graph Completion with Generative Transformer](https://arxiv.org/pdf/2202.02113.pdf).

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext jupyter_black

In [None]:
cd ..

In [None]:
# Read the FB15k-237 splits and the entity lookup from disk.
from src.utils import load_fb15k237, load_wn18rr, get_hist

PATH_FB15k237 = "data/datasets_knowledge_embedding/FB15k-237"

# The loader hands back four objects; fetch them first, then unpack them
# into the three splits and the entity -> metadata mapping.
fb15k237_parts = load_fb15k237(PATH_FB15k237)
train, valid, test, entity2wikidata = fb15k237_parts

In [None]:
# Keep only the triples whose head AND tail both have an entry in
# entity2wikidata (i.e. an entity description is available).
import pandas as pd

pd.options.mode.copy_on_write = True

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# pd.concat keeps each split's own index labels; if the loader returns
# default-indexed frames (not visible here -- confirm), those labels repeat
# across train/valid/test and the index-derived "id" column built further down
# would no longer identify a single row.
all_data_fb = pd.concat([train, valid, test], axis=0, ignore_index=True)

# One-column frame holding every entity id that has metadata.
df_entity = pd.DataFrame(entity2wikidata.keys(), columns=["head"])

# The same id list is used to validate both ends of the triple.
all_data_fb_filtered = all_data_fb[
    all_data_fb["head"].isin(df_entity["head"])
    & all_data_fb["tail"].isin(df_entity["head"])
]

In [None]:
def map_relation_to_text(relation):
    """Turn a "/"-separated relation path into a short "has ... of" phrase.

    Only the text after the last "/" is used, and its underscores are
    replaced by spaces. A string without "/" is used as a whole.
    """
    last_segment = relation.rpartition("/")[2]
    readable = last_segment.replace("_", " ")
    return f"has {readable} of"

In [None]:
# Verbalise every triple: entity ids become their labels, the relation path
# becomes a short phrase, and the three parts are joined into one sentence.
def _label_of(entity_id):
    # Label stored for this entity in the entity2wikidata mapping.
    return entity2wikidata[entity_id]["label"]


all_data_fb_filtered["head_text"] = all_data_fb_filtered["head"].apply(_label_of)
all_data_fb_filtered["relation_text"] = all_data_fb_filtered["relation"].apply(
    map_relation_to_text
)
all_data_fb_filtered["tail_text"] = all_data_fb_filtered["tail"].apply(_label_of)

# Sentence layout: "<head> <relation> <tail>."
head_and_relation = (
    all_data_fb_filtered["head_text"] + " " + all_data_fb_filtered["relation_text"]
)
all_data_fb_filtered["text"] = (
    head_and_relation + " " + all_data_fb_filtered["tail_text"] + "."
)

# Expose the current index labels as a regular column.
all_data_fb_filtered["id"] = all_data_fb_filtered.index

In [None]:
# Show the frame to inspect the text columns added above.
all_data_fb_filtered

In [None]:
cd ..

In [None]:
# Reload the processed triples from CSV.
# Original note: correction of "has" without space.
# NOTE(review): presumably this CSV was regenerated after a missing space
# after "has" was fixed -- confirm.
import pandas as pd

PATH_FB15k237 = "data/datasets_knowledge_embedding/FB15k-237"

# Both names end up bound to the same DataFrame object.
processed_data = all_data_fb_filtered = pd.read_csv(
    f"{PATH_FB15k237}/processed_data.csv"
)

In [None]:
# NOTE: the second import rebinds the name `tqdm` from the module to the
# object imported from `tqdm.auto`, so the plain `import tqdm` is shadowed.
import tqdm
from tqdm.auto import tqdm

# Called on the name imported from tqdm.auto (not on the module).
# NOTE(review): per tqdm's documentation this registers `progress_apply` on
# pandas objects; no cell visible here uses it -- confirm it is still needed.
tqdm.pandas()


def train_demonstration_generator(
    row, *, data=None, n_demonstrations=2, random_state=42
):
    """Build a few-shot prompt for ``row``: demonstrations plus an open query.

    Up to ``n_demonstrations`` sentences are sampled from the ``text`` column
    of rows that share ``row.relation`` but have a different ``id``. They are
    followed by ``"<head_text> <relation_text> "`` (note the trailing space),
    which leaves the tail to be filled in. All parts are joined with single
    spaces.

    Args:
        row: Object exposing ``id``, ``relation``, ``head_text`` and
            ``relation_text`` as attributes (e.g. a DataFrame row).
        data: Frame with ``id``, ``relation`` and ``text`` columns to draw
            demonstrations from. Defaults to the notebook-level
            ``all_data_fb_filtered``, looked up at call time.
        n_demonstrations: Maximum number of demonstration sentences.
        random_state: Seed passed to ``sample``; with a fixed seed the pick
            is deterministic for a given candidate set.

    Returns:
        The prompt string. If fewer than ``n_demonstrations`` other rows
        share the relation, all of them are used (possibly none) instead of
        raising ``ValueError`` as an oversized ``sample`` would.
    """
    # Alternative heuristic (not implemented): split the rows of each relation
    # into groups of 3 and use 2 as demonstrations and 1 as the row to fill.
    if data is None:
        data = all_data_fb_filtered

    same_relation = data["relation"] == row.relation
    not_this_row = data["id"] != row.id
    candidates = data.loc[same_relation & not_this_row, "text"]

    # Never ask for more rows than exist; sample() without replacement would
    # raise for rare relations.
    n_picked = min(n_demonstrations, len(candidates))
    if n_picked > 0:
        demonstrations = candidates.sample(
            n_picked, random_state=random_state
        ).to_list()
    else:
        demonstrations = []

    to_fill = row.head_text + " " + row.relation_text + " "
    return " ".join(demonstrations + [to_fill])



In [None]:
# NOTE(review): the import looks unused, but it is presumably what makes the
# `.swifter` accessor available on DataFrames -- keep it.
import swifter

# Row-wise (axis=1) generation of the demonstration prompt for every triple.
demonstration_inputs = all_data_fb_filtered.swifter.apply(
    train_demonstration_generator, axis=1
)
all_data_fb_filtered["demonstration_input"] = demonstration_inputs

In [None]:
# Persist the augmented frame; index=False leaves the row index out of the CSV.
processed_v2_csv = "data/datasets_knowledge_embedding/FB15k-237/processed_data_v2.csv"
all_data_fb_filtered.to_csv(processed_v2_csv, index=False)

In [None]:
# Read the saved file back and show its first 10 rows as a sanity check.
processed_v2_preview = pd.read_csv(
    "data/datasets_knowledge_embedding/FB15k-237/processed_data_v2.csv"
)
processed_v2_preview.head(10)