In [4]:
from rdflib import Graph, URIRef, RDF
import pandas as pd
import os

In [25]:
# All paths are derived from the current working directory of the kernel.
PROJECT_ROOT = os.getcwd()
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
TRAIN_PATH = os.path.join(DATA_DIR, "KG-2022-train.nt")
TEST_PATH = os.path.join(DATA_DIR, "KG-2022-test.nt")

# Create the data directory if it is missing; a no-op when it already exists.
os.makedirs(DATA_DIR, exist_ok=True)

# Report whether the two input files are in place before anything is parsed.
train_file_found = os.path.exists(TRAIN_PATH)
test_file_found = os.path.exists(TEST_PATH)
print("Train file exists:", train_file_found)
print("Test file exists:", test_file_found)

Train file exists: True
Test file exists: True


In [17]:
# Predicate (swc2017:hasTruthValue) whose object is the truth label of a
# reified fact; looked up per fact when loading a labelled dataset.
TRUTH_PRED_URI = URIRef("http://swc2017.aksw.org/hasTruthValue")

def load_fact_dataset(path: str, with_labels: bool = True) -> pd.DataFrame:
    """
    Load a reified fact dataset from an N-Triples file.

    Each fact is an rdf:Statement with:
      - rdf:subject
      - rdf:predicate
      - rdf:object
      - optionally: swc2017:hasTruthValue (label converted with float())

    Facts missing any of subject/predicate/object are skipped. When
    ``with_labels`` is True, facts without a truth value are skipped too.

    Parameters
    ----------
    path : str
        Location of the N-Triples file to parse.
    with_labels : bool, default True
        Whether to read the truth value of each fact into a ``truth`` column.

    Returns
    -------
    pd.DataFrame
        One row per retained fact with columns:
          - fact_uri
          - subject
          - predicate
          - object
          - truth (float, only if with_labels=True)
        The columns are always present, even when no fact is retained.

    Notes
    -----
    Errors raised while parsing the file or while converting a truth
    value with ``float()`` are not caught here.
    """
    # Fix the schema up front so that an empty result still exposes the
    # documented columns instead of a column-less DataFrame.
    columns = ["fact_uri", "subject", "predicate", "object"]
    if with_labels:
        columns.append("truth")

    g = Graph()
    g.parse(path, format="nt")
    print(f"Loaded {len(g)} triples from {path}")

    rows = []

    for fact in g.subjects(RDF.type, RDF.Statement):
        s = g.value(fact, RDF.subject)
        p = g.value(fact, RDF.predicate)
        o = g.value(fact, RDF.object)

        # Incomplete reifications cannot be scored; drop them.
        if s is None or p is None or o is None:
            continue

        row = {
            "fact_uri": str(fact),
            "subject": str(s),
            "predicate": str(p),
            "object": str(o),
        }

        if with_labels:
            truth_literal = g.value(fact, TRUTH_PRED_URI)
            # A labelled dataset is only useful for facts that carry a label.
            if truth_literal is None:
                continue
            row["truth"] = float(truth_literal)

        rows.append(row)

    df = pd.DataFrame(rows, columns=columns)
    if with_labels:
        # Keeps the label column numeric even when there are zero rows.
        df["truth"] = df["truth"].astype(float)

    print("Facts loaded:", df.shape)
    return df


In [18]:
# Parse both splits; truth labels are read for the training split only.
train_df = load_fact_dataset(TRAIN_PATH, with_labels=True)
test_df = load_fact_dataset(TEST_PATH, with_labels=False)

# Preview the first rows of each split.
display(train_df.head(), test_df.head())


Loaded 6170 triples from /Users/codex/Code/course-work/fokg/MiniProject-FoundationsofKnowledgeGraphs/data/KG-2022-train.nt
Facts loaded: (1234, 5)
Loaded 5368 triples from /Users/codex/Code/course-work/fokg/MiniProject-FoundationsofKnowledgeGraphs/data/KG-2022-test.nt
Facts loaded: (1342, 4)


Unnamed: 0,fact_uri,subject,predicate,object,truth
0,http://swc2017.aksw.org/task2/dataset/3226691,http://dbpedia.org/resource/David_Lee_(basketb...,http://dbpedia.org/ontology/team,http://dbpedia.org/resource/Houston_Rockets,0.0
1,http://swc2017.aksw.org/task2/dataset/3320759,http://dbpedia.org/resource/Nenad_Zimonjić,http://dbpedia.org/ontology/award,http://dbpedia.org/resource/Belgrade,0.0
2,http://swc2017.aksw.org/task2/dataset/3642843,http://dbpedia.org/resource/Om_Shanti_Om,http://dbpedia.org/ontology/starring,http://dbpedia.org/resource/Uma_Thurman,0.0
3,http://swc2017.aksw.org/task2/dataset/3800661,http://dbpedia.org/resource/Walt_Whitman,http://dbpedia.org/ontology/deathPlace,"http://dbpedia.org/resource/Camden,_New_Jersey",1.0
4,http://swc2017.aksw.org/task2/dataset/3386366,http://dbpedia.org/resource/François_Jacob,http://dbpedia.org/ontology/award,http://dbpedia.org/resource/Nobel_Prize_in_Phy...,1.0


Unnamed: 0,fact_uri,subject,predicate,object
0,http://swc2017.aksw.org/task2/dataset/3417193,http://dbpedia.org/resource/When_Worlds_Collide,http://dbpedia.org/ontology/author,http://dbpedia.org/resource/Edwin_Balmer
1,http://swc2017.aksw.org/task2/dataset/3812648,http://dbpedia.org/resource/AOL,http://dbpedia.org/ontology/subsidiary,http://dbpedia.org/resource/Quigo
2,http://swc2017.aksw.org/task2/dataset/3883848,http://dbpedia.org/resource/Gary_Cooper,http://dbpedia.org/ontology/deathPlace,"http://dbpedia.org/resource/Waverly,_Minnesota"
3,http://swc2017.aksw.org/task2/dataset/3613044,http://dbpedia.org/resource/Love_Actually,http://dbpedia.org/ontology/starring,http://dbpedia.org/resource/Dennis_Quaid
4,http://swc2017.aksw.org/task2/dataset/3820276,http://dbpedia.org/resource/Christina_Aguilera,http://dbpedia.org/ontology/birthPlace,http://dbpedia.org/resource/Staten_Island


In [19]:
from collections import defaultdict

def learn_frequency_model(train: pd.DataFrame):
    """
    Estimate smoothed truth rates from labelled facts.

    The ``truth`` column is read as float. For every grouping below, the
    rate stored for a key is ``(sum of truth + 1) / (number of rows + 2)``.

    Returns a dict with:
      - overall_prior: mean of the truth column
      - p_predicate[(p,)]
      - p_sp[(s, p)]
      - p_po[(p, o)]

    Raises ValueError when ``train`` has no ``truth`` column. The input
    frame is not modified.
    """
    if "truth" not in train.columns:
        raise ValueError("Training DataFrame must contain a 'truth' column.")

    labels = train["truth"].astype(float)
    overall_prior = labels.mean()

    def smoothed_rates(key_columns):
        # Accumulate, per key tuple, the summed truth and the row count.
        positives = {}
        totals = {}
        key_tuples = zip(*(train[name] for name in key_columns))
        for key, label in zip(key_tuples, labels):
            positives[key] = positives.get(key, 0.0) + label
            totals[key] = totals.get(key, 0.0) + 1.0
        # Add-one smoothing keeps rarely seen keys away from 0 and 1.
        return {
            key: (positives[key] + 1.0) / (total + 2.0)
            for key, total in totals.items()
        }

    model = {
        "overall_prior": overall_prior,
        "p_predicate": smoothed_rates(["predicate"]),
        "p_sp": smoothed_rates(["subject", "predicate"]),
        "p_po": smoothed_rates(["predicate", "object"]),
    }
    print("Model trained. Overall prior truth:", overall_prior)
    return model

# Fit the frequency tables on the labelled training facts.
freq_model = learn_frequency_model(train_df)


Model trained. Overall prior truth: 0.5470016207455429


In [20]:
def score_fact(row, model) -> float:
    """
    Score one fact with the learned frequency tables.

    ``row`` must provide "predicate", "subject" and "object"; ``model`` is
    the dict produced by the frequency learner. The result is the largest
    rate found among the predicate, (subject, predicate) and
    (predicate, object) tables, or the overall prior when none of the
    three keys is known.
    """
    pred = row["predicate"]
    subj = row["subject"]
    obj = row["object"]

    # Table name paired with the key this fact would have in that table.
    lookups = (
        ("p_predicate", (pred,)),
        ("p_sp", (subj, pred)),
        ("p_po", (pred, obj)),
    )

    candidates = []
    for table_name, key in lookups:
        table = model[table_name]
        if key in table:
            candidates.append(table[key])

    if candidates:
        return float(max(candidates))

    # Nothing about this fact was seen in training.
    return float(model["overall_prior"])


def apply_model(df: pd.DataFrame, model) -> pd.DataFrame:
    """
    Return a copy of ``df`` with an added ``score`` column.

    Each row is passed to ``score_fact`` together with ``model``; the
    input frame itself is left untouched.
    """
    scored = df.copy()

    def _score_row(fact_row):
        return score_fact(fact_row, model)

    scored["score"] = scored.apply(_score_row, axis=1)
    return scored


In [22]:
from sklearn.metrics import roc_auc_score

# Score the same facts the model was fitted on: this is an in-sample
# sanity check, not a held-out estimate.
train_scored = apply_model(train_df, freq_model)
auc = roc_auc_score(
    y_true=train_scored["truth"],
    y_score=train_scored["score"],
)
print("Training ROC AUC (internal baseline):", auc)


Training ROC AUC (internal baseline): 0.8514596170410124


In [23]:
# Score the unlabelled test facts with the fitted frequency model and
# preview the first rows of the result.
test_scored = apply_model(test_df, freq_model)
display(test_scored.head())


Unnamed: 0,fact_uri,subject,predicate,object,score
0,http://swc2017.aksw.org/task2/dataset/3417193,http://dbpedia.org/resource/When_Worlds_Collide,http://dbpedia.org/ontology/author,http://dbpedia.org/resource/Edwin_Balmer,0.666667
1,http://swc2017.aksw.org/task2/dataset/3812648,http://dbpedia.org/resource/AOL,http://dbpedia.org/ontology/subsidiary,http://dbpedia.org/resource/Quigo,0.403846
2,http://swc2017.aksw.org/task2/dataset/3883848,http://dbpedia.org/resource/Gary_Cooper,http://dbpedia.org/ontology/deathPlace,"http://dbpedia.org/resource/Waverly,_Minnesota",0.659794
3,http://swc2017.aksw.org/task2/dataset/3613044,http://dbpedia.org/resource/Love_Actually,http://dbpedia.org/ontology/starring,http://dbpedia.org/resource/Dennis_Quaid,0.666667
4,http://swc2017.aksw.org/task2/dataset/3820276,http://dbpedia.org/resource/Christina_Aguilera,http://dbpedia.org/ontology/birthPlace,http://dbpedia.org/resource/Staten_Island,0.690217


In [24]:
# Datatype IRI and predicate IRI used in every emitted statement.
XSD_DOUBLE_URI = "http://www.w3.org/2001/XMLSchema#double"
TRUTH_PRED_STR = "http://swc2017.aksw.org/hasTruthValue"
RESULT_PATH = os.path.join(PROJECT_ROOT, "result.ttl")

# One statement per scored test fact:
#   <fact_uri> <hasTruthValue> "score"^^<xsd:double> .
# The score is written with six decimal places.
with open(RESULT_PATH, "w", encoding="utf-8") as result_file:
    for fact_uri, raw_score in zip(test_scored["fact_uri"], test_scored["score"]):
        val = float(raw_score)
        result_file.write(
            f"<{fact_uri}> "
            f"<{TRUTH_PRED_STR}> "
            f"\"{val:.6f}\"^^<{XSD_DOUBLE_URI}> .\n"
        )

print("Wrote GERBIL result file to:", RESULT_PATH)


Wrote GERBIL result file to: /Users/codex/Code/course-work/fokg/MiniProject-FoundationsofKnowledgeGraphs/result.ttl
