In [1]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from rdflib import Graph, URIRef, RDF
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [18]:
# --- Configuration -----------------------------------------------------------
# All paths are resolved relative to the directory the kernel was started in.
PROJECT_ROOT = os.getcwd()
DATA_DIR = os.path.join(PROJECT_ROOT, "data")

# Input N-Triples files and the output file written by write_result_ttl().
TRAIN_FILE = os.path.join(DATA_DIR, "KG-2022-train.nt")
TEST_FILE  = os.path.join(DATA_DIR, "KG-2022-test.nt")
RESULT_FILE = os.path.join(PROJECT_ROOT, "result_ml.ttl")

# Predicate carrying the truth value of a fact. The string form is the single
# source of truth; the URIRef used for graph lookups is derived from it so the
# two can never drift apart.
TRUTH_PRED_STR = "http://swc2017.aksw.org/hasTruthValue"
TRUTH_PRED_URI = URIRef(TRUTH_PRED_STR)
# Datatype IRI attached to every score literal in the result file.
XSD_DOUBLE_URI = "http://www.w3.org/2001/XMLSchema#double"

In [19]:
def load_fact_dataset(path: str, with_labels: bool = True) -> pd.DataFrame:
    """
    Load a reified fact dataset from an N-Triples file.

    Each fact is an rdf:Statement with:
      - rdf:subject
      - rdf:predicate
      - rdf:object
      - optionally: swc2017:hasTruthValue (float label in [0, 1])

    Statements missing any of subject/predicate/object are skipped. When
    with_labels=True, statements without a truth value are skipped as well.

    Returns a DataFrame with columns (always present, even if no fact was
    loaded):
      - fact_uri
      - subject
      - predicate
      - object
      - truth (float, only if with_labels=True)

    Raises ValueError if a truth literal cannot be converted with float().
    """
    print(f"[INFO] Loading dataset from {path}")
    g = Graph()
    g.parse(path, format="nt")
    print(f"[INFO] Parsed {len(g)} RDF triples")

    # Fix the schema up front so an empty result still exposes the documented
    # columns instead of a column-less frame that breaks later lookups.
    columns = ["fact_uri", "subject", "predicate", "object"]
    if with_labels:
        columns.append("truth")

    rows = []

    for fact in g.subjects(RDF.type, RDF.Statement):
        s = g.value(fact, RDF.subject)
        p = g.value(fact, RDF.predicate)
        o = g.value(fact, RDF.object)

        # incomplete reification: nothing to score
        if s is None or p is None or o is None:
            continue

        row = {
            "fact_uri": str(fact),
            "subject": str(s),
            "predicate": str(p),
            "object": str(o),
        }

        if with_labels:
            truth_literal = g.value(fact, TRUTH_PRED_URI)
            if truth_literal is None:
                # training instances without labels are skipped
                continue
            row["truth"] = float(truth_literal)

        rows.append(row)

    df = pd.DataFrame(rows, columns=columns)
    if with_labels:
        # An empty frame would otherwise carry an object-typed 'truth' column.
        df["truth"] = df["truth"].astype(float)
    print(f"[INFO] Facts loaded: {df.shape[0]} rows")
    return df

In [20]:
def compute_stats(train: pd.DataFrame) -> dict:
    """
    Compute frequency and count statistics from the training data.

    `train` must provide the columns 'subject', 'predicate', 'object' and
    'truth'. The frame is not modified.

    Returns a dict with:
      - overall_prior                 mean of the 'truth' column
      - freq_predicate, cnt_predicate keyed by (predicate,)
      - freq_sp,        cnt_sp        keyed by (subject, predicate)
      - freq_po,        cnt_po        keyed by (predicate, object)
      - deg_subject,    deg_object    keyed by the plain subject / object value

    freq_* are Laplace-smoothed truth rates: (sum of truth + 1) / (count + 2).
    cnt_* are defaultdict(float) row counts, deg_* are defaultdict(int) counts.

    Note: every statistic includes the label of each training row itself, so
    features derived for those same rows already "see" their own label.
    """
    stats = {}

    # Coerce the labels once; astype returns a new Series, so the caller's
    # frame stays untouched without copying the whole DataFrame.
    truth = train["truth"].astype(float)
    labels = truth.tolist()

    stats["overall_prior"] = truth.mean()

    def freq_and_count(keys):
        """Return (smoothed truth rate, row count) per tuple of `keys` values."""
        pos = defaultdict(float)
        tot = defaultdict(float)
        # Iterate the columns directly instead of DataFrame.iterrows(), which
        # builds a Series per row; keys and accumulation order are unchanged.
        key_columns = [train[name].tolist() for name in keys]
        for key, label in zip(zip(*key_columns), labels):
            pos[key] += label
            tot[key] += 1.0
        # Laplace smoothing: (pos + 1) / (tot + 2)
        freq = {key: (pos[key] + 1.0) / (tot[key] + 2.0) for key in tot}
        return freq, tot

    # per predicate
    freq_predicate, cnt_predicate = freq_and_count(["predicate"])
    # per (subject, predicate)
    freq_sp, cnt_sp = freq_and_count(["subject", "predicate"])
    # per (predicate, object)
    freq_po, cnt_po = freq_and_count(["predicate", "object"])

    stats["freq_predicate"] = freq_predicate
    stats["cnt_predicate"]  = cnt_predicate
    stats["freq_sp"]        = freq_sp
    stats["cnt_sp"]         = cnt_sp
    stats["freq_po"]        = freq_po
    stats["cnt_po"]         = cnt_po

    # degrees of subjects and objects (how often they appear)
    deg_subject = defaultdict(int)
    for subject in train["subject"].tolist():
        deg_subject[subject] += 1

    deg_object = defaultdict(int)
    for obj in train["object"].tolist():
        deg_object[obj] += 1

    stats["deg_subject"] = deg_subject
    stats["deg_object"]  = deg_object

    print(f"[INFO] Stats computed. Overall prior truth: {stats['overall_prior']:.4f}")
    return stats


def _get_or_default(d, key, default):
    """
    Look up `key` in `d` and return the value as a float; return
    float(default) when the key is absent.

    Membership is tested with `in` before indexing, so a defaultdict is never
    asked for a missing key and therefore never grows as a side effect.
    """
    if key not in d:
        return float(default)
    return float(d[key])


def features_for_row(row: pd.Series, stats: dict) -> list[float]:
    """
    Build the 8-element feature vector for one fact.

    `row` must provide 'predicate', 'subject' and 'object'; `stats` is the
    dict of lookup tables read below (freq_*, cnt_*, deg_*, overall_prior).

    Layout of the returned list:
      0 truth rate of the predicate      (falls back to the overall prior)
      1 truth rate of (subject, predicate)   (falls back to feature 0)
      2 truth rate of (predicate, object)    (falls back to feature 0)
      3 row count of the predicate           (0 if unseen)
      4 row count of (subject, predicate)    (0 if unseen)
      5 row count of (predicate, object)     (0 if unseen)
      6 occurrences of the subject           (0 if unseen)
      7 occurrences of the object            (0 if unseen)
    """
    pred = row["predicate"]
    subj = row["subject"]
    obj = row["object"]

    # Lookup keys: the frequency/count tables are keyed by tuples.
    key_p = (pred,)
    key_sp = (subj, pred)
    key_po = (pred, obj)

    # Truth rates, backing off from the pair level to the predicate level
    # and from the predicate level to the global prior.
    rate_pred = _get_or_default(stats["freq_predicate"], key_p, stats["overall_prior"])
    rate_sp = _get_or_default(stats["freq_sp"], key_sp, rate_pred)
    rate_po = _get_or_default(stats["freq_po"], key_po, rate_pred)

    # Support behind each rate; unseen keys have zero support.
    support_pred = _get_or_default(stats["cnt_predicate"], key_p, 0.0)
    support_sp = _get_or_default(stats["cnt_sp"], key_sp, 0.0)
    support_po = _get_or_default(stats["cnt_po"], key_po, 0.0)

    # Entity degrees are keyed by the plain value, not by a tuple.
    subject_degree = _get_or_default(stats["deg_subject"], subj, 0.0)
    object_degree = _get_or_default(stats["deg_object"], obj, 0.0)

    return [
        rate_pred,
        rate_sp,
        rate_po,
        support_pred,
        support_sp,
        support_po,
        subject_degree,
        object_degree,
    ]


def make_feature_matrix(df: pd.DataFrame, stats: dict) -> np.ndarray:
    """
    Build the feature matrix X for all rows in df.

    Returns an array with one row per row of `df` (in `df` order) and one
    column per value returned by features_for_row. For an empty `df` a
    (0, 8) float array is returned instead of raising.
    """
    feature_rows = [features_for_row(row, stats) for _, row in df.iterrows()]

    if not feature_rows:
        # np.vstack raises ValueError on an empty sequence. Keep the column
        # count in sync with the 8 values features_for_row returns.
        n_features = 8
        return np.empty((0, n_features), dtype=float)

    return np.vstack(feature_rows)

In [21]:
def train_ml_model(train_df: pd.DataFrame, stats: dict):
    """
    Fit a standardize-then-logistic-regression pipeline on the engineered
    features of `train_df` and return the fitted sklearn Pipeline.

    The 'truth' column of `train_df` is used as the target. The ROC AUC that
    gets printed is computed on the very rows the model was fitted on, so it
    is an in-sample figure, not an estimate of held-out performance.
    """
    features = make_feature_matrix(train_df, stats)
    labels = train_df["truth"].astype(float).values

    print("[INFO] Training Logistic Regression model...")

    # Counts and degrees live on a very different scale than the rates,
    # hence the scaler in front of the classifier.
    scaling_step = ("scaler", StandardScaler())
    classifier_step = (
        "clf",
        LogisticRegression(penalty="l2", C=1.0, solver="lbfgs", max_iter=1000),
    )
    model = Pipeline(steps=[scaling_step, classifier_step])
    model.fit(features, labels)

    # Second predict_proba column is used as the score for the ranking metric.
    train_scores = model.predict_proba(features)[:, 1]
    train_auc = roc_auc_score(labels, train_scores)
    print(f"[INFO] Training ROC AUC (LogReg): {train_auc:.4f}")

    return model


def score_test_set(test_df: pd.DataFrame, stats: dict, model) -> pd.DataFrame:
    """
    Score every fact in `test_df` with the fitted `model`.

    Features are built from `stats` via make_feature_matrix; the score is the
    second column of model.predict_proba. Returns a new DataFrame holding
    only 'fact_uri' and 'score'; `test_df` itself is left unchanged.
    """
    features = make_feature_matrix(test_df, stats)
    scores = model.predict_proba(features)[:, 1]

    # assign() returns a fresh frame, so the input is never written to.
    return test_df[["fact_uri"]].assign(score=scores.astype(float))


In [22]:

def write_result_ttl(scored_df: pd.DataFrame, path: str):
    """
    Write the scores as a GERBIL-compatible TTL file, one triple per row:

      <Fact-URI> <http://swc2017.aksw.org/hasTruthValue> "value"^^<http://www.w3.org/2001/XMLSchema#double> .

    `scored_df` must provide 'fact_uri' and 'score'. Scores are written with
    six decimal places. An existing file at `path` is overwritten.
    """

    def _triple_lines():
        # Lazily yield one serialized triple per scored fact, in frame order.
        for _, record in scored_df.iterrows():
            uri = record["fact_uri"]
            score = float(record["score"])
            yield f'<{uri}> <{TRUTH_PRED_STR}> "{score:.6f}"^^<{XSD_DOUBLE_URI}> .\n'

    print(f"[INFO] Writing result file to {path}")
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(_triple_lines())
    print("[INFO] Finished writing result file.")

In [23]:
def main():
    """
    Run the whole pipeline: check inputs, load both datasets, derive the
    statistics from the training facts, fit the model, score the test facts
    and write the result file.

    Raises FileNotFoundError if the training or the test file is missing.
    """
    # Fail fast, training file first, before any parsing work starts.
    required_inputs = (
        (TRAIN_FILE, f"Training file not found: {TRAIN_FILE}"),
        (TEST_FILE, f"Test file not found: {TEST_FILE}"),
    )
    for input_path, missing_message in required_inputs:
        if not os.path.exists(input_path):
            raise FileNotFoundError(missing_message)

    # Labelled facts for fitting, unlabelled facts for scoring.
    train_facts = load_fact_dataset(TRAIN_FILE, with_labels=True)
    test_facts = load_fact_dataset(TEST_FILE, with_labels=False)

    # Lookup tables come from the training facts only and are reused
    # unchanged when the test facts are turned into features.
    stats = compute_stats(train_facts)
    model = train_ml_model(train_facts, stats)

    scored_test = score_test_set(test_facts, stats, model)
    print("[INFO] Example scores:")
    print(scored_test.head())

    # Persist the scores in the TTL layout produced by write_result_ttl.
    write_result_ttl(scored_test, RESULT_FILE)

In [24]:
# Run the full pipeline: load data, compute stats, train, score, write the TTL result.
main()

[INFO] Loading dataset from /Users/codex/Code/course-work/fokg/MiniProject-FoundationsofKnowledgeGraphs/data/KG-2022-train.nt
[INFO] Parsed 6170 RDF triples
[INFO] Facts loaded: 1234 rows
[INFO] Loading dataset from /Users/codex/Code/course-work/fokg/MiniProject-FoundationsofKnowledgeGraphs/data/KG-2022-test.nt
[INFO] Parsed 5368 RDF triples
[INFO] Facts loaded: 1342 rows
[INFO] Stats computed. Overall prior truth: 0.5470
[INFO] Training Logistic Regression model...
[INFO] Training ROC AUC (LogReg): 0.9736
[INFO] Example scores:
                                        fact_uri     score
0  http://swc2017.aksw.org/task2/dataset/3417193  0.962194
1  http://swc2017.aksw.org/task2/dataset/3812648  0.042155
2  http://swc2017.aksw.org/task2/dataset/3883848  0.997382
3  http://swc2017.aksw.org/task2/dataset/3613044  0.942811
4  http://swc2017.aksw.org/task2/dataset/3820276  0.998952
[INFO] Writing result file to /Users/codex/Code/course-work/fokg/MiniProject-FoundationsofKnowledgeGraphs/resul

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
