In [None]:
import requests
import re
import json
import time
from tqdm import tqdm

def extract_ids(triples):
    """
    Collect the distinct Wikidata identifiers (Q... / P...) used in the triples.

    Only string values stored under 'subject', 'predicate' or 'object' that
    start with "http://www.wikidata.org/" are inspected, and the identifier
    has to be the very end of the value. The result is a list built from a
    set, so its order is unspecified.
    """
    wikidata_prefix = "http://www.wikidata.org/"
    trailing_id = re.compile(r'(Q\d+|P\d+)$')

    # Every value found under one of the three triple keys (None when missing).
    candidates = (
        triple.get(field)
        for triple in triples
        for field in ('subject', 'predicate', 'object')
    )
    matches = (
        trailing_id.search(value)
        for value in candidates
        if isinstance(value, str) and value.startswith(wikidata_prefix)
    )
    return list({found.group(1) for found in matches if found})

def fetch_labels(ids):
    """
    Fetch English labels for a list of Wikidata IDs (Q or P) using the Wikidata API.

    Args:
        ids: Sequence of Wikidata identifiers such as "Q42" or "P31".

    Returns:
        Dict mapping each ID that has an English label to that label; IDs
        without an English label are left out.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If a request exceeds the timeout.
    """
    url = "https://www.wikidata.org/w/api.php"
    id_to_label = {}

    # Wikidata API allows up to 50 ids per request
    chunk_size = 50
    for i in tqdm(range(0, len(ids), chunk_size), desc="Fetching Wikidata entities"):
        chunk = ids[i:i+chunk_size]
        params = {
            "action": "wbgetentities",
            "ids": "|".join(chunk),
            "props": "labels",
            "languages": "en",
            "format": "json"
        }
        # Without a timeout a stalled connection would block the import forever.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()

        entities = data.get("entities", {})
        for entity_id, entity_data in entities.items():
            label = entity_data.get("labels", {}).get("en", {}).get("value")
            if label:
                id_to_label[entity_id] = label

        # Pause between requests, but there is nothing to wait for after the last chunk.
        if i + chunk_size < len(ids):
            time.sleep(1)

    return id_to_label

def label_to_uri_fragment(label):
    """
    Turn a label into a fragment made only of ASCII letters, digits and
    single underscores, with no underscore at either end.
    """
    # Every character outside [a-zA-Z0-9] becomes an underscore first ...
    underscored = re.sub(r'[^a-zA-Z0-9]', '_', label.strip())
    # ... then runs of underscores are squeezed and the edges trimmed.
    return re.sub(r'_+', '_', underscored).strip('_')

def replace_uri(uri, id_to_label):
    """
    Swap the trailing Q/P identifier of a Wikidata URI for a label-based fragment.

    The URI is returned unchanged when it is not a Wikidata URI, when its
    last path component has no entry in id_to_label, or when the label
    reduces to an empty fragment.
    """
    if not uri.startswith("http://www.wikidata.org/"):
        return uri

    # A trailing slash is ignored when locating the identifier.
    *leading, entity_id = uri.rstrip('/').split('/')
    if entity_id not in id_to_label:
        return uri

    fragment = label_to_uri_fragment(id_to_label[entity_id])
    if not fragment:
        return uri
    return "/".join([*leading, fragment])

def process_triples(triples, id_to_label):
    """
    Build new triples whose Wikidata identifiers are replaced by label fragments.

    Subjects and predicates are flattened to "http://www.wikidata.org/<name>".
    Objects that point at a statement node are resolved through
    resolve_statement_object (after a 5 second pause per lookup); all other
    objects are flattened the same way and then relabelled.
    """
    base = "http://www.wikidata.org/"
    statement_marker = "http://www.wikidata.org/entity/statement/"

    def flattened(uri):
        # Relabel first, then keep only the last path component under the base.
        return base + replace_uri(uri, id_to_label).split("/")[-1]

    converted = []
    for triple in tqdm(triples, desc="Processing Triples"):
        subj = flattened(triple['subject'])
        pred = flattened(triple['predicate'])

        raw_obj = triple['object']
        if statement_marker in raw_obj:
            time.sleep(5)
            obj = resolve_statement_object(triple['subject'], triple['predicate'], raw_obj)
        else:
            obj = base + raw_obj.split("/")[-1]

        if isinstance(obj, str) and obj.startswith(base):
            obj = replace_uri(obj, id_to_label)

        converted.append({'subject': subj, 'predicate': pred, 'object': obj})
    return converted


def resolve_statement_object(subject, predicate, obj):
    """
    Look up the value behind a Wikidata statement URI with a SPARQL query.

    Returns "http://www.wikidata.org/" followed by the English label of the
    first value found, or followed by the last path component of obj when
    the query yields nothing.
    """
    # Only the trailing identifiers are used inside the query.
    subject_id = subject.split("/")[-1]
    predicate_id = predicate.split("/")[-1]

    client = SPARQLWrapper(WIKIDATA_SPARQL)
    client.setQuery(f"""
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX ps: <http://www.wikidata.org/prop/statement/>
    PREFIX p: <http://www.wikidata.org/prop/>
    PREFIX wikibase: <http://wikiba.se/ontology#>

    SELECT ?propertyLabel ?valueLabel WHERE {{
      wd:{subject_id} p:{predicate_id} ?statement .
      FILTER(STR(?statement) = "{obj}")

      ?statement ps:{predicate_id} ?value .

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """)
    client.setReturnFormat(JSON)

    bindings = client.query().convert()["results"]["bindings"]
    if bindings:
        return "http://www.wikidata.org/" + bindings[0]["valueLabel"]["value"]
    return "http://www.wikidata.org/" + obj.split("/")[-1]

    
def safe_uri(text):
    """
    Strip a string down to something usable as a Turtle local name.

    Spaces and hyphens become underscores; quotes, round and square brackets
    are dropped; then "::" and commas are removed and trailing dots trimmed.
    """
    # All single-character substitutions happen in one pass.
    single_chars = str.maketrans({
        " ": "_",
        "-": "_",
        '"': None,
        "'": None,
        "(": None,
        ")": None,
        "[": None,
        "]": None,
    })
    cleaned = text.strip().translate(single_chars)
    # "::" is removed before commas, so a pair separated by a comma survives.
    cleaned = cleaned.replace("::", "")
    cleaned = cleaned.replace(",", "")
    return cleaned.rstrip(".")

# -------- Example usage --------

def import_from_wikidata():
    """
    Convert the triples in 'players_full.json' into a Turtle file.

    Reads the JSON triples, fetches English labels for every Wikidata ID they
    mention, rewrites the URIs with label-based names and writes the result
    to 'graphdb_import.ttl' using the ns1: prefix.
    """
    def to_ns1(uri):
        # safe_uri already removes hyphens, brackets and trailing dots, so the
        # last path component can be used as the local name directly.
        return "ns1:" + safe_uri(uri).split("/")[-1]

    with open('players_full.json', encoding='utf-8') as f:
        data = json.load(f)

    print("Extracting unique IDs from triples...")
    ids = extract_ids(data)

    print("Fetching labels from Wikidata API...")
    # process_triples needs this mapping; without the call the name is undefined.
    id_to_label = fetch_labels(ids)

    print("Replacing URIs with label-based URIs...")
    new_triples = process_triples(data, id_to_label)

    ttl_lines = [
        "@prefix ns1: <http://example.org/> .",
        "@prefix foaf: <http://xmlns.com/foaf/0.1/> .",
        "",
    ]
    for triple in new_triples:
        s = to_ns1(triple['subject'])
        p = to_ns1(triple['predicate'])
        o = to_ns1(triple['object'])
        ttl_lines.append(f"{s} {p} {o} .")

    with open("graphdb_import.ttl", "w", encoding="utf-8") as f:
        f.write("\n".join(ttl_lines))

    print("✅ Converted to Turtle and saved as 'graphdb_import.ttl'")
    
# SPARQL endpoint used by resolve_statement_object.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
# Base URI of Wikidata entities.
ENTITY_BASE = "http://www.wikidata.org/entity/"
# Captures the part of a statement URI that follows "statement/" up to the next slash.
# NOTE(review): the dots in the pattern are unescaped, so each matches any character.
STATEMENT_PATTERN = re.compile(r"http://www.wikidata.org/entity/statement/([^/]+)")
# Runs the whole import as soon as this cell is executed.
import_from_wikidata()

In [1]:
from llama_cpp import Llama
from huggingface_hub import login
# TODO: SPECIFY TOKEN FOR USABILITY
# Access token handed to huggingface_hub.login; it is left empty in the source.
token=""
login(token)

In [2]:
import requests
from newspaper import Article

def get_articles(topic, limit):
    """
    Fetch news articles about a topic and return their extracted text.

    Args:
        topic: Search term sent to thenewsapi.com.
        limit: Maximum number of results requested from the API.

    Returns:
        List of article texts; articles that cannot be downloaded or parsed
        are skipped after printing the error.

    Raises:
        requests.HTTPError: If the news API answers with an error status.
    """
    import os

    # NOTE(security): this API token is committed in the source. Set the
    # THENEWSAPI_TOKEN environment variable to override it, and rotate the
    # literal before sharing the notebook.
    api_token = os.environ.get(
        "THENEWSAPI_TOKEN", "XY1kog3ePBk7Kh1uXB2m3hfaox7kHwAxCOw4fATy"
    )

    # Passing params lets requests URL-encode the topic (spaces, '&', ...).
    response = requests.get(
        "https://api.thenewsapi.com/v1/news/top",
        params={
            "api_token": api_token,
            "search": topic,
            "language": "en",
            "limit": limit,
        },
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    allArts = []
    # A payload without "data" yields no articles instead of a KeyError.
    for art in data.get("data", []):
        article = Article(art["url"])

        try:
            article.download()
            article.parse()
        except requests.exceptions.HTTPError as err:
            print(f"HTTP Error: {err}")
            continue
        except Exception as e:
            # Best effort: one broken article must not abort the whole batch.
            print(f"An error occurred: {e}")
            continue

        allArts.append(article.text)

    return allArts


In [3]:
def get_entities(article, labels, model):
    """
    Run the model's entity prediction on the text and keep confident hits.

    The model is queried with threshold=0.5; only entities whose "score" is
    strictly greater than 0.7 are returned.
    """
    candidates = model.predict_entities(article, labels, threshold=0.5)
    return list(filter(lambda found: found["score"] > 0.7, candidates))

In [21]:
def ensure_full_name(entities):
    """
    Ask the 'mistral' model for the full name of every "person" entity.

    Each person entity's 'text' is overwritten in place with the raw model
    reply; entities with any other label are left alone. The same list
    object is returned.
    """
    for entity in entities:
        if entity['label'] != "person":
            continue

        prompt = f"""I have this person as entity: {entity['text']}, 
        give me their full name without title but having first and last name. 
        Just return the name without anything else!
        If you cannot find it just return the input! DO NOT Tell me things like 'Unidentified', rather just return the input"""

        reply = ollama.chat(model='mistral', messages=[
          {'role': 'user', 'content': prompt}
        ])
        entity['text'] = reply["message"]["content"]

    return entities

In [27]:
def get_rdf_triples(article, entities):
    """
    Ask the LLM for relations between each person entity and later entities.

    Returns the string "No list" when there are no entities; otherwise the
    concatenated lines returned by prompt_llm for every "person" entity.
    """
    if not entities:
        return "No list"

    collected = []
    for position, candidate in enumerate(entities):
        if candidate['label'] != "person":
            continue

        head = candidate["text"]
        # Only entities listed after the current one are offered as possible tails.
        others = ", ".join(later["text"] for later in entities[position + 1:])

        prompt = "".join([
            f"Does {head} have a relation in the TEXT below with any of {others}? \n",
            "If yes please extract the RDF-Triple(s) that defines this relationship(s) based on the TEXT only. \n",
            "Do not INFER anything, if you are not sure, DO NOT create a triple. \n",
            f"Just return the triples, one per line as (head, relation, tail), the head MUST BE {head}!!!!\n",
            f"TEXT: {article}\n",
        ])

        print(len(entities))
        print("PROMPTING")
        collected.extend(prompt_llm(prompt))

    return collected

In [6]:
import ollama

def prompt_llm(prompt):
    """
    Send the prompt to the 'mistral' model and return its reply line by line.

    For every line only the text after the first ". " is kept (which drops a
    leading "1. "-style numbering); a line without ". " becomes an empty string.
    """
    reply = ollama.chat(model='mistral', messages=[
      {'role': 'user', 'content': prompt}
    ])

    remainders = []
    for line in reply["message"]["content"].split("\n"):
        _, _, remainder = line.partition(". ")
        remainders.append(remainder)
    return remainders

In [111]:
from SPARQLWrapper import SPARQLWrapper, POST, URLENCODED
from rdflib import Graph, URIRef, Namespace
from pyoxigraph import Store, NamedNode, Triple

def upload_to_graph(listOfResponses):
    """
    Turn "(head, relation, tail)" lines into RDF-star Turtle and post them.

    Lines that do not split into exactly three comma-separated parts are
    skipped. Each part is sanitised and a triple with certainty 0.5 is added
    whenever check_if_exists returns a truthy value. The Turtle text is then
    POSTed to the local repository and a status message string is returned.
    """
    EX = Namespace("http://example.org/")
    g = RDFStarGraph()

    # Characters deleted from every term after the outer trimming.
    unwanted = str.maketrans("", "", "';()\".[]*@£/’€$?\\")

    for triple in listOfResponses:
        if len(triple.split(",")) != 3:
            continue

        raw_terms = triple.strip("()").strip().replace(' ', '_').split(",")
        subject, predicate, obj = (
            term.strip(' _"').translate(unwanted) for term in raw_terms
        )
        if check_if_exists(subject, predicate, obj):
            g.add(URIRef(EX[subject]), URIRef(EX[predicate]), URIRef(EX[obj]), (URIRef(EX["certainty"]), 0.5))

    url = "http://localhost:7200/repositories/test/statements"

    try:
        ttlOutput = g.serialize(format="turtle")
    except Warning as w:
        print(f"Warning caught: {w}")
        return "Invalid RETURN"

    print(ttlOutput)

    response = requests.post(
        url,
        headers={"Content-Type": "application/x-turtle; charset=utf-8"},
        data=ttlOutput,
    )

    if response.status_code != 204:
        return f"Failed with status code {response.status_code} because of {response.text}"
    return "Upload successful."

In [8]:
from SPARQLWrapper import SPARQLWrapper, JSON, POST

def check_if_exists(subject, predicate, obj):
    """
    Check whether ex:subject ex:predicate ex:obj is stored in the repository.

    Returns the string "NOT YEY" when the triple is not found; otherwise the
    result of update_entry for the same triple.
    """
    ENDPOINT = "http://localhost:7200/repositories/test"

    client = SPARQLWrapper(ENDPOINT)
    client.setReturnFormat(JSON)

    query = f"""PREFIX ex:<http://example.org/>
        SELECT ?subject ?predicate ?object
        WHERE {{
          ?subject ?predicate ?object .
          FILTER (
            ?subject = ex:{subject} &&
            ?predicate = ex:{predicate} &&
            ?object = ex:{obj}
          )
        }}
    """

    print(query)
    client.setQuery(query)

    bindings = client.queryAndConvert()["results"]["bindings"]
    if len(bindings) == 0:
        return "NOT YEY"
    return update_entry(subject, predicate, obj)

def update_entry(subject, predicate, obj):
    """
    Raise the stored certainty of an existing RDF-star annotated triple.

    Returns True when the quoted triple has no annotation at all (nothing is
    changed). Otherwise the first annotation value v is read, ex:certainty is
    replaced by v + (1 - v) * 0.1, and False is returned.
    """
    reader = SPARQLWrapper("http://localhost:7200/repositories/test")
    reader.setReturnFormat(JSON)
    reader.setQuery(f"""
        PREFIX ex: <http://example.org/>
    SELECT ?property ?value
    WHERE {{
    <<
    ex:{subject}
    ex:{predicate}
    ex:{obj}
    >> 
        ?property ?value .
    }}
    """)

    annotations = reader.queryAndConvert()["results"]["bindings"]
    if len(annotations) == 0:
        return True

    value = float(annotations[0]["value"]["value"])
    # Move the certainty 10% of the remaining distance towards 1.
    raised = value+(1-value)*0.1

    # Built once so the executed and the printed statement cannot drift apart.
    update = f"""PREFIX ex:<http://example.org/>
    DELETE {{
        << ex:{subject} ex:{predicate} ex:{obj} >> ex:certainty ?cert
    }}
    INSERT {{
        << ex:{subject} ex:{predicate} ex:{obj} >> ex:certainty {raised}
    }}
    WHERE {{
        << ex:{subject} ex:{predicate} ex:{obj} >> ex:certainty ?cert .
    }}
    """

    writer = SPARQLWrapper("http://localhost:7200/repositories/test/statements")
    writer.setMethod(POST)
    writer.setQuery(update)

    print(update)
    writer.query()

    return False

In [9]:
class RDFStarGraph:
    """Minimal in-memory collector of Turtle / RDF-star statement strings."""

    def __init__(self):
        # Statements in insertion order, each already formatted as text.
        self.triples = []

    def add(self, subject, predicate, obj, metadata=None):
        """
        Record a plain triple and, when metadata is given, an annotation on it.

        metadata is indexed as (property, value) and is attached to the quoted
        form << subject predicate obj >>.
        """
        self.triples.append(f"{subject} {predicate} {obj}.")
        if not metadata:
            return
        annotation_property, annotation_value = metadata[0], metadata[1]
        self.triples.append(
            f"<< {subject} {predicate} {obj} >> {annotation_property} {annotation_value}."
        )

    def serialize(self, format="turtle"):
        """
        Return the collected statements as Turtle text with an ex: prefix.

        Every occurrence of "http://example.org/" is abbreviated to "ex:".
        Any format other than "turtle" (case-insensitive) raises ValueError.
        """
        if format.lower() != "turtle":
            raise ValueError("Unsupported format. Use 'turtle'.")

        header = "@prefix ex: <http://example.org/> .\n\n"
        body = "\n".join(
            statement.replace("http://example.org/", "ex:")
            for statement in self.triples
        )
        return header + body

In [None]:
from gliner import GLiNER

# Entity types the GLiNER model is asked to recognise in each paragraph.
labels = [
    "person",
    "company",
    "country",
    "city",
    "date",
    "event",
    "job",
    "product",
    "quote",
    "topic",
    "organization",
    "law",
    "crime",
    "conflict",
    "scientist"
]

model = GLiNER.from_pretrained("urchade/gliner_large-v2.1", max_length=3000)

allFoundArticles = get_articles("Kevin De Bruyne", 15)

for article in allFoundArticles:
    # Articles are processed paragraph by paragraph (split on blank lines).
    for paragraph in [p.strip() for p in article.strip().split('\n\n') if p.strip()]:
        print("--------------ARTICLE----------------")
        print(paragraph)
        entities = get_entities(paragraph, labels, model)
        # Rebind explicitly so the normalised names are what gets printed and
        # passed on, independent of whether the helper mutates in place.
        entities = ensure_full_name(entities)
        print("--------------ENTITIES----------------")
        print(entities)
        triples = get_rdf_triples(paragraph, entities)
        print(triples)
        # get_rdf_triples signals "no entities" with this sentinel string.
        if triples == "No list":
            continue
        print("--------------TRIPLES----------------")
        print(triples)
        print(upload_to_graph(triples))


In [93]:
import ast

# Ask the LLM which stored Florian_Wirtz -> Bayer_Leverkusen triples look
# wrong, then halve the certainty of the time-sensitive ones and remove the
# certainty annotation of the others.
ENDPOINT = "http://localhost:7200/repositories/test"
STATEMENTS_ENDPOINT = ENDPOINT + "/statements"

sparql = SPARQLWrapper(ENDPOINT)
sparql.setReturnFormat(JSON)

query = f"""PREFIX ex:<http://example.org/>
    SELECT ?subject ?predicate ?object
    WHERE {{
      ?subject ?predicate ?object .
      FILTER (
        ?subject = ex:Florian_Wirtz &&
        ?object = ex:Bayer_Leverkusen
      )
    }}
"""

print(query)
sparql.setQuery(query)

results = sparql.queryAndConvert()
if len(results["results"]["bindings"]) > 0:
    print(results["results"]["bindings"])

    # Reduce every binding to a (subject, predicate, object) tuple of local names.
    triples = []
    for res in results["results"]["bindings"]:
        triple = (res["subject"]["value"].split("/")[-1],
                  res["predicate"]["value"].split("/")[-1],
                  res["object"]["value"].split("/")[-1])
        triples.append(triple)

    print(triples)

    prompt = "I have several Triples from a Knowledge Graph, which ones are contradicting or simply wrong?\n"
    prompt += "If two or more are contradicting return the ones that are more likely wrong! \n"
    prompt += "\n".join(str(triple) for triple in triples)
    prompt += "\nReturn JUST the triples, one per line, NO OTHER TEXT!"

    print(prompt)

    response = ollama.chat(model='mistral', messages=[
          {'role': 'user', 'content': prompt}
        ])

    # The reply is free text: keep only lines that parse as a literal and are
    # one of the triples actually read from the graph. This skips blank or
    # malformed lines and keeps invented strings out of the SPARQL updates.
    wrongTriples = []
    for line in response["message"]["content"].split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            candidate = ast.literal_eval(line)
        except (ValueError, SyntaxError, TypeError):
            print("Ignoring unparsable LLM line:", line)
            continue
        if candidate in triples:
            wrongTriples.append(candidate)
        else:
            print("Ignoring triple that is not in the query result:", line)

    timeSensitives = []
    if wrongTriples:
        prompt = "For each of the following triples, tell me if it is time-sensitive or not. Write 'yes' or 'no' and separate by comma. "
        prompt += "Do not return the input just yes or no, do not elaborate, I only want 'yes' or 'no' in each line\n"
        prompt += "\n".join(str(triple) for triple in wrongTriples)

        response = ollama.chat(model='mistral', messages=[
              {'role': 'user', 'content': prompt}
            ])
        # The prompt mentions both commas and lines, so accept either separator.
        answers = response["message"]["content"].replace("\n", ",").split(",")
        timeSensitives = [answer.strip(" .'\"").lower() for answer in answers]
        timeSensitives = [answer for answer in timeSensitives if answer]

    print(timeSensitives)

    if len(timeSensitives) != len(wrongTriples):
        print(f"Got {len(timeSensitives)} answers for {len(wrongTriples)} triples; unmatched triples are left untouched")

    # zip stops at the shorter list, so a short answer list cannot raise IndexError.
    for wrongTriple, timeSensitive in zip(wrongTriples, timeSensitives):
        subject, predicate, obj = wrongTriple

        if timeSensitive == "yes":
            print(wrongTriple, "was deemed wrong and time sensitive, confidence will be lowered")

            # Read the current annotation value; fall back to 0.5 when there is none.
            sparql = SPARQLWrapper(ENDPOINT)
            sparql.setReturnFormat(JSON)
            sparql.setQuery(f"""
                PREFIX ex: <http://example.org/>
            SELECT ?property ?value
            WHERE {{
            <<
            ex:{subject}
            ex:{predicate}
            ex:{obj}
            >> 
                ?property ?value .
            }}
            """)
            annotations = sparql.queryAndConvert()["results"]["bindings"]
            if len(annotations) > 0:
                value = float(annotations[0]["value"]["value"])
            else:
                value = 0.5

            # If time sensitive, only lower the confidence as the fact might change
            update = f"""PREFIX ex:<http://example.org/>
            DELETE {{
                << ex:{subject} ex:{predicate} ex:{obj} >> ex:certainty ?cert
            }}
            INSERT {{
                << ex:{subject} ex:{predicate} ex:{obj} >> ex:certainty {value*0.5}
            }}
            WHERE {{
                << ex:{subject} ex:{predicate} ex:{obj} >> ex:certainty ?cert .
            }}
            """

        elif timeSensitive == "no":
            print(wrongTriple, "was deemed wrong and time insensitive, will be deleted")
            # NOTE(review): this removes only the ex:certainty annotation; the
            # base triple itself stays in the graph. Confirm that is intended.
            update = f"""PREFIX ex:<http://example.org/>
            DELETE {{
                << ex:{subject} ex:{predicate} ex:{obj} >> ex:certainty ?cert
            }}
            WHERE {{
                << ex:{subject} ex:{predicate} ex:{obj} >> ex:certainty ?cert .
            }}
            """

        else:
            print(f"Unexpected time-sensitivity answer {timeSensitive!r} for {wrongTriple}; skipping")
            continue

        sparql = SPARQLWrapper(STATEMENTS_ENDPOINT)
        sparql.setMethod(POST)
        sparql.setQuery(update)
        sparql.query()

else: print("NOT YEY")

PREFIX ex:<http://example.org/>
    SELECT ?subject ?predicate ?object
    WHERE {
      ?subject ?predicate ?object .
      FILTER (
        ?subject = ex:Florian_Wirtz &&
        ?object = ex:Bayer_Leverkusen
      )
    }

[{'subject': {'type': 'uri', 'value': 'http://example.org/Florian_Wirtz'}, 'predicate': {'type': 'uri', 'value': 'http://example.org/is_associated_with'}, 'object': {'type': 'uri', 'value': 'http://example.org/Bayer_Leverkusen'}}, {'subject': {'type': 'uri', 'value': 'http://example.org/Florian_Wirtz'}, 'predicate': {'type': 'uri', 'value': 'http://example.org/belongs_to'}, 'object': {'type': 'uri', 'value': 'http://example.org/Bayer_Leverkusen'}}, {'subject': {'type': 'uri', 'value': 'http://example.org/Florian_Wirtz'}, 'predicate': {'type': 'uri', 'value': 'http://example.org/plays_for'}, 'object': {'type': 'uri', 'value': 'http://example.org/Bayer_Leverkusen'}}, {'subject': {'type': 'uri', 'value': 'http://example.org/Florian_Wirtz'}, 'predicate': {'type': 'uri

In [101]:
import ast

# Ask the LLM to group triples that carry exactly the same information.
ENDPOINT = "http://localhost:7200/repositories/test"

sparql = SPARQLWrapper(ENDPOINT)
sparql.setReturnFormat(JSON)

# Select every triple linking ex:Florian_Wirtz to ex:Bayer_Leverkusen.
query = f"""PREFIX ex:<http://example.org/>
    SELECT ?subject ?predicate ?object
    WHERE {{
      ?subject ?predicate ?object .
      FILTER (
        ?subject = ex:Florian_Wirtz &&
        ?object = ex:Bayer_Leverkusen
      )
    }}
"""

print(query)
sparql.setQuery(query)

# The query result is only used to decide whether there is anything to ask about.
results = sparql.queryAndConvert()
if len(results["results"]["bindings"]) > 0:
    prompt = "I have several Triples from a Knowledge Graph, are there some with exactly the same information?\n"
    prompt += "Be very very strict\n"
    # NOTE(review): `triples` is not assigned in this cell; it has to exist
    # already in the notebook session (presumably from the cell that builds it
    # out of the same query) - confirm the execution order.
    prompt += "\n".join(str(triple) for triple in triples)
    prompt += "\n Group the ones with the same information together, just the triples, NO OTHER TEXT!"
    
    print(prompt)
    
    response = ollama.chat(model='mistral', messages=[
          {'role': 'user', 'content': prompt}
        ])
    # Raw model reply; it is printed as-is and not parsed here.
    duplicateTriples = response["message"]["content"]
    print(duplicateTriples)

PREFIX ex:<http://example.org/>
    SELECT ?subject ?predicate ?object
    WHERE {
      ?subject ?predicate ?object .
      FILTER (
        ?subject = ex:Florian_Wirtz &&
        ?object = ex:Bayer_Leverkusen
      )
    }

I have several Triples from a Knowledge Graph, are there some with exactly the same information?
Be very very strict
('Florian_Wirtz', 'is_associated_with', 'Bayer_Leverkusen')
('Florian_Wirtz', 'belongs_to', 'Bayer_Leverkusen')
('Florian_Wirtz', 'plays_for', 'Bayer_Leverkusen')
('Florian_Wirtz', 'playsFor', 'Bayer_Leverkusen')
('Florian_Wirtz', 'has_previous_club', 'Bayer_Leverkusen')
('Florian_Wirtz', 'belongsTo', 'Bayer_Leverkusen')
('Florian_Wirtz', 'currentlyPlaysFor', 'Bayer_Leverkusen')
('Florian_Wirtz', 'memberOf', 'Bayer_Leverkusen')
Return JUST the pairs of the duplicates, TWO TRIPLES per line, NO OTHER TEXT!
 ('Florian_Wirtz', 'is_associated_with', 'Bayer_Leverkusen')
                  ('Florian_Wirtz', 'belongs_to', 'Bayer_Leverkusen')
                