# PDF Text Extraction

In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path, start_page=12):
    """Extract the text of a PDF as one flat string, starting at a given page.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``fitz.open``.
        start_page: 1-based number of the first page to include. The default
            of 12 skips the title page and the table of contents. Values
            below 1 are treated as 1 (the whole document is read).

    Returns:
        The text of pages ``start_page`` .. last, concatenated without a
        separator, with every newline inside a page replaced by a space.
        An empty string if ``start_page`` is past the last page.
    """
    # Clamp to 0: a negative index would address pages from the END of the
    # document and silently prepend the last page to the result.
    first_index = max(start_page - 1, 0)

    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        page_texts = [
            doc[page_num].get_text("text").replace("\n", " ")
            for page_num in range(first_index, len(doc))
        ]

    # Join once instead of growing a string page by page.
    return "".join(page_texts)

# Location of the rule-book PDF, relative to the notebook's working directory.
pdf_path = "dane/PrzepisyGry_2024_25.pdf"
# start_page=12 (same as the function default) means the first 11 pages are skipped.
extracted_text1 = extract_text_from_pdf(pdf_path, start_page=12)

# Sanity check: display the first 100 characters of the extracted text
print(extracted_text1[:100])  

        Duch gry – filozofia  Przepisów  Piłka nożna jest najpiękniejszym sportem na świecie. Uprawi


# Load Additional Explanations

In [None]:
# Read the whole supplementary explanations file into memory as UTF-8 text.
# Unlike the PDF text, newlines in this file are kept as-is.
with open("dane/explanations.txt", "r", encoding="utf-8") as f:
    extracted_text2= f.read()
# Sanity check: display the first 100 characters of the loaded text
print(extracted_text2[:100])

Wymiary wykonywane są od zewnętrznej krawędzi linii, jako że linie te należą do powierzchni, których


# Combine Extracted Texts

In [None]:
# Single corpus for the NLP stages: PDF text first, then the explanations,
# separated by exactly one newline.
extracted_text = "\n".join((extracted_text1, extracted_text2))

## URI Cleaning

In [None]:
import re
import urllib.parse

def clean_for_uri(text):
    """Turn free text into a slug that can be used as the local part of a URI.

    Steps: lowercase the input, transliterate the nine Polish diacritic
    letters to ASCII, turn spaces into underscores, drop every character
    that is neither a word character nor a hyphen, and finally
    percent-encode whatever non-ASCII word characters remain.

    Args:
        text: Arbitrary string (e.g. a lemmatised phrase or a label).

    Returns:
        The URI-safe slug; may be empty if nothing survives the filtering.
    """
    # One table handles both the diacritics and the space -> underscore step;
    # the two substitutions touch disjoint characters, so a single pass is enough.
    replacements = str.maketrans({
        "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
        "ó": "o", "ś": "s", "ź": "z", "ż": "z",
        " ": "_",
    })
    slug = text.lower().translate(replacements)
    # Only a literal space became "_" above; tabs, newlines and punctuation are removed here.
    slug = re.sub(r"[^\w\-]", "", slug)
    # \w is Unicode-aware, so letters such as "é" can survive; quote() encodes them.
    return urllib.parse.quote(slug)


## Entity Dictionary
This section defines an extended dictionary of entities relevant to football (soccer) rules.

In [None]:
# Domain dictionary: maps an entity label to the Polish terms that belong to it.
# Later cells register every term under its label with the spaCy matchers and
# use the label (via clean_for_uri) as the rdf:type class of matched entities.
#
# NOTE(review): several terms are listed under more than one label (e.g.
# "trener" in POZYCJA and OSOBA_FUNCKYJNA, "dogrywka" and "rzuty karne" in
# CZYNNOŚĆ and CZAS, "symulacja" in CZYNNOŚĆ and STAN_GRY). The type-assignment
# cell takes the first matching label it finds, so such terms get one type only
# — confirm that this is intended.
entities = {
    # Pitch markings: lines, marks and arcs.
    "LINIA": [
        "linia", "linia boczna", "linia bramkowa", "linia środkowa", 
        "punkt", "punkt karny", "punkt środkowy", 
        "linia pola karnego", "linia pola bramkowego", "linia pola rożnego", 
        "łuk pola karnego", "łuk pola rożnego", "linia spalonego"
    ],
    # Player positions, team staff and match officials.
    "POZYCJA": [
        "zawodnik", "bramkarz", "obrońca", "środkowy obrońca", "boczny obrońca",
        "pomocnik", "pomocnik defensywny", "pomocnik ofensywny", "skrzydłowy",
        "napastnik", "rezerwowy", "zawodnik wymieniony", "kapitan drużyny",
        "trener", "asystent trenera", "sędzia", "sędzia główny", 
        "sędzia asystent", "sędzia techniczny", "sędzia VAR", 
        "operator VAR", "lekarz drużyny", "fizjoterapeuta", "kierownik drużyny"
    ],
    # Equipment of players, officials and the pitch.
    "SPRZĘT": [
        "sprzęt", "piłka", "piłka meczowa", "buty", "korki", "strój", 
        "getry", "ochraniacze", "ochraniacze goleni", "rękawice bramkarskie",
        "chorągiewka", "chorągiewka boczna", "bramka", "siatka", 
        "poprzeczka", "słupek", "tablica zmian", "gwizdek", "zegarek sędziego", 
        "kamera VAR", "monitor VAR", "system GLT", "system VAR"
    ],
    # Actions and restarts that occur during play.
    "CZYNNOŚĆ": [
        "czynność", "rozpoczęcie gry", "wznowienie gry", "rzut karny", 
        "rzut wolny", "rzut wolny pośredni", "rzut wolny bezpośredni", 
        "rzut z autu", "rzut rożny", "rzut od bramki", "dogrywka", 
        "rzuty karne", "wrzut", "drybling", "pressing", "odbiór piłki", 
        "strzał", "strzał na bramkę", "podanie", "asysta", "zagranie ręką",
        "przewinienie", "przewinienie taktyczne", "symulacja", "opóźnianie gry",
        "brutalność", "gra niebezpieczna", "korzyść sędziowska", "wideoweryfikacja"
    ],
    # Periods and timing of a match.
    "CZAS": [
        "czas", "czas gry", "pierwsza połowa", "druga połowa", 
        "doliczony czas", "doliczony czas gry", "dogrywka", "rzuty karne", 
        "seria rzutów karnych", "przerwa", "przerwa w grze", 
        "przerwa na chłodzenie", "czas zawieszenia", "czas przewinienia"
    ],
    # Governing bodies and organisers.
    "ORGANIZACJA": [
        "organizacja", "IFAB", "FIFA", "UEFA", "PZPN", "konfederacja", 
        "federacja krajowa", "komisja sędziowska", "organizator rozgrywek"
    ],
    # Disciplinary sanctions.
    "SANKCJA": [
        "sankcja", "żółta kartka", "czerwona kartka", "napomnienie", 
        "wykluczenie", "kara wychowawcza", "zawieszenie czasowe", 
        "zawieszenie meczowe", "kara techniczna", "upomnienie słowne"
    ],
    # Team officials and other functionaries.
    # NOTE(review): the key looks like a typo of "OSOBA_FUNKCYJNA" (the first
    # term below spells it "funkcyjna"). The key ends up in the RDF output as a
    # class name, so renaming it changes the generated graph — confirm no
    # downstream consumer relies on the current spelling before fixing.
    "OSOBA_FUNCKYJNA": [
        "osoba funkcyjna", "trener", "asystent trenera", "fizjoterapeuta", 
        "lekarz drużyny", "analityk wideo", "kierownik drużyny", 
        "sędzia techniczny", "delegat meczu", "operator VAR"
    ],
    # Game states and offence categories.
    "STAN_GRY": [
        "stan gry", "spalony", "przewinienie", "faul", "symulacja", 
        "opóźnianie gry", "brutalność", "gra niebezpieczna", 
        "korzyść sędziowska", "nieuznana bramka", "kontuzja zawodnika"
    ],
    # Referee decisions and signals.
    "DECYZJA_SEDZIEGO": [
        "decyzja sędziego", "gwizdek rozpoczęcia", "gwizdek zakończenia", 
        "pokazanie kartki", "przyznanie rzutu karnego", "przyznanie rzutu wolnego", 
        "wznowienie gry", "przerwanie gry", "korzyść", "wideoweryfikacja VAR", 
        "anulowanie bramki", "przyznanie gola", "rzut sędziowski"
    ],
    # External conditions: surface, weather, lighting.
    "WARUNKI_ZEWNETRZNE": [
        "warunki zewnętrzne", "stan murawy", "rodzaj nawierzchni", 
        "sztuczna murawa", "naturalna murawa", "system hybrydowy", 
        "warunki pogodowe", "oświetlenie stadionowe", "warunki widoczności"
    ],
    # Match events.
    "ELEMENTY_WYDARZENIA": [
        "element wydarzenia", "gol", "zdobycie bramki", "utrata bramki", 
        "interwencja bramkarza", "obrona strzału", "niecelny strzał", 
        "interwencja VAR", "zmiana zawodnika", "przewinienie taktyczne", 
        "kontuzja", "decyzja o dogrywce"
    ],
    # Verb phrases (infinitive form) for starting, restarting and stopping play.
    "AKCJA": ["wznowić grę", "rozpocząć grę", "przerwać grę"]

}

## NLP and RDF Setup
This section initializes the NLP tools (spaCy and Stanza) and sets up the RDF graph.

In [None]:
import spacy
import stanza
from spacy.matcher import PhraseMatcher, Matcher
from rdflib import Graph, Namespace, URIRef, RDF, RDFS, Literal


# Spacy initialization
# Large Polish pipeline; it must be installed in the environment beforehand.
nlp = spacy.load("pl_core_news_lg")
# Raise spaCy's text-length limit so the whole combined corpus can be
# processed in a single nlp() call further down.
nlp.max_length = 2000000
# PhraseMatcher compares lowercase token text (case-insensitive surface match).
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# Token-pattern matcher; the graph-construction cell fills it with LEMMA patterns.
matcher = Matcher(nlp.vocab)

# Stanza initialization
# NOTE(review): download() runs on every execution of this cell and presumably
# needs network access the first time — confirm for offline / CI runs.
stanza.download('pl')
nlp_stanza = stanza.Pipeline('pl')

# RDF setup
# All generated resources, classes and predicates live under this namespace.
EX = Namespace("http://example.org/")
g = Graph()
# Prefix bindings only affect how the graph is serialised.
g.bind("ex", EX)
g.bind("rdfs", RDFS)

## Helper Functions for Entity Matching
This section defines helper functions to find the longest matching entity and to map tokens to entities.

In [None]:
def get_entity_for_token(token, doc, resources):
    """Find the entity phrase whose occurrence in ``doc`` contains ``token``.

    Candidates are tried from the longest string to the shortest. A candidate
    occurs at a position when the lowercased lemmas of consecutive tokens
    equal its whitespace-separated words.

    Args:
        token: Token of ``doc``; its ``i`` and ``lemma_`` attributes are read.
        doc: Sliceable token sequence whose slices expose ``start`` / ``end``.
        resources: Iterable of entity phrases (iterating a dict yields its keys).

    Returns:
        The first (hence longest) candidate with an occurrence covering the
        token, otherwise the token's own lowercased lemma.
    """
    position = token.i
    candidates = sorted(resources, key=len, reverse=True)

    for candidate in candidates:
        words = candidate.split()
        width = len(words)
        for begin in range(len(doc) - width + 1):
            window = doc[begin:begin + width]
            lemmas_agree = all(
                tok.lemma_.lower() == word for tok, word in zip(window, words)
            )
            if lemmas_agree and window.start <= position < window.end:
                return candidate

    # No dictionary phrase covers this token.
    return token.lemma_.lower()

In [None]:
def find_longest_entity(text, resources):
    """Return the longest entity phrase that contains ``text`` as whole words.

    ``text`` and every entity are split on whitespace and compared word by
    word, so ``"gol"`` matches ``"gol"`` but not ``"ochraniacze goleni"``.
    (A plain substring test would also match inside words, and an empty
    ``text`` would match every entity.)

    Args:
        text: Phrase to look up.
        resources: Iterable of entity phrases (iterating a dict yields its keys).

    Returns:
        The longest (by character count) entity containing the words of
        ``text`` as a contiguous run; ``text`` itself if there is none or if
        ``text`` contains no words.
    """
    words = text.split()
    if not words:
        # Nothing to look up; never map blank text to an arbitrary entity.
        return text

    width = len(words)

    def _contains_phrase(ent_text):
        """True if ``words`` occurs as a contiguous word run inside ``ent_text``."""
        ent_words = ent_text.split()
        return any(
            ent_words[i:i + width] == words
            for i in range(len(ent_words) - width + 1)
        )

    matches = [ent_text for ent_text in resources if _contains_phrase(ent_text)]
    if matches:
        # Return the longest matching phrase
        return max(matches, key=len)
    return text


In [None]:

def find_covering_entity(token_idx, entity_spans):
    """Pick the entity with the most words among the spans covering an index.

    Args:
        token_idx: Token index to look up.
        entity_spans: Iterable of ``(start, end, entity_text)`` triples with a
            half-open ``[start, end)`` range.

    Returns:
        The entity text with the largest whitespace-separated word count among
        the covering spans (the earliest one wins a tie), or ``None`` when no
        span covers ``token_idx``.
    """
    found = False
    best_entity = None
    best_words = -1

    for span_start, span_end, entity in entity_spans:
        if not (span_start <= token_idx < span_end):
            continue
        word_count = len(entity.split())
        # Strict comparison keeps the first candidate when word counts tie.
        if not found or word_count > best_words:
            found = True
            best_entity = entity
            best_words = word_count

    return best_entity if found else None

## RDF Graph Construction
This section processes the extracted text, matches entities and actions, and builds the RDF graph with relationships.

In [None]:
from rdflib import URIRef, RDF, RDFS, Literal



# Register every dictionary term, under its entity label, with both matchers:
#   * PhraseMatcher gets the tokenised term itself,
#   * Matcher gets a token pattern comparing each whitespace-separated word
#     of the term with the token LEMMA.
# NOTE(review): the LEMMA patterns are built from the terms exactly as written
# in `entities`; inflected or plural terms (e.g. "rzuty karne") presumably only
# match when the lemmatiser happens to return that same form — confirm.
for label, terms in entities.items():
    patterns = [nlp.make_doc(term) for term in terms]
    phrase_matcher.add(label, patterns)
    for term in terms:
        lemma_pattern = [{"LEMMA": word} for word in term.split()]
        matcher.add(label, [lemma_pattern])

# Example sentence (uncomment both lines to try the pipeline on a short text
# instead of the full corpus)
#text = "VAR referee analyzes the situation on the field. The player takes a penalty kick. The player cannot touch the ball with his hand."
#extracted_text = text
# Run the spaCy pipeline once over the whole combined text
doc = nlp(extracted_text)

# Surface-form matches from the PhraseMatcher
phrase_matches = phrase_matcher(doc)

# Lemma-based matches from the token Matcher
lemma_matches = matcher(doc)

# Both result lists are concatenated as-is, so a span found by both matchers
# appears twice. Each element is unpacked below as (match_id, start, end).
matches = phrase_matches + lemma_matches

# Build the entity registry: lowercased lemma phrase -> URI in the EX namespace.
# A phrase is registered the first time one of its matches is seen.
resources = {}

for match_id, start, end in matches:
    span = doc[start:end]
    span_lemma = " ".join(token.lemma_.lower() for token in span)
    if span_lemma in resources:
        continue
    uri = URIRef(EX[clean_for_uri(span_lemma)])
    resources[span_lemma] = uri


# --- Stanza pass: per-sentence entity spotting + positional subject/verb/object heuristic ---
doc_stanza = nlp_stanza(extracted_text)

# Entity spans from all sentences as (start, end, entity_text). Indices are
# document-wide Stanza word positions: each sentence-local index is shifted by
# the number of words in the preceding sentences. Without the shift, spans
# from different sentences share the same small index range and are wrongly
# treated as overlapping by the global de-duplication below.
all_found_entities = []
word_offset = 0

# Most words first, so multi-word entities claim their words before the
# shorter entities they contain. `resources` does not change inside the loop,
# so the ordering is computed once.
entities_longest_first = sorted(resources, key=lambda x: -len(x.split()))

for sentence in doc_stanza.sentences:
    # Guard against a missing lemma by falling back to the surface form.
    lemmas = [(w.lemma or w.text).lower() for w in sentence.words]

    # Find all entities in the sentence (longest matching phrases); `used`
    # holds the word positions already claimed by an entity.
    found_entities = []
    used = set()
    for ent_text in entities_longest_first:
        ent_lemmas = ent_text.split()
        width = len(ent_lemmas)
        if width == 0:
            # A blank key would "match" at every position with an empty span.
            continue
        for i in range(len(lemmas) - width + 1):
            if lemmas[i:i + width] != ent_lemmas:
                continue
            if used.intersection(range(i, i + width)):
                continue
            found_entities.append((i, i + width, ent_text))
            used.update(range(i, i + width))
    # `used` already guarantees the spans are pairwise disjoint, so no
    # separate "drop spans contained in a longer span" step is needed.
    found_entities.sort(key=lambda x: x[0])

    all_found_entities.extend(
        (start + word_offset, end + word_offset, ent)
        for start, end, ent in found_entities
    )
    word_offset += len(sentence.words)

    # Find verbs
    verbs = [w for w in sentence.words if w.upos == 'VERB']

    # Simple heuristic: entity before the verb is subject, after the verb is object.
    # Word ids are 1-based within the sentence, so the verb's 0-based position
    # is verb.id - 1; found_entities is still sentence-local here.
    for verb in verbs:
        verb_lemma = (verb.lemma or verb.text).lower()
        # Negation: a dependent of the verb that is tagged advmod:neg or is the word "nie"
        is_negated = any(
            w.deprel == 'advmod:neg' or w.text.lower() == 'nie'
            for w in sentence.words
            if w.head == verb.id
        )
        verb_predicate = f"nie_{verb_lemma}" if is_negated else verb_lemma
        verb_uri = URIRef(EX[clean_for_uri(verb_predicate)])

        # subj: last entity ending before the verb; obj: first entity starting after it
        subj = None
        obj = None
        for start, end, ent in found_entities:
            if end <= verb.id - 1:
                subj = ent
            elif start >= verb.id:
                obj = ent
                break

        if subj:
            subj_uri = resources.get(subj)
            if subj_uri:
                g.add((subj_uri, EX['wykonuje'], verb_uri))
        if obj:
            obj_uri = resources.get(obj)
            if obj_uri:
                g.add((verb_uri, EX['dotyczy'], obj_uri))
        if subj and obj:
            print(f"Dodano RDF (heurystyka): {subj} --wykonuje--> {verb_predicate} --dotyczy--> {obj}")

# --- GLOBAL DEDUPLICATION: KEEP ONLY THE LONGEST, NON-OVERLAPPING ENTITIES ---
# Sort: first by start, then by span length (descending), then by text length (descending)
all_entity_spans = sorted(
    all_found_entities,
    key=lambda x: (x[0], -(x[1] - x[0]), -len(x[2])),
)
final_spans = []
final_entities = set()
last_end = 0
for start_i, end_i, ent_i in all_entity_spans:
    # Spans arrive ordered by start and accepted spans never overlap, so a
    # span overlaps an accepted one exactly when it starts before the end of
    # the most recently accepted span. This replaces a scan of all accepted spans.
    if start_i < last_end:
        continue
    final_spans.append((start_i, end_i, ent_i))
    final_entities.add(ent_i)
    last_end = end_i

# --- spaCy dependency analysis with negation handling ---
# spaCy token indices are NOT comparable with the Stanza word indices stored
# in final_spans (different tokenisers, different numbering), so the
# token -> entity lookup is built from the spaCy matches themselves.
# Longest spans are visited first and setdefault keeps the first entry, so
# every token maps to the longest entity that covers it.
entity_by_token = {}
for _match_id, start, end in sorted(matches, key=lambda m: m[2] - m[1], reverse=True):
    span_lemma = " ".join(t.lemma_.lower() for t in doc[start:end])
    for token_idx in range(start, end):
        entity_by_token.setdefault(token_idx, span_lemma)

for token in doc:
    if token.pos_ != "VERB":
        continue

    verb = token.lemma_.lower()
    # Negation detection (e.g. 'nie' as a child of the verb or dep_ == 'neg')
    is_negated = any(
        child.dep_ == "neg" or child.text.lower() == "nie"
        for child in token.children
    )
    verb_predicate = f"nie_{verb}" if is_negated else verb
    subject = [child for child in token.children if child.dep_ in ("nsubj", "nsubj:pass")]
    obj = [child for child in token.children if child.dep_ in ("obj", "dobj", "obl")]

    # Prefer the longest entity covering the subject/object token; fall back
    # to the token's own lemma, which may itself be a registered entity.
    if subject and obj:
        subj_ent = entity_by_token.get(subject[0].i) or subject[0].lemma_.lower()
        obj_ent = entity_by_token.get(obj[0].i) or obj[0].lemma_.lower()
        subj_uri = resources.get(subj_ent)
        obj_uri = resources.get(obj_ent)
        if subj_uri and obj_uri:
            g.add((subj_uri, EX[clean_for_uri(verb_predicate)], obj_uri))

    # Location and other adverbials
    for child in token.children:
        if child.dep_ == "obl":
            loc_text = " ".join([t.text.lower() for t in child.subtree])
            loc_ent = find_longest_entity(loc_text, resources)
            loc_uri = resources.get(loc_ent)
            if loc_uri:
                g.add((URIRef(EX[clean_for_uri(verb_predicate)]), EX["miejsce"], loc_uri))
                    
# Add RDF type/label only for final_entities (unique, non-overlapping entities).
#
# One pass over the matches collects, per lemma phrase, the label of its first
# match and every surface form seen in the document. Previously the full match
# list was rescanned twice for each entity.
label_by_entity = {}
surface_forms_by_entity = {}
for match_id, s, e in matches:
    span = doc[s:e]
    span_lemma = " ".join(token.lemma_.lower() for token in span)
    # setdefault keeps the label of the FIRST match, as before.
    label_by_entity.setdefault(span_lemma, nlp.vocab.strings[match_id])
    surface_forms_by_entity.setdefault(span_lemma, set()).add(span.text)

for ent_text in final_entities:
    uri = resources[ent_text]
    label = label_by_entity.get(ent_text)
    if not label:
        continue

    # rdf:type comes from the dictionary label. The graph is a set of
    # triples, so adding an existing triple is a no-op.
    class_uri = URIRef(EX[clean_for_uri(label.capitalize())])
    g.add((uri, RDF.type, class_uri))

    # Add only ONE rdfs:label: the longest original text from the document
    # (original case preserved). Ties on length are broken by the text itself
    # so the chosen label is the same on every run; max() over a set alone
    # would depend on hash ordering.
    orig_texts = surface_forms_by_entity.get(ent_text)
    if orig_texts:
        best_label = max(orig_texts, key=lambda text: (len(text), text))
        g.add((uri, RDFS.label, Literal(best_label)))

# Serialization
# Uncomment to print the Turtle text instead of (or in addition to) writing it to disk:
#print(g.serialize(format="turtle", encoding="utf-8").decode("utf-8"))
# NOTE(review): the content is written in Turtle format although the file name
# ends in ".rdf" — confirm that consumers do not infer the format from the extension.
output_file = "graf5s.rdf"
g.serialize(destination=output_file, format="turtle", encoding="utf-8")
print(f"RDF graph has been saved to file: {output_file}")
