# Tworzenie grafu wiedzy z pobranych wcześniej danych


In [38]:
from rdflib import Graph, Namespace
import re
from dataclasses import dataclass, field
from typing import List, Optional, Dict
from collections import defaultdict
from typing import Tuple
import json

In [30]:
DC = Namespace("http://purl.org/dc/elements/1.1/")

g = Graph()
g.parse("data/merged_graph_turtle.ttl", format="turtle")
print(f"Załadowano {len(g)} trójek")

Załadowano 3388 trójek


In [31]:
# funkcja normalizująca daty

def normalize(date_str):
    if not date_str:
        return None

    date_str = date_str.strip()

    # Wzorzec 1: zwykły rok (np. "1999")
    match = re.match(r"^(\d{4})$", date_str)
    if match:
        year = int(match.group(1))
        return {
            'original': date_str,
            'year': year,
            'century': (year - 1) // 100 + 1,
        }

    # Wzorzec 2: daty przybliżone (np. "ca 1999")
    match = re.match(r"^(ca|circa|c\.)\s*(\d{4})$", date_str, re.IGNORECASE)
    if match:
        year = int(match.group(2))
        return {
            'original': date_str,
            'year': year,
            'century': (year - 1) // 100 + 1,
            'is_approximate': True,
        }

    # Wzorzec 3: zakresy dat (np. "[1990-1999]")
    match = re.match(r"^\[?(\d{4})\s*-\s*(\d{4})\]?$", date_str)
    if match:
        start_year = int(match.group(1))
        end_year = int(match.group(2))
        return {
            'original': date_str,
            'year': start_year,
            'end_year': end_year,
            'century': (start_year - 1) // 100 + 1,
            'is_range': True,
        }

    # Wzorzec 4: wiek (np. "19 w.")
    match = re.match(r"^(\d{1,2})\s*w\.$", date_str)
    if match:
        century = int(match.group(1))
        year = (century - 1) * 100 + 50 # środek wieku
        return {
            'original': date_str,
            'year': year,
            'century': century,
            'is_approximate': True,
        }

    # Wzorzec 5: data w nawiasach kwadratowych (np. "[1990]")
    match = re.match(r"^\[?(\d{4})\s*\]?$", date_str)
    if match:
        year = int(match.group(1))
        return {
            'original': date_str,
            'year': year,
            'century': (year - 1) // 100 + 1,
        }

    return {
        'original': date_str,
        'year': None,
        'century': None,
        'is_unparsed': True,
    }

In [32]:
# test
test_dates = ["2004", "ca 1946", "[1930-1936]", "19 w.", "1893", "[1953]"]
for d in test_dates:
    print(f"{d} -> {normalize(d)}")

2004 -> {'original': '2004', 'year': 2004, 'century': 21}
ca 1946 -> {'original': 'ca 1946', 'year': 1946, 'century': 20, 'is_approximate': True}
[1930-1936] -> {'original': '[1930-1936]', 'year': 1930, 'end_year': 1936, 'century': 20, 'is_range': True}
19 w. -> {'original': '19 w.', 'year': 1850, 'century': 19, 'is_approximate': True}
1893 -> {'original': '1893', 'year': 1893, 'century': 19}
[1953] -> {'original': '[1953]', 'year': 1953, 'century': 20}


In [33]:
# klasa Document reprezentująca dokumenty w grafie
@dataclass
class Document:
    identifier: str
    title: str
    description: str = ""
    subjects: List[str] = field(default_factory=list)
    date_raw: str = ""
    date_normalized: Optional[Dict] = None
    year: Optional[int] = None
    century: Optional[int] = None
    creator: Optional[str] = None
    publisher: Optional[str] = None
    type: str = ""

    def __post_init__(self):
        if self.date_raw:
            self.date_normalized = normalize(self.date_raw)
            if self.date_normalized:
                self.year = self.date_normalized['year']
                self.century = self.date_normalized['century']

@dataclass
class Subject:
    name: str
    documents: List[str] = field(default_factory=list)

    def add_document(self, doc_id: str):
        if doc_id not in self.documents:
            self.documents.append(doc_id)

In [34]:
class Relation:
    def __init__(self, source_id: str, target_id: str, relation_type: str, weight: float):
        self.source_id = source_id
        self.target_id = target_id
        self.relation_type = relation_type
        self.weight = weight

class KnowledgeGraph:
    def __init__(self):
        self.documents: Dict[str, Document] = {}
        self.subjects: Dict[str, Subject] = {}
        self.relations: List[Relation] = []

        self.by_year: Dict[int, List[str]] = defaultdict(list)
        self.by_century: Dict[int, List[str]] = defaultdict(list)
        self.by_subject: Dict[str, List[str]] = defaultdict(list)

    def add_document(self, doc: Document):
        self.documents[doc.identifier] = doc

        if doc.year is not None:
            self.by_year[doc.year].append(doc.identifier)
        if doc.century is not None:
            self.by_century[doc.century].append(doc.identifier)

        for subject_name in doc.subjects:
            subject_key = subject_name.lower()
            self.by_subject[subject_key].append(doc.identifier)

            if subject_key not in self.subjects:
                self.subjects[subject_key] = Subject(name=subject_name)
            self.subjects[subject_key].add_document(doc.identifier)

    def build_relations(self):

        # 1. wspólne subjects
        for subject_key, doc_ids in self.by_subject.items():
            if len(doc_ids) > 1:
                for i, doc_id1 in enumerate(doc_ids):
                    for doc_id2 in doc_ids[i+1:]:
                        if doc_id1 != doc_id2:
                            self.relations.append(Relation(doc_id1, doc_id2, "shared_subject", 1.0))


        # 2. chronologicznie
        sorted_years = sorted(self.by_year.keys())
        for i in range(len(sorted_years) - 1):
            year1 = sorted_years[i]
            year2 = sorted_years[i + 1]
            doc_ids1 = self.by_year[year1]
            doc_ids2 = self.by_year[year2]
            if year2 - year1 <= 5:      # odległość do 5 lat
                for doc_id1 in doc_ids1:
                    for doc_id2 in doc_ids2:
                        self.relations.append(Relation(doc_id1, doc_id2, "close_next", 1.0))


        # 3. te same century
        for century, doc_ids in self.by_century.items():
            if len(doc_ids) > 1:
                for i, doc_id1 in enumerate(doc_ids):
                    for doc_id2 in doc_ids[i+1:]:
                        if doc_id1 != doc_id2:
                            self.relations.append(Relation(doc_id1, doc_id2, "same_century", 0.5))

    def get_related_documents(self, doc_id: str, max_results: int = 5) -> List[Tuple[str, str, float]]:
        related = defaultdict(float)

        for relation in self.relations:
            if relation.source_id == doc_id:
                related[relation.target_id] += relation.weight
            elif relation.target_id == doc_id:
                related[relation.source_id] += relation.weight

        sorted_related = sorted(related.items(), key=lambda x: x[1], reverse=True)
        result = []
        for doc_id, score in sorted_related[:max_results]:
            result.append((self.documents[doc_id], score))

        return result

In [None]:
def build_kg_from_rdf(rdf_graph):
    kg = KnowledgeGraph()

    docs_data = defaultdict(lambda: {
        'title': None,
        'identifier': None,
        'description': None,
        'date': None,
        'subjects': [],
        'creator': None,
        'publisher': None,
        'type': None,
    })

    query = """
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    SELECT ?doc ?title ?date ?description ?subject ?identifier ?creator ?publisher ?type
    WHERE {
        ?doc dc:title ?title .
        OPTIONAL { ?doc dc:date ?date }
        OPTIONAL { ?doc dc:description ?description }
        OPTIONAL { ?doc dc:subject ?subject }
        OPTIONAL { ?doc dc:identifier ?identifier }
        OPTIONAL { ?doc dc:creator ?creator }
        OPTIONAL { ?doc dc:publisher ?publisher }
        OPTIONAL { ?doc dc:type ?type }
    }
    """

    # zapisanie danych w odpowiednich zmiennych
    for row in rdf_graph.query(query):
        doc_uri = str(row.doc)

        if row.title:
            docs_data[doc_uri]['title'] = str(row.title)
        if row.date and not docs_data[doc_uri]['date']:
            docs_data[doc_uri]['date'] = str(row.date)
        if row.description:
            docs_data[doc_uri]['description'] = str(row.description)
        if row.subject:
            docs_data[doc_uri]['subjects'].append(str(row.subject))
        if row.identifier:
            docs_data[doc_uri]['identifier'] = str(row.identifier)
        if row.creator:
            docs_data[doc_uri]['creator'] = str(row.creator)
        if row.publisher:
            docs_data[doc_uri]['publisher'] = str(row.publisher)
        if row.type:
            docs_data[doc_uri]['type'] = str(row.type)

    # tworzenie obiektów Document i dodawanie ich do KnowledgeGraph
    for doc_uri, data in docs_data.items():
        if data['title'] and data['identifier']:
            doc = Document(
                identifier=data['identifier'],
                title=data['title'],
                description=data['description'] or "",
                subjects=data['subjects'],
                date_raw=data['date'] or  "",
                creator=data['creator'] or  "",
                publisher=data['publisher'] or  "",
                type=data['type'] or  "",
            )
            kg.add_document(doc)

    # budowanie relacji
    kg.build_relations()

    return kg

In [36]:
kg = build_kg_from_rdf(g)
print(f"Wczytano {len(kg.documents)} dokumentów do grafu wiedzy.")
print(f"Utworzono {len(kg.subjects)} tematów.")
print(f"Utworzono {len(kg.relations)} relacji między dokumentami.")

Wczytano 149 dokumentów do grafu wiedzy.
Utworzono 192 tematów.
Utworzono 967853 relacji między dokumentami.


In [None]:
# Testowe zapytania

# Statystyki
years_with_docs = [y for y in kg.by_year.keys()]
print(f"Zakres lat: {min(years_with_docs)} - {max(years_with_docs)}")

# Top 10 subjects
top_subjects = sorted(kg.subjects.items(),
                     key=lambda x: len(x[1].documents),
                     reverse=True)[:10]
print("\nTop 10 subjects:")
for subj_key, subj in top_subjects:
    print(f"  {subj.name}: {len(subj.documents)} dokumentów")

# Przykład: dokumenty z 20 wieku
docs_20c = kg.by_century.get(20, [])
print(f"\nDokumenty z XX wieku.: {len(docs_20c)}")

# Przykład: powiązane dokumenty
if kg.documents:
    first_doc_id = list(kg.documents.keys())[0]
    related = kg.get_related_documents(first_doc_id, max_results=3)
    print(f"\nDokumenty powiązane z '{kg.documents[first_doc_id].title}':")
    for doc, score in related:
        print(f"  - {doc.title} (score: {score:.2f})")

Zakres lat: 1859 - 2022

Top 10 subjects:
  duchowość: 126 dokumentów
  nabożeństwo żałobne: 126 dokumentów
  pogrzeb: 126 dokumentów
  religia: 126 dokumentów
  socjologia: 126 dokumentów
  społeczeństwo: 126 dokumentów
  druki ulotne 20 w.: 122 dokumentów
  Kraków (Polska ; region): 108 dokumentów
  druki ulotne 19 w.: 17 dokumentów
  druki ulotne 21 w.: 10 dokumentów

Dokumenty z XX wieku.: 120

Dokumenty powiązane z '„Your Future”':
  - Błażej Gastoł, doktór medycyny, profesor nadzwyczajny higieny, kierownik Katedry i Zakładu Higieny Akademii Medycznej w Krakowie [...] urodzony 15 lutego 1905 r. w Bieżanowie, zmarł w Krakowie dnia 6 grudnia 1966 r. [...] (score: 8.50)
  - Ś. P. Zofia Gąsiorowska, emer. nauczycielka [...] przeżywszy lat 76 [...] zmarła dnia 8 sierpnia 1961 r. [...] (score: 8.50)
  - Ś. P. Julian Gałek [...] zmarł dnia 3 stycznia 1960 roku [...] (score: 8.50)


In [None]:
def export_kg_to_jsonld(kg: KnowledgeGraph, output_file: str):
    context = {
    "@context": {
        "@vocab": "http://jbc.bj.uj.edu.pl/vocab/",
        "dc": "http://purl.org/dc/elements/1.1/",
        "title": "dc:title",
        "subject": "dc:subject",
        "date": "dc:date",
        "description": "dc:description",
        "identifier": "dc:identifier",
        "creator": "dc:creator",
        "publisher": "dc:publisher",
        "type": "dc:type",
        "hasRelation": "http://jbc.bj.uj.edu.pl/vocab/hasRelation",
        "relationType": "http://jbc.bj.uj.edu.pl/vocab/relationType",
        "relatedTo": "http://jbc.bj.uj.edu.pl/vocab/relatedTo",
        "weight": "http://jbc.bj.uj.edu.pl/vocab/weight",
        "year": "http://jbc.bj.uj.edu.pl/vocab/year",
        "century": "http://jbc.bj.uj.edu.pl/vocab/century",
    }
}

    documents = []
    for doc in kg.documents.values():
        doc_obj = {
            "@id": doc.identifier,
            "@type": "Document",
            "title": doc.title,
            "description": doc.description,
            "date": doc.date_raw,
            "year": doc.year,
            "century": doc.century,
            "subject": doc.subjects,
            "creator": doc.creator,
            "publisher": doc.publisher,
            "type": doc.type,
            "hasRelation": []
        }

        for rel in kg.relations:
            if rel.source_id == doc.identifier:
                doc_obj["hasRelation"].append({
                    "@type": "Relation",
                    "relationType": rel.relation_type,
                    "relatedTo": rel.target_id,
                    "weight": rel.weight,
                })
            elif rel.target_id == doc.identifier:
                doc_obj["hasRelation"].append({
                    "@type": "Relation",
                    "relationType": rel.relation_type,
                    "relatedTo": rel.source_id,
                    "weight": rel.weight,
                })
        documents.append(doc_obj)


    subjects = []
    for subj_key, subj in kg.subjects.items():
        subj_obj = {
            "@id": f"http://jbc.bj.uj.edu.pl/subject/{subj_key}",
            "@type": "Subject",
            "name": subj.name,
            "documents": subj.documents,
        }
        subjects.append(subj_obj)

    graph = {
        "@context": context["@context"],
        "@graph": documents + subjects
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(graph, f, ensure_ascii=False, indent=2)

    print(f"Zapisano graf do {output_file}")
    print(f"  - Dokumentów: {len(documents)}")
    print(f"  - Subjects: {len(subjects)}")
    print(f"  - Relacji: {len(kg.relations)}")

    return graph

In [40]:
jsonld_graph = export_kg_to_jsonld(kg, "data/jbc_knowledge_graph.jsonld")

Zapisano graf do data/jbc_knowledge_graph.jsonld
  - Dokumentów: 149
  - Subjects: 192
  - Relacji: 967853


In [None]:
print("\nPrzykładowy dokument w JSON-LD:")
print(json.dumps(jsonld_graph["@graph"][0], ensure_ascii=False, indent=2))