In [112]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import os
import json
import xml.etree.ElementTree as ET
from networkx.readwrite import graphml

### Datasets for KG creation

In [113]:
# Input triplets (taken from the postprocessed documents), keyed by split name.
datasets = dict(
    train='postprocessed-dataset/train_chunks.json',
    dev='postprocessed-dataset/dev_chunks.json',
    test='postprocessed-dataset/test_chunks.json',
    test_ood='postprocessed-dataset/test_ood_chunks.json',
)
print("Datasets to be processed:", datasets.keys())

# Combined knowledge graph; a directed multigraph, so several edges may link the same pair of nodes.
G = nx.MultiDiGraph()

Datasets to be processed: dict_keys(['train', 'dev', 'test', 'test_ood'])


### Function for splitting an entity into its name and label
Triplets in this dataset have the following structure:

$$[\text{subject}:\text{label},\ \text{relationship},\ \text{object}:\text{label}]$$

In [114]:
def parse_entity(e):
    """Split an ``entity:type`` string into its two parts.

    Args:
        e: Entity string. Everything before the first ``:`` is taken as the
            entity label and everything after it as the entity type.

    Returns:
        A ``(label, etype)`` tuple with surrounding whitespace removed from
        both parts. ``etype`` is ``None`` when ``e`` contains no ``:`` or when
        the text after it is empty or whitespace-only.
    """
    # partition() splits on the first ":" only, so any later colon stays in the type.
    label, _, etype = e.partition(":")
    # A blank type carries no information: report it as None rather than "".
    return label.strip(), etype.strip() or None

### Reading datasets and KG creation
Each dataset contains the following fields:
- doc_id — Unique identifier for the source document.
- chunk_id — Identifier for the text chunk or segment within the document.
- relations — List of extracted relations, where each relation follows the structure:
    - subject:label, relationship, object:label


During processing, the system reads each entry and:
- Creates a node for every unique entity (subject and object).
- Creates an edge between those nodes representing the specified relationship.

In [115]:
# Build the combined graph G from every split listed in `datasets`.
for split, filepath in datasets.items():
    with open(filepath, 'r', encoding='utf-8') as f:
        print(f"Processing {filepath}...")
        data = json.load(f)

    # The JSON is fully loaded at this point, so the graph is built with the file already closed.
    for item in data:
        doc_id = item.get('doc_id')
        chunk_id = item.get('chunk_id')
        # `or []` also covers an explicit null "relations" value, not only a missing key.
        relations = item.get('relations') or []

        for i, (entity1, rel, entity2) in enumerate(relations):
            entity1_label, entity1_type = parse_entity(entity1)
            entity2_label, entity2_type = parse_entity(entity2)

            for node_label, node_type in (
                (entity1_label, entity1_type),
                (entity2_label, entity2_type),
            ):
                if node_type is None:
                    # Do not store type=None: it would overwrite a type recorded by an
                    # earlier mention of this entity, and None attribute values cannot
                    # be serialised by nx.write_graphml.
                    G.add_node(node_label)
                else:
                    G.add_node(node_label, type=node_type)

            # Because MultiDiGraph allows multiple edges between nodes, we create a unique key
            # for each edge; otherwise Gephi warns that an edge with that ID already exists.
            edge_key = f"{entity1_label}_{entity2_label}_{doc_id}_{chunk_id}_{i}"

            G.add_edge(
                entity1_label,
                entity2_label,
                key=edge_key,
                relation=rel,
                doc_id=doc_id,
                chunk_id=chunk_id,
            )



Processing postprocessed-dataset/train_chunks.json...
Processing postprocessed-dataset/dev_chunks.json...
Processing postprocessed-dataset/test_chunks.json...
Processing postprocessed-dataset/test_ood_chunks.json...


### Saving knowledge graph

In [116]:
# Write the combined graph to Knowledge-graph/kg_total.graphml, creating the folder if needed.
os.makedirs("Knowledge-graph", exist_ok=True)
graph_path = os.path.join("Knowledge-graph", "kg_total.graphml")
nx.write_graphml(G, graph_path)

# Report the size of the graph that was just saved.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print(f"Nodes: {node_count}, Edges: {edge_count}")

Nodes: 5640, Edges: 12083


In [117]:
def extract_kg(name, input_file, output_dir="Knowledge-graph"):
    """Build a knowledge graph from one chunk file and save it as GraphML.

    The input is a JSON list of items. For each item the ``doc_id``,
    ``chunk_id`` and ``relations`` fields are read; every relation is a
    ``(subject, relationship, object)`` triple whose subject and object are
    split with ``parse_entity``. Each entity becomes a node and each relation
    becomes a directed edge carrying ``relation``, ``doc_id`` and ``chunk_id``
    attributes.

    Args:
        name: Suffix used for the output file name, ``kg_{name}.graphml``.
        input_file: Path of the JSON chunk file to read.
        output_dir: Directory the GraphML file is written to; created if it
            does not exist. Defaults to ``"Knowledge-graph"``.

    Returns:
        None. The graph is written to disk and its node/edge counts are printed.
    """
    graph = nx.MultiDiGraph()
    with open(input_file, 'r', encoding='utf-8') as f:
        print(f"Processing {input_file}...")
        data = json.load(f)

    for item in data:
        doc_id = item.get('doc_id')
        chunk_id = item.get('chunk_id')
        # `or []` also covers an explicit null "relations" value, not only a missing key.
        relations = item.get('relations') or []

        for i, (entity1, rel, entity2) in enumerate(relations):
            entity1_label, entity1_type = parse_entity(entity1)
            entity2_label, entity2_type = parse_entity(entity2)

            for node_label, node_type in (
                (entity1_label, entity1_type),
                (entity2_label, entity2_type),
            ):
                if node_type is None:
                    # Do not store type=None: it would overwrite a type recorded by an
                    # earlier mention of this entity, and None attribute values cannot
                    # be serialised by nx.write_graphml.
                    graph.add_node(node_label)
                else:
                    graph.add_node(node_label, type=node_type)

            # Because MultiDiGraph allows multiple edges between nodes, we create a unique key
            # for each edge; otherwise Gephi warns that an edge with that ID already exists.
            edge_key = f"{entity1_label}_{entity2_label}_{doc_id}_{chunk_id}_{i}"

            graph.add_edge(
                entity1_label,
                entity2_label,
                key=edge_key,
                relation=rel,
                doc_id=doc_id,
                chunk_id=chunk_id,
            )

    os.makedirs(output_dir, exist_ok=True)

    graph_path = os.path.join(output_dir, f"kg_{name}.graphml")
    nx.write_graphml(graph, graph_path)

    print(f"Nodes: {len(graph.nodes)}, Edges: {len(graph.edges)}")

In [118]:
# Export one graph per split. Iterating over `datasets` keeps this cell in sync with the
# split list defined at the top of the notebook instead of repeating it by hand.
for split_name, split_path in datasets.items():
    extract_kg(split_name, split_path)

Processing postprocessed-dataset/train_chunks.json...
Nodes: 4164, Edges: 8743
Processing postprocessed-dataset/dev_chunks.json...
Nodes: 646, Edges: 1132
Processing postprocessed-dataset/test_chunks.json...
Nodes: 969, Edges: 1626
Processing postprocessed-dataset/test_ood_chunks.json...
Nodes: 407, Edges: 582
