In [11]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import os
import json
import xml.etree.ElementTree as ET
from networkx.readwrite import graphml
import random

### Datasets for KG creation

In [2]:
# Input triplets, one postprocessed chunk file per dataset split.
# Kept as (split name, path) pairs so the processing order is explicit.
split_files = [
    ('train', 'postprocessed-dataset/train_chunks.json'),
    ('dev', 'postprocessed-dataset/dev_chunks.json'),
    ('test', 'postprocessed-dataset/test_chunks.json'),
    ('test_ood', 'postprocessed-dataset/test_ood_chunks.json'),
]
datasets = dict(split_files)
print("Datasets to be processed:", datasets.keys())

# Directed multigraph that accumulates the triplets of every split.
G = nx.MultiDiGraph()

Datasets to be processed: dict_keys(['train', 'dev', 'test', 'test_ood'])


### Function for splitting an entity into its name and label
Triplets in this dataset have the following structure:

$$[subject:label,relationship,object:label]$$

In [3]:
def parse_entity(e):
    """Split an entity string of the form ``"name:label"`` into its two parts.

    The label is the text after the LAST colon, so a name that itself contains
    a colon (e.g. ``"ResNet: deep residual network:Method"``) is kept whole
    instead of having its tail folded into the label.

    Parameters
    ----------
    e : str
        Raw entity string taken from a triplet.

    Returns
    -------
    tuple
        ``(name, label)``, both stripped of surrounding whitespace. ``label``
        is ``None`` when the string has no colon or when the part after the
        last colon is empty or whitespace-only.
    """
    name, separator, etype = e.rpartition(":")
    if not separator:
        # No colon at all: the whole string is the entity name.
        return e.strip(), None

    etype = etype.strip()
    return name.strip(), etype or None

### Reading datasets and KG creation
Each dataset contains the following fields:
- doc_id — Unique identifier for the source document.
- chunk_id — Identifier for the text chunk or segment within the document.
- relations — List of extracted relations, where each relation follows the structure:
    - subject:label, relationship, object:label


During processing, the system reads each entry and:
- Creates a node for every unique entity (subject and object).
- Creates an edge between those nodes representing the specified relationship.

In [4]:
# Build the combined knowledge graph G from every split listed in `datasets`.
for split, filepath in datasets.items():
    with open(filepath, 'r', encoding='utf-8') as f:
        print(f"Processing {filepath}...")
        data = json.load(f)

    # The JSON is fully parsed above, so the file is already closed while the graph is built.
    for item in data:
        doc_id = item.get('doc_id')
        chunk_id = item.get('chunk_id')
        # `or []` also covers an explicit null value for "relations".
        relations = item.get('relations') or []

        for i, (entity1, rel, entity2) in enumerate(relations):
            entity1_label, entity1_type = parse_entity(entity1)
            entity2_label, entity2_type = parse_entity(entity2)

            # parse_entity returns None for an entity without a label. The GraphML
            # writer rejects None attribute values, and add_node(type=None) would
            # overwrite a type recorded earlier for the same entity, so the
            # attribute is only set when a type is actually known.
            for label, etype in ((entity1_label, entity1_type),
                                 (entity2_label, entity2_type)):
                if etype is None:
                    G.add_node(label)
                else:
                    G.add_node(label, type=etype)

            # MultiDiGraph allows parallel edges between the same two nodes, so every
            # edge gets an explicit unique key; otherwise Gephi warns that an edge
            # with the same ID already exists.
            edge_key = f"{entity1_label}_{entity2_label}_{doc_id}_{chunk_id}_{i}"

            # Drop None-valued attributes for the same GraphML reason as above.
            edge_attrs = {'relation': rel, 'doc_id': doc_id, 'chunk_id': chunk_id}
            G.add_edge(
                entity1_label,
                entity2_label,
                key=edge_key,
                **{attr: value for attr, value in edge_attrs.items() if value is not None}
            )



Processing postprocessed-dataset/train_chunks.json...
Processing postprocessed-dataset/dev_chunks.json...
Processing postprocessed-dataset/test_chunks.json...
Processing postprocessed-dataset/test_ood_chunks.json...


### Saving knowledge graph

In [5]:
# Write the combined graph to disk as GraphML, creating the output folder if needed.
output_dir = "Knowledge-graph"
os.makedirs(output_dir, exist_ok=True)
graph_path = os.path.join(output_dir, "kg_total.graphml")
nx.write_graphml(G, graph_path)

# Size summary of the exported graph.
print(f"Nodes: {len(G.nodes)}, Edges: {len(G.edges)}")

Nodes: 5640, Edges: 12083


In [6]:
def extract_kg(name, input_file, output_dir="Knowledge-graph"):
    """Build a knowledge graph from one chunk file and save it as GraphML.

    Every item of the JSON file is read through its ``doc_id``, ``chunk_id``
    and ``relations`` fields. Each relation is a
    ``(subject, relationship, object)`` triplet whose entities are split with
    ``parse_entity``. Entities become nodes and each triplet becomes one
    directed edge carrying ``relation``, ``doc_id`` and ``chunk_id``.

    Parameters
    ----------
    name : str
        Suffix of the output file, written as ``kg_{name}.graphml``.
    input_file : str
        Path of the JSON chunk file to read.
    output_dir : str, optional
        Folder the GraphML file is written to; created if it does not exist.

    Returns
    -------
    None
        The graph is only written to disk. The node and edge counts are printed.
    """
    graph = nx.MultiDiGraph()
    with open(input_file, 'r', encoding='utf-8') as f:
        print(f"Processing {input_file}...")
        data = json.load(f)

    for item in data:
        doc_id = item.get('doc_id')
        chunk_id = item.get('chunk_id')
        # `or []` also covers an explicit null value for "relations".
        relations = item.get('relations') or []

        for i, (entity1, rel, entity2) in enumerate(relations):
            entity1_label, entity1_type = parse_entity(entity1)
            entity2_label, entity2_type = parse_entity(entity2)

            # parse_entity returns None for an entity without a label. The GraphML
            # writer rejects None attribute values, and add_node(type=None) would
            # overwrite a type recorded earlier for the same entity, so the
            # attribute is only set when a type is actually known.
            for label, etype in ((entity1_label, entity1_type),
                                 (entity2_label, entity2_type)):
                if etype is None:
                    graph.add_node(label)
                else:
                    graph.add_node(label, type=etype)

            # MultiDiGraph allows parallel edges between the same two nodes, so every
            # edge gets an explicit unique key; otherwise Gephi warns that an edge
            # with the same ID already exists.
            edge_key = f"{entity1_label}_{entity2_label}_{doc_id}_{chunk_id}_{i}"

            # Drop None-valued attributes for the same GraphML reason as above.
            edge_attrs = {'relation': rel, 'doc_id': doc_id, 'chunk_id': chunk_id}
            graph.add_edge(
                entity1_label,
                entity2_label,
                key=edge_key,
                **{attr: value for attr, value in edge_attrs.items() if value is not None}
            )

    os.makedirs(output_dir, exist_ok=True)

    graph_path = os.path.join(output_dir, f"kg_{name}.graphml")
    nx.write_graphml(graph, graph_path)

    print(f"Nodes: {len(graph.nodes)}, Edges: {len(graph.edges)}")

In [7]:
# Export one GraphML file per split, in the same order as before.
for split_name in ('train', 'dev', 'test', 'test_ood'):
    extract_kg(split_name, datasets[split_name])

Processing postprocessed-dataset/train_chunks.json...
Nodes: 4164, Edges: 8743
Processing postprocessed-dataset/dev_chunks.json...
Nodes: 646, Edges: 1132
Processing postprocessed-dataset/test_chunks.json...
Nodes: 969, Edges: 1626
Processing postprocessed-dataset/test_ood_chunks.json...
Nodes: 407, Edges: 582


In [42]:
# Collect the IDs of all documents in the train dataset, draw a seeded random
# sample of five, and export a knowledge graph plus the chunk text for each one.
random.seed(42)

# The train split is parsed once and reused for both the ID scan and the graph build.
with open(datasets['train'], 'r', encoding='utf-8') as f:
    data = json.load(f)

documents = {item.get('doc_id') for item in data}

print(f"Unique document IDs in train dataset: {documents}")
print(f"Total unique document IDs in train dataset: {len(documents)}")

# Sorting gives random.sample a sequence in a stable order, so the seeded draw
# does not depend on set iteration order.
ran_docs = random.sample(sorted(documents), 5)
print(f"Random sample of 5 document IDs from train dataset: {ran_docs}")

graphs = {}
contents = {}

for item in data:
    doc_id = item.get('doc_id')
    if doc_id not in ran_docs:
        continue

    # Create the document graph and its content list on first sight of the document.
    if doc_id not in graphs:
        graphs[doc_id] = nx.MultiDiGraph()
        contents[doc_id] = []

    doc_graph = graphs[doc_id]

    chunk_id = item.get('chunk_id')
    # `or []` also covers an explicit null value for "relations".
    relations = item.get('relations') or []
    content = item.get('content', [])

    # Save the chunk content. Null content (or null list entries) is skipped so
    # the later "\n".join does not fail on a None value.
    if isinstance(content, list):
        contents[doc_id].extend(piece for piece in content if piece is not None)
    elif content is not None:
        contents[doc_id].append(content)

    # Add entities and relations to the graph.
    for i, (entity1, rel, entity2) in enumerate(relations):
        entity1_label, entity1_type = parse_entity(entity1)
        entity2_label, entity2_type = parse_entity(entity2)

        # parse_entity returns None for an entity without a label. The GraphML
        # writer rejects None attribute values, and add_node(type=None) would
        # overwrite a type recorded earlier for the same entity, so the
        # attribute is only set when a type is actually known.
        for label, etype in ((entity1_label, entity1_type),
                             (entity2_label, entity2_type)):
            if etype is None:
                doc_graph.add_node(label)
            else:
                doc_graph.add_node(label, type=etype)

        # Explicit unique key per edge, since the multigraph may hold parallel edges.
        edge_key = f"{entity1_label}_{entity2_label}_{doc_id}_{chunk_id}_{i}"

        # Drop None-valued attributes for the same GraphML reason as above.
        edge_attrs = {'relation': rel, 'doc_id': doc_id, 'chunk_id': chunk_id}
        doc_graph.add_edge(
            entity1_label,
            entity2_label,
            key=edge_key,
            **{attr: value for attr, value in edge_attrs.items() if value is not None}
        )

# Save each document's graph and content
for doc_id, doc_graph in graphs.items():
    doc_dir = os.path.join("Knowledge-graph", f"doc_{doc_id}")
    os.makedirs(doc_dir, exist_ok=True)

    # Save KG
    graph_path = os.path.join(doc_dir, f"kg_{doc_id}.graphml")
    nx.write_graphml(doc_graph, graph_path)

    # Save content; str() keeps the join from failing on a non-string piece.
    content_path = os.path.join(doc_dir, f"content_{doc_id}.txt")
    with open(content_path, "w", encoding="utf-8") as cf:
        cf.write("\n".join(str(piece) for piece in contents[doc_id]))

    print(f"Saved graph and content for doc_id={doc_id}: "
          f"Nodes={len(doc_graph.nodes)}, Edges={len(doc_graph.edges)}, "
          f"Chunks={len(contents[doc_id])}")

        


Unique document IDs in train dataset: {'52910494', '198897554', '60440450', '202888986', '4246700', '210713911', '204402755', '202565512', '210839714', '6116678', '3920676', '209386851', '210164920', '204901567', '202719032', '23569888', '21683040', '59599694', '52009210', '24972096', '104291983', '195347056', '198231883', '209862890', '102351044', '28984897', '211010520', '201124533', '208513596', '199543700', '150374036', '209532167', '210861282', '207880647', '52180375', '211010758', '210860962', '35249701', '22825560', '202734316', '53719742', '202888751', '211004033', '208202241', '210860760', '202750230', '202734254', '51559', '201070697', '53731879', '211020570', '199668978', '147703932', '208548469', '201070522', '198147921', '67855714', '202676714', '4539700', '153312532', '146808333', '201646309', '146120936', '7507210', '210164517', '210920315', '51876625', '202540251', '202577400', '210839545', '4319457', '44148233', '56657874', '51923817', '11241677', '211010786', '5444710