Imports

In [13]:
import json
import networkx as nx
from itertools import combinations
import math
import networkx as nx

import random
from collections import Counter
from concurrent.futures import ProcessPoolExecutor, as_completed

Settings

In [38]:
# --- Tunable parameters ---
# Co-occurrence edges whose weight is below this value are removed from the graph.
EDGE_FILTER_THRESHOLD = 3
# Upper bound on how many laws are kept from each side (before / after) as context.
CONTEXT_SIZE = 15

# --- Input files ---
# JSON set from which the co-occurrence graph is built.
DATA_TRAIN_PATH = "lawDB.CH_graph_building_set_EN.json"
# JSON set on which predictions are evaluated.
DATA_EVAL_PATH = "lawDB.CH_eval_set_EN.json"

Datasets Loading

In [51]:
import zipfile

# Unpack every member of the dataset archive into the current working directory.
with zipfile.ZipFile("dataset.zip") as archive:
    archive.extractall()

In [52]:
# Read the set used to build the co-occurrence graph.
with open(DATA_TRAIN_PATH, encoding='utf8') as train_file:
    raw_train = json.load(train_file)
print(f"Co-occurrence graph construction set: {len(raw_train)} instances")

# Read the evaluation set.
with open(DATA_EVAL_PATH, encoding='utf8') as eval_file:
    raw_eval = json.load(eval_file)
print(f"Evaluation set: {len(raw_eval)} instances")

Co-occurrence graph construction set: 87110 instances
Evaluation set: 123 instances


In [53]:
# Display the first graph-building record to inspect its fields.
raw_train[0]
# raw_eval[0]  # swap in to inspect the first evaluation record instead

{'_id': '5Co/157/2016__2017-06-15__ECLI:SK:KSPO:2017:8112219708.2',
 'train_laws': ['40/1964/paragraph-52.section-1',
  '40/1964/paragraph-52.section-2',
  '40/1964/paragraph-52.section-3',
  '40/1964/paragraph-53.section-1',
  '40/1964/paragraph-54',
  '40/1964/paragraph-39',
  '40/1964/paragraph-524.section-1',
  '40/1964/paragraph-524.section-2',
  '40/1964/paragraph-558',
  '40/1964/paragraph-565',
  '40/1964/paragraph-53.section-1',
  '40/1964/paragraph-54',
  '40/1964/paragraph-39',
  '40/1964/paragraph-524.section-1',
  '40/1964/paragraph-524.section-2',
  '40/1964/paragraph-558',
  '40/1964/paragraph-565',
  '40/1964/paragraph-54',
  '40/1964/paragraph-39',
  '40/1964/paragraph-524.section-1',
  '40/1964/paragraph-524.section-2',
  '40/1964/paragraph-558',
  '40/1964/paragraph-565',
  '40/1964/paragraph-39',
  '40/1964/paragraph-524.section-1',
  '40/1964/paragraph-524.section-2',
  '40/1964/paragraph-558',
  '40/1964/paragraph-565',
  '40/1964/paragraph-524.section-1',
  '40/1

Graph Construction

In [47]:
# Build an undirected co-occurrence graph: each node is a law identifier and
# each edge weight counts the decisions in which the two laws appear together.
G = nx.Graph()

# Narrow the slice (e.g. raw_train[:1000]) to build from a subset.
for i, decision in enumerate(raw_train[:]):
    if i % 10000 == 0:
        print(f"processed {i} decisions!")
    # Deduplicate so every pair is counted at most once per decision
    # ("train_laws" can list the same law several times).
    laws = set(decision["train_laws"])

    # Create or reinforce an edge for every unordered pair of co-occurring laws.
    for law1, law2 in combinations(laws, 2):
        if G.has_edge(law1, law2):
            G[law1][law2]['weight'] += 1
        else:
            G.add_edge(law1, law2, weight=1)

print(f"\nNodes: {G.number_of_nodes()}")
print(f"Edges: {G.number_of_edges()}")

total_weight = sum(nx.get_edge_attributes(G, "weight").values())
# An empty graph (no decision with two or more distinct laws) has no edges;
# report 0.0 instead of failing with ZeroDivisionError.
edge_count = G.number_of_edges()
avg_weight = total_weight / edge_count if edge_count else 0.0

print(f"Average edge weight: {avg_weight:.2f}")

# Store the reciprocal of the raw weight so that, when "inv_weight" is used as
# the edge length in shortest-path queries, frequently co-occurring laws are closer.
for u, v, d in G.edges(data=True):
    d['inv_weight'] = 1/d['weight']




processed 0 decisions!
processed 10000 decisions!
processed 20000 decisions!
processed 30000 decisions!
processed 40000 decisions!
processed 50000 decisions!
processed 60000 decisions!
processed 70000 decisions!
processed 80000 decisions!

Nodes: 58301
Edges: 5985508
Average edge weight: 8.93


Filtering

In [48]:
# --- Drop weak co-occurrences: edges with weight < EDGE_FILTER_THRESHOLD ---
# The list is materialised first because the graph must not change while its
# edge view is being iterated.
edges_to_remove = [
    (src, dst)
    for src, dst, weight in G.edges(data='weight')
    if weight < EDGE_FILTER_THRESHOLD
]
G.remove_edges_from(edges_to_remove)

# --- Drop nodes that are left without any edge (degree 0) ---
isolated_nodes = [*nx.isolates(G)]
G.remove_nodes_from(isolated_nodes)

print(f"Nodes after filter: {G.number_of_nodes()}")
print(f"Edges after filter: {G.number_of_edges()}")

Nodes after filter: 20460
Edges after filter: 1453608


Method - Main - Evaluation

In [49]:
def predict_choice_context(laws_before, laws_after, choice_options, G, context_size=None):
    """Choose the candidate that is closest in the graph to the context laws.

    Each candidate is scored with the inverse of its shortest-path distance
    (edge length = ``"inv_weight"``) to the nearest context law; scores are then
    normalised into a confidence per candidate.

    Args:
        laws_before: Sequence of laws preceding the position to predict.
        laws_after: Sequence of laws following the position to predict.
        choice_options: Iterable of candidate laws (must be non-empty).
        G: Graph supporting ``law in G``; when at least one context law is in
            it, it is passed to ``nx.multi_source_dijkstra_path_length`` and
            its edges must carry an ``"inv_weight"`` attribute.
        context_size: If not ``None``, only the first ``context_size`` entries
            of ``laws_before`` and of ``laws_after`` are used.

    Returns:
        ``(predicted, confidences)`` where ``confidences`` maps each candidate
        to its normalised score and ``predicted`` is the candidate with the
        highest confidence (the first candidate on ties).

    Raises:
        ValueError: If ``choice_options`` is empty.
    """
    choice_options = list(choice_options)
    if not choice_options:
        # Previously surfaced as an opaque "max() arg is an empty sequence".
        raise ValueError("choice_options must contain at least one candidate")

    if context_size is not None:
        # NOTE(review): this keeps the FIRST `context_size` laws of each side.
        # If `laws_before` is ordered by position, the laws nearest to the
        # predicted slot are at its end -- confirm this is the intended window.
        laws_before = laws_before[:context_size]
        laws_after = laws_after[:context_size]

    # Candidates must not act as their own context.
    excluded = set(choice_options)
    context_laws = {law for law in laws_before + laws_after if law not in excluded}
    print("context_laws:", list(context_laws)[:5])

    # Only context laws that are nodes of the graph can serve as sources.
    context_nodes = [law for law in context_laws if law in G]

    # Distance from every reachable node to its nearest context node.
    if context_nodes:
        all_distances = nx.multi_source_dijkstra_path_length(G, context_nodes, weight="inv_weight")
    else:
        all_distances = {}

    # Inverse-distance score; candidates that are missing from the result
    # (not in the graph or unreachable) score 0.
    unreachable = float("inf")
    scores = {}
    for option in choice_options:
        distance = all_distances.get(option, unreachable)
        scores[option] = 0.0 if distance == unreachable else 1.0 / (distance + 1e-6)

    # Normalise so confidences sum to 1; if no candidate is reachable every
    # confidence stays 0.0 (and the first candidate is returned below).
    total = sum(scores.values())
    if total > 0:
        confidences = {opt: s / total for opt, s in scores.items()}
    else:
        confidences = {opt: 0.0 for opt in choice_options}

    # max() returns the first key on ties.
    predicted = max(confidences, key=confidences.get)

    print("confidence vector:", confidences)
    return predicted, confidences


# Running tallies for the accuracy computation.
correct = 0
total = 0

records = []  # per-instance stats kept for later analysis

# Narrow the slice (e.g. raw_eval[:10]) to evaluate on a subset.
for idx, entry in enumerate(raw_eval[:], start=1):
    predicted, confidences = predict_choice_context(
        laws_before=entry['laws_before'],
        laws_after=entry['laws_after'],
        choice_options=entry['candidates'],
        G=G,
        context_size=CONTEXT_SIZE,
    )

    true_choice = entry['correct']
    is_correct = predicted == true_choice
    conf_pred = confidences[predicted]  # confidence assigned to the predicted candidate

    # Save stats for this instance.
    records.append({
        "decision_id": entry['decision_id'],
        "predicted": predicted,
        "true": true_choice,
        "confidence": conf_pred,
        "correct": is_correct
    })

    # Update counters and the accuracy over the instances seen so far.
    correct += int(is_correct)
    total += 1
    running_acc = correct / total

    print(f"[{idx}] ID={entry['decision_id']}\n"
          f"Pred={predicted}\nTrue={true_choice}\nConf={conf_pred:.3f}\n"
          f"{'✅' if is_correct else '❌'} | Running Acc={running_acc:.3f}")
    print()

# With an empty evaluation slice `total` is 0; avoid a ZeroDivisionError.
if total:
    print(f"Final Accuracy: {correct / total:.3f}")
else:
    print("Final Accuracy: n/a (no evaluation instances)")

context_laws: ['233/1995/paragraph-44.section-3', '40/1964/paragraph-54.section-1', '40/1964/paragraph-53.section-1', '40/1964/paragraph-52.section-1', '233/1995/paragraph-41.section-2.letter-d']
confidence vector: {'244/2002/paragraph-34': 0.9919127793773755, '7/2005/paragraph-34': 0.008087220622624413}
[1] ID=14CoE/452/2015__2016-02-05__ECLI:SK:KSKE:2016:7814204999.1
Pred=244/2002/paragraph-34
True=244/2002/paragraph-34
Conf=0.992
✅ | Running Acc=1.000

context_laws: ['233/1995/paragraph-44.section-3', '40/1964/paragraph-54.section-1', '40/1964/paragraph-53.section-1', '40/1964/paragraph-52.section-1', '233/1995/paragraph-41.section-2.letter-d']
confidence vector: {'244/2002/paragraph-45': 0.9915348982106844, '7/2005/paragraph-45': 0.008465101789315534}
[2] ID=14CoE/452/2015__2016-02-05__ECLI:SK:KSKE:2016:7814204999.1
Pred=244/2002/paragraph-45
True=244/2002/paragraph-45
Conf=0.992
✅ | Running Acc=1.000

context_laws: ['99/1963/paragraph-42.section-3', '40/1964/paragraph-365.section-