# DB transformation

In [None]:
import json
import sqlite3
import re

def get_paragraph_number(text):
    """Return the integer that opens *text* when it starts with ``<digits>.``.

    The digits must sit at the very beginning of the string and be followed
    directly by a dot; otherwise ``None`` is returned.
    """
    leading = re.match(r"^(\d+)\.", text)
    if leading is None:
        return None
    return int(leading.group(1))

def get_paragraphs(data: list[dict]) -> dict[int, str]:
    """Collect the numbered paragraphs found anywhere in a case's element tree.

    Every node is read as a mapping with a ``"content"`` string and an
    ``"elements"`` iterable of child nodes.  Nodes are visited depth-first,
    parent before children.  A node whose content starts with ``"<number>."``
    contributes its whole content string under that number.

    Args:
        data: The top-level nodes of one case document.

    Returns:
        A dict mapping paragraph number to the node's content (the number
        prefix is kept).  When a number occurs more than once, the first node
        visited wins.  Number 0 is never recorded.
    """
    paragraphs: dict[int, str] = {}

    def collect(node: dict) -> None:
        number = get_paragraph_number(node["content"])
        # Truthiness check skips both "no number" (None) and the number 0.
        if number and number not in paragraphs:
            paragraphs[number] = node["content"]
        # The walk only fills `paragraphs`; there is no result to propagate
        # upwards, so every child is always visited.
        for child in node["elements"]:
            collect(child)

    for root in data:
        collect(root)

    return paragraphs

# Open the SQLite database that holds the cases.
# NOTE: the connection and cursor stay open after this cell.
db_path = "data/echr_2_0_0.db"
conn = sqlite3.connect(db_path)
c = conn.cursor()

# Load every row of the `case` table into memory in one go.
cases = c.execute("SELECT * FROM 'case'").fetchall()

# case_id -> {paragraph_number -> paragraph_text}
cases_map = {}

for case in cases:
    # First column is used as the case id; the last column is parsed as JSON.
    case_id = case[0]
    data = json.loads(case[-1])
    paragraphs = get_paragraphs(data)
    cases_map[case_id] = paragraphs

print(len(cases_map))

Now we collect all case ids that are cited in the ECHR QA dataset.


In [None]:
import pandas as pd
# Load the ECHR QA dataset; the bare `df` expression below shows it as the
# cell output when run in a notebook.
df = pd.read_csv("data/echr_qa_dataset.csv")
df

In [None]:
# Distinct case ids referenced by any citation in the QA dataset.
echr_qa_case_ids = set()

# Iterate the column directly: `iterrows()` builds a Series for every row and
# the row index it yields was never used.
# Each cell is parsed as JSON and iterated; every item must carry a "case_id".
for raw_citations in df["citations"]:
    for citation in json.loads(raw_citations):
        echr_qa_case_ids.add(citation["case_id"])

print(len(echr_qa_case_ids))

Now we retrieve the paragraphs of all cited cases that are not in the ECHR open data database, by fetching their text from HUDOC.

In [None]:
import requests

from bs4 import BeautifulSoup

def available_paragraphs(text: str):
    """Count the consecutive paragraph markers 1, 2, 3, ... present in *text*.

    A marker for number ``i`` is a newline immediately followed by ``i`` and
    then anything that is not another digit (or the end of the text).
    Counting stops at the first number that has no marker.

    Args:
        text: Plain text in which paragraph numbers start a new line.

    Returns:
        The largest ``n`` such that markers for every number ``1..n`` occur
        somewhere in *text*; ``0`` when there is no marker for ``1``.
    """
    i = 1
    # The negative lookahead keeps "\n1" from matching "\n10" or "\n1999":
    # a plain substring test would accept those as a marker for 1.
    while re.search(rf"\n{i}(?!\d)", text):
        i += 1
    return i - 1

def get_paragraphs_for_case_id(case_id: str, timeout: float = 30.0):
    """Download a case from HUDOC and split its text into numbered paragraphs.

    The HTML body is flattened to plain text with one newline between text
    nodes.  ``available_paragraphs`` then reports how many consecutive markers
    (a newline followed by 1, 2, 3, ...) exist, and the text is cut at them.

    Args:
        case_id: HUDOC item id, inserted verbatim into the request URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict keyed by paragraph number.  For numbers ``1..n-1`` the value is
        the text between that marker and the next one, with whitespace runs
        collapsed to single spaces.  The number itself is not part of it.
        For the last number ``n`` the end is unknown, so the value is the
        first 600 characters starting at the marker, left un-normalised.
        When no marker is found, the first 600 characters of the page are
        stored under key ``0``.

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
    """
    url = f"https://hudoc.echr.coe.int/app/conversion/docx/html/body?library=ECHR&id={case_id}"
    # Without a timeout a stalled connection would block forever.
    res = requests.get(url, timeout=timeout)
    # Fail loudly rather than parse an error page as if it were a judgment.
    res.raise_for_status()
    data = res.text

    soup = BeautifulSoup(data, "html.parser")

    text = soup.get_text(separator="\n")
    n = available_paragraphs(text)

    paragraphs = {}
    for i in range(1, n):
        # Drop everything up to marker i, then take the text up to marker i+1.
        _, _, after = text.partition(f"\n{i}")
        paragraph, _, text = after.partition(f"\n{i+1}")
        # Put marker i+1 back so the next iteration can find it again.
        text = f"\n{i+1}" + text
        paragraphs[i] = re.sub(r'\s+', ' ', paragraph).strip()
    if n >= 1:
        # For n >= 2 the loop already left `text` starting at marker n, so
        # this is a no-op.  For n == 1 the loop never ran and `text` is still
        # the whole page, so skip ahead to marker 1 first.
        _, marker, rest = text.partition(f"\n{n}")
        text = marker + rest
    paragraphs[n] = text[0:600]
    return paragraphs

# Sanity check: fetch one hard-coded case id and pretty-print the parsed paragraphs.
print(json.dumps(get_paragraphs_for_case_id("001-98238"), indent=4))

In [None]:
# Fill in every cited case that has no entry in `cases_map` yet.
for case_id in echr_qa_case_ids:
    if case_id in cases_map:
        continue  # already present, nothing to fetch

    paragraphs = get_paragraphs_for_case_id(case_id)
    print(f"Retrieved paragraphs for case_id {case_id}: {len(paragraphs)}")
    cases_map[case_id] = paragraphs

We now build a DataFrame that has all paragraphs, with the columns (case_id, case_name, paragraph_number, paragraph_text). For that we first need the name of every case.

In [None]:
# case_id -> case_name, seeded from the rows loaded from the database
# (first column is the id, second column the name).
case_name_map = {}

for case in cases:
    case_id, case_name = case[0], case[1]
    case_name_map[case_id] = case_name

print(len(cases_map))

In [None]:
def get_case_name(case_id: str, timeout: float = 30.0):
    """Look up the document name of a case through the HUDOC query endpoint.

    Args:
        case_id: HUDOC item id, inserted verbatim into the ``itemid`` query.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``docname`` column of the first result in the JSON response.

    Raises:
        requests.RequestException: On timeout, connection failure or an HTTP
            error status.
        IndexError: If the response contains no results for *case_id*.
    """
    url = f'https://hudoc.echr.coe.int/app/query/results?query=((itemid%3A\"{case_id}\"))&select=sharepointid,rank,echrranking,languagenumber,itemid,docname,doctype,application,appno,conclusion,importance,originatingbody,typedescription,kpdate,kpdateastext,documentcollectionid,documentcollectionid2,languageisocode,extractedappno,isplaceholder,doctypebranch,respondent,advopidentifier,advopstatus,ecli,appnoparts,sclappnos,ECHRConcepts&sort=&start=0&length=20&rankingModelId=11111111-0000-0000-0000-000000000000'
    # Without a timeout a stalled connection would block forever.
    res = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of failing later while decoding JSON.
    res.raise_for_status()
    data = res.json()
    results = data["results"]
    if not results:
        # Same exception type as indexing the empty list, but it says which id failed.
        raise IndexError(f"HUDOC returned no results for case_id {case_id!r}")
    return results[0]["columns"]["docname"]


# Look up a name for every cited case that `case_name_map` does not cover yet.
for case_id in echr_qa_case_ids:
    if case_id in case_name_map:
        continue  # name already known

    case_name = get_case_name(case_id)
    print(f"Retrieved case name for case_id {case_id}: {case_name}")
    case_name_map[case_id] = case_name

Now we create our CSV dataset.

In [None]:
df = pd.DataFrame()

# Parallel column buffers: position k in each list describes the same paragraph.
all_case_ids = []
all_case_names = []
all_paragraph_numbers = []
all_paragraph_texts = []

# One output row per (case, paragraph) pair.
for case_id, case in cases_map.items():
    case_name = case_name_map[case_id]

    for paragraph_number, paragraph_text in case.items():
        all_case_ids.append(case_id)
        all_case_names.append(case_name)
        all_paragraph_numbers.append(paragraph_number)
        all_paragraph_texts.append(paragraph_text)

# The four lengths are printed one per line, in column order.
for column in (all_case_ids, all_case_names, all_paragraph_numbers, all_paragraph_texts):
    print(len(column))

In [None]:
# Attach the buffers as columns, in the order they should appear in the CSV.
columns = {
    "case_id": all_case_ids,
    "case_name": all_case_names,
    "paragraph_number": all_paragraph_numbers,
    "paragraph_text": all_paragraph_texts,
}
for column_name, values in columns.items():
    df[column_name] = values

# Write the table without the DataFrame index column.
df.to_csv("data/echr_case_paragraphs.csv", index=False)