# DB transformation

In [1]:
import json
import sqlite3
import re

def get_paragraph_number(text):
  pattern = r"^(\d+)\."
  match = re.match(pattern, text)
  return int(match.group(1)) if match else None

def get_paragraphs(data: list[dict]):
    paragraphs = {}
    def get_paragraphs_rec(data):
        paragraph_number = get_paragraph_number(data["content"])
        if paragraph_number and paragraph_number not in paragraphs:
            paragraph = data["content"]
            paragraphs[paragraph_number] = paragraph
        for e in data["elements"]:
            res = get_paragraphs_rec(e)
            if res:
                return res
        return None
    for d in data:
        get_paragraphs_rec(d)

    return paragraphs

db_path = "data/echr_2_0_0.db"
conn = sqlite3.connect(db_path)
c = conn.cursor()

cases = c.execute("SELECT * FROM 'case'")
cases = cases.fetchall()

cases_map = {} # maps case_id to paragraph_number to paragraph_text

for case in cases:
    case_id = case[0]
    data = case[-1]
    data = json.loads(data)
    paragraphs = get_paragraphs(data)
    cases_map[case_id] = paragraphs

Now we collect all case ids from the echr qa dataset


In [2]:
import pandas as pd
df = pd.read_csv("data/echr_qa_dataset.csv")
df

Unnamed: 0,question,answer,guide,paragraphs,citations,prompt_id,answer_no_citations
0,How does the Court determine whether a surveil...,Having regard to the structure of this provisi...,guide_terrorism_eng,"[2, 3, 4, 5, 8, 12]","[\n {\n ""case_name"": ""murray v. the ...",legal-sentence-level-cot-with-search-v2,Having regard to the structure of this provisi...
1,How does the margin of appreciation apply in t...,The Court went on to analyse the powers in que...,guide_terrorism_eng,"[36, 37, 38, 39, 41, 29]","[\n {\n ""case_name"": ""Mehmet Hasan A...",legal-sentence-level-cot-with-search-v1,The Court went on to analyse the powers in que...
2,How does the Court determine the applicability...,Noting that no remedial measures had been take...,guide_terrorism_eng,"[67, 68, 36, 70, 71, 72, 77]","[\n {\n ""case_name"": ""mehmet duman v...",legal-sentence-level-cot-with-search-v2,Noting that no remedial measures had been take...
3,"In a case involving security concerns, where t...",The total or partial exclusion of the public f...,guide_terrorism_eng,"[71, 73, 74, 75, 76, 126]","[\n {\n ""case_name"": ""krestovskiy v....",legal-sentence-level-cot-with-search-v2,The total or partial exclusion of the public f...
4,"Based on the Tysiac case, how does the absence...",The Court has held that a timely procedure sho...,guide_social_rights_eng,"[18, 19, 20, 21, 22, 25]","[\n {\n ""case_name"": ""tysi\u0105c v....",legal-sentence-level-cot-with-search-v2,The Court has held that a timely procedure sho...
...,...,...,...,...,...,...,...
1111,How does the Court ensure that domestic measur...,As regards the implementation of the right to ...,guide_art_3_protocol_4_eng,"[99, 100, 101, 75, 81, 83, 84]","[\n {\n ""case_name"": ""h.f. and other...",legal-sentence-level-cot-with-search-v2,As regards the implementation of the right to ...
1112,How does the Court ensure that the best intere...,"In other words, in order to assess the existen...",guide_art_3_protocol_4_eng,"[101, 102, 103, 82, 84, 94]","[\n {\n ""case_name"": ""h.f. and other...",legal-sentence-level-cot-with-search-v2,"In other words, in order to assess the existen..."
1113,"In the context of repatriation requests, what ...",There was no evidence that the refusals to rep...,guide_art_3_protocol_4_eng,"[103, 104, 105, 106, 82, 83, 84, 21]","[\n {\n ""case_name"": ""h.f. and other...",legal-sentence-level-cot-with-search-v2,There was no evidence that the refusals to rep...
1114,Under what circumstances does the extraterrito...,"Hirsi Jamaa and Others v. Italy [GC], 2012, co...",guide_art_4_protocol_4_eng,"[5, 6, 7, 8, 10, 11]","[\n {\n ""case_name"": ""hirsi jamaa an...",legal-sentence-level-cot-with-search-v2,"Hirsi Jamaa and Others v. Italy [GC], 2012, co..."


In [3]:
echr_qa_case_ids = set()

for i, row in df.iterrows():
    citations = json.loads(row["citations"])
    for citation in citations:
        echr_qa_case_ids.add(citation["case_id"])

print(len(echr_qa_case_ids))

1416


Now we retrieve all case ids that are not in echr open data

In [5]:
import requests

from bs4 import BeautifulSoup

def available_paragraphs(text: str):
    i = 1
    while f"\n{i}" in text:
        i += 1
    return i - 1

def get_paragraphs_for_case_id(case_id: str):
    url = f"https://hudoc.echr.coe.int/app/conversion/docx/html/body?library=ECHR&id={case_id}"
    res = requests.get(url)
    data = res.text

    soup = BeautifulSoup(data, "html.parser")

    text = soup.get_text(separator="\n")
    n = available_paragraphs(text)

    paragraphs = {}
    for i in range(1, n):
        _, _, after = text.partition(f"\n{i}")
        paragraph, _, text = after.partition(f"\n{i+1}")
        text = f"\n{i+1}" + text
        paragraphs[i] = re.sub(r'\s+', ' ', paragraph).strip()
    paragraphs[n] = text[0:600]
    return paragraphs

print(json.dumps(get_paragraphs_for_case_id("001-98238"), indent=4))

{
    "1": ". The applicants in the above two cases, listed in the appendix, are relatives of the victims of the hostage-taking in the \u201cDubrovka\u201d theatre in October 2002 in Moscow. Some of them were also personally among the hostages. The applicants in the first application are represented before the Court by Ms K. Moskalenko and Ms O. Mikhaylova, lawyers practising in Moscow. The applicants in the second application are represented before the Court by Mr Trunov and Ms Ayvar, lawyers practising in Moscow.",
    "2": ". The respondent Government were represented in both cases by Mr P. Laptev and Ms V. Milinchuk, former Representatives of the Russian Federation at the European Court of Human Rights. A. The circumstances of the case",
    "3": ". The facts of the above two cases are disputed between the parties. Their submissions may be summarised as follows. 1. Hostage-taking",
    "4": ". On the evening of 23 October 2002 a group of terrorists belonging to the Chechen separati

In [6]:
for case_id in echr_qa_case_ids:
    if case_id not in cases_map:
        paragraphs = get_paragraphs_for_case_id(case_id)
        print(f"Retrieved paragraphs for case_id {case_id}: {len(paragraphs)}")
        cases_map[case_id] = paragraphs

Retrieved paragraphs for case_id 001-75934: 99
Retrieved paragraphs for case_id 001-76586: 156
Retrieved paragraphs for case_id 001-194885: 53
Retrieved paragraphs for case_id 001-207928: 109
Retrieved paragraphs for case_id 001-114514: 175
Retrieved paragraphs for case_id 001-193543: 218
Retrieved paragraphs for case_id 001-217061: 142
Retrieved paragraphs for case_id 001-217565: 47
Retrieved paragraphs for case_id 001-196897: 113
Retrieved paragraphs for case_id 001-221542: 57
Retrieved paragraphs for case_id 001-216937: 113
Retrieved paragraphs for case_id 001-209750: 57
Retrieved paragraphs for case_id 001-212689: 45
Retrieved paragraphs for case_id 001-57660: 25
Retrieved paragraphs for case_id 001-97689: 193
Retrieved paragraphs for case_id 001-97900: 67
Retrieved paragraphs for case_id 001-228353: 75
Retrieved paragraphs for case_id 001-116416: 98
Retrieved paragraphs for case_id 001-217436: 65
Retrieved paragraphs for case_id 001-191117: 154
Retrieved paragraphs for case_id 001

We now create a df that has all paragraphs: (case_id, case_name, paragraph_number, paragraph)

In [7]:
case_name_map = {}

for case in cases:
    case_id = case[0]
    case_name = case[1]
    case_name_map[case_id] = case_name

print(len(cases_map))   

16258


In [12]:
def get_case_name(case_id: str):
    url = f'https://hudoc.echr.coe.int/app/query/results?query=((itemid%3A\"{case_id}\"))&select=sharepointid,rank,echrranking,languagenumber,itemid,docname,doctype,application,appno,conclusion,importance,originatingbody,typedescription,kpdate,kpdateastext,documentcollectionid,documentcollectionid2,languageisocode,extractedappno,isplaceholder,doctypebranch,respondent,advopidentifier,advopstatus,ecli,appnoparts,sclappnos,ECHRConcepts&sort=&start=0&length=20&rankingModelId=11111111-0000-0000-0000-000000000000'
    res = requests.get(url)
    data = res.json()
    return data["results"][0]["columns"]["docname"]


for case_id in echr_qa_case_ids:
    if case_id not in case_name_map:
        case_name = get_case_name(case_id)
        print(f"Retrieved case name for case_id {case_id}: {case_name}")
        case_name_map[case_id] = case_name

Retrieved case name for case_id 001-75934: CASE OF ZARB ADAMI v. MALTA
Retrieved case name for case_id 001-76586: WEBER AND SARAVIA v. GERMANY
Retrieved case name for case_id 001-194885: LARRAÑAGA ARANDO AND OTHERS v. SPAIN
Retrieved case name for case_id 001-207928: CASE OF BUDINOVA AND CHAPRAZOV v. BULGARIA
Retrieved case name for case_id 001-114514: CASE OF I.G. AND OTHERS v. SLOVAKIA
Retrieved case name for case_id 001-193543: CASE OF ILGAR MAMMADOV v. AZERBAIJAN
Retrieved case name for case_id 001-217061: CASE OF KHASANOV AND RAKHMANOV v. RUSSIA
Retrieved case name for case_id 001-217565: TERHEŞ v. ROMANIA
Retrieved case name for case_id 001-196897: CASE OF J.D. AND A v. THE UNITED KINGDOM
Retrieved case name for case_id 001-221542: CASE OF PERADZE AND OTHERS v. GEORGIA
Retrieved case name for case_id 001-216937: CASE OF BUMBEȘ v. ROMANIA
Retrieved case name for case_id 001-209750: PARFITT v. THE UNITED KINGDOM
Retrieved case name for case_id 001-212689: CASE OF KINDLHOFER v. AUST

Now we create our csv dataset

In [14]:
df = pd.DataFrame()

all_case_ids = []
all_case_names = []
all_paragraph_numbers = []
all_paragraph_texts = []

for case_id in cases_map.keys():
    case_name = case_name_map[case_id]
    case = cases_map[case_id]

    for paragraph_number in case.keys():
        all_case_ids.append(case_id)
        all_case_names.append(case_name)
        all_paragraph_numbers.append(paragraph_number)
        all_paragraph_texts.append(case[paragraph_number])

print(len(all_case_ids))
print(len(all_case_names))
print(len(all_paragraph_numbers))
print(len(all_paragraph_texts))

1024702
1024702
1024702
1024702


In [None]:
df["case_id"] = all_case_ids
df["case_name"] = all_case_names
df["paragraph_number"] = all_paragraph_numbers
df["paragraph_text"] = all_paragraph_texts

df.to_csv("data/echr_case_paragraphs.csv", index=False)