## Dataset Filtering

This notebook is used to apply additional filtering criteria to our ECHR-QA Dataset.
Each ECHR-QA pair must meet all of the following criteria:
- The answer has at least one citation
- All passages of each citation are retrievable (they are saved with the citation)
- The question does not mention a case

Final Result:
- Each ECHR-QA pair entry consists of (question, answer, citations), where the citations are all cases cited in the answer.

General strategy:
- We parse the case law guides on a sentence level and save the citations for each sentence (guide_id, paragraph, sentence, citations)
- We attempt to retrieve the citations for each sentence so we know which sentences are usable
- We attempt to create a QA pair based on the entire case law guides 
- We filter out generated QA pairs that do not meet all of the criteria mentioned above

In [2]:
import pandas as pd

# Sentence-level table of the case-law guides. Per the preview below, each row
# holds guide_id, paragraph, sentence, citations (a JSON-encoded list) and usable.
sentences_df = pd.read_csv('data/sentences_with_citations_usable.csv')
sentences_df

Unnamed: 0,guide_id,paragraph,sentence,citations,usable
0,guide_art_1_eng,1,"As provided by Article 1, the engagement under...",[],True
1,guide_art_1_eng,1,“Jurisdiction” within the meaning of Article 1...,[],True
2,guide_art_1_eng,1,The exercise of jurisdiction is a necessary co...,"[{""case_name"": ""catan and others v. the republ...",True
3,guide_art_1_eng,2,"In the Convention context, the term jurisdicti...",[],True
4,guide_art_1_eng,2,The first corresponds to the Court’s own juris...,[],True
...,...,...,...,...,...
20586,guide_terrorism_eng,130,It transpires from the Court’s case-law that t...,"[{""case_name"": ""lawless v. ireland (no. 3)"", ""...",True
20587,guide_terrorism_eng,130,That danger must be ongoing or imminent.,[],True
20588,guide_terrorism_eng,130,A crisis affecting only one region of a State ...,"[{""case_name"": ""ireland v. the united kingdom""...",False
20589,guide_terrorism_eng,131,The Court had ruled that terrorism in Northern...,[],True


In [3]:
from pydantic import BaseModel


class Citation(BaseModel):
    """A cited case together with the paragraph numbers that are cited from it."""
    # Name of the cited case as stored in the citation data.
    case_name: str
    # Case identifier; it is used as the document id when paragraphs are fetched
    # from HUDOC, and clean_case_id strips anything from the first "(" onwards.
    case_id: str
    # Numbers of the cited paragraphs within the case.
    paragraph_numbers: list[int]
    # Paragraph number -> paragraph text; stays empty until retrieve_citations fills it.
    paragraphs_map: dict[int, str] | None = {}

def clean_case_id(case_id: str):
    """Validate a case id and return it without any parenthesised suffix.

    Raises:
        ValueError: if the id contains "(F)" or (case-insensitively) "unknown".

    Returns:
        The part of ``case_id`` before the first "(", or the id unchanged
        when it contains no "(".
    """
    if "(F)" in case_id:
        raise ValueError(f"Case id contains (F): {case_id}")
    if "unknown" in case_id.lower():
        raise ValueError(f"Case id unknown: {case_id}")

    # partition leaves the whole string in the first slot when there is no "(".
    base_id, _, _ = case_id.partition("(")
    return base_id

def merge_citations(list_of_citations: list[Citation]):
    """Merge citations that refer to the same case into one citation per case.

    Citations are grouped by their cleaned case id (see ``clean_case_id``).
    For each case the paragraph numbers of all its citations are combined,
    de-duplicated and sorted ascending.

    Args:
        list_of_citations: citations to merge; the input objects are not modified.

    Returns:
        A list of new ``Citation`` objects, one per distinct cleaned case id,
        ordered by the first occurrence of each case. When a case occurs more
        than once, the ``case_name`` of its last occurrence is used.

    Raises:
        ValueError: propagated from ``clean_case_id`` for unusable case ids.
    """
    case_names: dict[str, str] = {}
    paragraph_numbers: dict[str, set[int]] = {}

    for citation in list_of_citations:
        # Clean the id once per citation instead of on every dictionary access.
        case_id = clean_case_id(citation.case_id)
        # A dict keeps the position of the first insertion while the value is
        # overwritten, so order follows first occurrence and name follows last.
        case_names[case_id] = citation.case_name
        paragraph_numbers.setdefault(case_id, set()).update(citation.paragraph_numbers)

    # De-duplication happened in the sets; sort once per case at the very end.
    return [
        Citation(
            case_name=case_name,
            case_id=case_id,
            paragraph_numbers=sorted(paragraph_numbers[case_id]),
        )
        for case_id, case_name in case_names.items()
    ]

In [4]:
# test merge citations
import json


# First sample: three citations, each referring to a different case.
list_of_citations_1 = [{"case_name": "ila\u015fcu and others v. moldova and russia", "case_id": "001-61886", "paragraph_numbers": [348, 349, 350, 351, 352]}, {"case_name": "ivan\u0163oc and others v. moldova and russia", "case_id": "001-107480", "paragraph_numbers": [111]}, {"case_name": "Catan and Others v. the Republic of Moldova and Russia", "case_id": "001-114082", "paragraph_numbers": [148]}]
list_of_citations_1 = [Citation(**c) for c in list_of_citations_1]
# Second sample: a single citation for a fourth case.
list_of_citations_2 = [{"case_name": "mamasakhlisi and others v. georgia and russia", "case_id": "001-223361", "paragraph_numbers": [398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410]}]
list_of_citations_2 = [Citation(**c) for c in list_of_citations_2]
# No case id is shared between the samples, so merging should return four entries.
list_of_citations = list_of_citations_1 + list_of_citations_2
merged = merge_citations(list_of_citations)
# Printed for visual inspection; nothing is asserted here.
print(json.dumps([m.model_dump() for m in merged], indent=2))

[
  {
    "case_name": "ila\u015fcu and others v. moldova and russia",
    "case_id": "001-61886",
    "paragraph_numbers": [
      348,
      349,
      350,
      351,
      352
    ],
    "paragraphs_map": {}
  },
  {
    "case_name": "ivan\u0163oc and others v. moldova and russia",
    "case_id": "001-107480",
    "paragraph_numbers": [
      111
    ],
    "paragraphs_map": {}
  },
  {
    "case_name": "Catan and Others v. the Republic of Moldova and Russia",
    "case_id": "001-114082",
    "paragraph_numbers": [
      148
    ],
    "paragraphs_map": {}
  },
  {
    "case_name": "mamasakhlisi and others v. georgia and russia",
    "case_id": "001-223361",
    "paragraph_numbers": [
      398,
      399,
      400,
      401,
      402,
      403,
      404,
      405,
      406,
      407,
      408,
      409,
      410
    ],
    "paragraphs_map": {}
  }
]


In [5]:
import spacy


# Module-level spaCy pipeline, loaded once; get_sentences_spacy uses it for sentence splitting.
nlp = spacy.load("en_core_web_trf")


def get_sentences_spacy(text: str):
    """Split ``text`` into sentences using the module-level spaCy pipeline ``nlp``.

    Returns:
        A list with the text of every sentence span, in document order.
    """
    parsed = nlp(text)
    sentence_texts = []
    for span in parsed.sents:
        sentence_texts.append(span.text)
    return sentence_texts

In [6]:
# Generated QA pairs to filter; later cells read the question, answer, guide,
# paragraphs, citations and prompt_id columns from this frame.
echr_qa_dataset_df = pd.read_csv('data/echr_qa_dataset_v4.csv')

In [7]:
import re

def get_paragraph_number(text):
    """Return the paragraph number that ``text`` starts with, if any.

    A paragraph number is a run of digits at the very beginning of ``text``
    that is immediately followed by a dot (for example "12. The Court ...").

    Returns:
        The number as an ``int``, or ``None`` when ``text`` does not start that way.
    """
    leading_number = re.match(r"^(\d+)\.", text)
    if leading_number is None:
        return None
    return int(leading_number.group(1))

def get_paragraphs(data: list[dict]):
    """Collect the numbered paragraphs from a nested document structure.

    Every node is a dict with a "content" string and an "elements" list of
    child nodes. Nodes are visited depth-first, parents before children.

    Returns:
        A dict mapping paragraph number to the node's full content string.
        When a number occurs more than once, the first node visited wins.
        A paragraph number of 0 is never recorded, because the number is
        tested for truthiness.
    """
    numbered = {}

    def _collect(node):
        number = get_paragraph_number(node["content"])
        if number and number not in numbered:
            numbered[number] = node["content"]
        for child in node["elements"]:
            _collect(child)

    for root in data:
        _collect(root)

    return numbered

import json
import sqlite3


# Load every case from the local ECHR SQLite database and index its numbered
# paragraphs by case id.
db_path = "data/echr_2_0_0.db"
conn = sqlite3.connect(db_path)
try:
    c = conn.cursor()
    cases = c.execute("SELECT * FROM 'case'")
    cases = cases.fetchall()
finally:
    # All rows are in memory after fetchall, so release the database handle.
    conn.close()

cases_map = {} # maps case_id to paragraph_number to paragraph_text

for case in cases:
    # The first column is used as the case id and the last column holds the
    # JSON-encoded document structure that get_paragraphs walks.
    case_id = case[0]
    data = case[-1]
    data = json.loads(data)
    paragraphs = get_paragraphs(data)
    cases_map[case_id] = paragraphs

In [8]:
import requests

from bs4 import BeautifulSoup

def available_paragraphs(text: str):
    """Count the consecutive paragraph markers "\\n1", "\\n2", ... found in ``text``.

    A marker is a newline directly followed by the number. This is a plain
    substring test, so "\\n1" also matches inside "\\n12". Counting stops at
    the first number whose marker is missing.

    Returns:
        The highest ``k`` such that all markers 1..k occur in ``text``
        (0 if "\\n1" does not occur).
    """
    count = 0
    while f"\n{count + 1}" in text:
        count += 1
    return count

def get_paragraphs_for_case_id(case_id: str, timeout: float = 30.0):
    """Download a case from HUDOC and split its text into numbered paragraphs.

    The HTML body is flattened to plain text, and paragraph ``i`` is taken to
    be everything between the markers "\\n{i}" and "\\n{i+1}". The text after
    the number is kept as-is (including the "." that follows the number),
    with whitespace collapsed to single spaces.

    Args:
        case_id: HUDOC document id, e.g. "001-98238".
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        A dict mapping paragraph number to paragraph text. It is empty when no
        numbered paragraph is found, so callers can test the result for
        truthiness to detect a failed retrieval. The end of the last paragraph
        cannot be determined, so it is cut to at most 600 characters.

    Raises:
        requests.RequestException: on connection problems or a timeout.
    """
    max_last_paragraph_chars = 600

    url = f"https://hudoc.echr.coe.int/app/conversion/docx/html/body?library=ECHR&id={case_id}"
    res = requests.get(url, timeout=timeout)
    data = res.text

    soup = BeautifulSoup(data, "html.parser")

    text = soup.get_text(separator="\n")
    n = available_paragraphs(text)
    if n == 0:
        # Nothing that looks like a numbered paragraph (e.g. an error page).
        return {}

    paragraphs = {}
    for i in range(1, n):
        _, _, after = text.partition(f"\n{i}")
        paragraph, _, text = after.partition(f"\n{i+1}")
        # Put the next marker back so the following iteration can find it.
        text = f"\n{i+1}" + text
        paragraphs[i] = re.sub(r'\s+', ' ', paragraph).strip()

    # Handle the last paragraph like the others: drop its marker and normalise
    # whitespace, instead of storing the raw remainder of the document.
    _, _, last_paragraph = text.partition(f"\n{n}")
    paragraphs[n] = re.sub(r'\s+', ' ', last_paragraph[:max_last_paragraph_chars]).strip()
    return paragraphs

# Manual check against the live HUDOC endpoint (needs network access); the
# result is printed for visual inspection only.
print(json.dumps(get_paragraphs_for_case_id("001-98238"), indent=4))

{
    "1": ". The applicants in the above two cases, listed in the appendix, are relatives of the victims of the hostage-taking in the \u201cDubrovka\u201d theatre in October 2002 in Moscow. Some of them were also personally among the hostages. The applicants in the first application are represented before the Court by Ms K. Moskalenko and Ms O. Mikhaylova, lawyers practising in Moscow. The applicants in the second application are represented before the Court by Mr Trunov and Ms Ayvar, lawyers practising in Moscow.",
    "2": ". The respondent Government were represented in both cases by Mr P. Laptev and Ms V. Milinchuk, former Representatives of the Russian Federation at the European Court of Human Rights. A. The circumstances of the case",
    "3": ". The facts of the above two cases are disputed between the parties. Their submissions may be summarised as follows. 1. Hostage-taking",
    "4": ". On the evening of 23 October 2002 a group of terrorists belonging to the Chechen separati

In [9]:
import copy

def retrieve_citations(citations: list[Citation]):
    """Attach the text of every cited paragraph to a copy of ``citations``.

    Paragraphs are looked up in the module-level ``cases_map``. A case that is
    missing there is fetched with ``get_paragraphs_for_case_id`` and the
    result is stored in ``cases_map`` (even when empty) so it is not fetched
    again.

    Args:
        citations: citations to resolve; the input list is deep-copied and
            left untouched.

    Returns:
        A tuple ``(resolved, error)``. ``resolved`` is the copied list with
        ``paragraphs_map`` set on every citation whose case id was usable.
        ``error`` is True when any citation had an unusable case id, its case
        could not be fetched, or one of its paragraphs was not found.
    """
    resolved = copy.deepcopy(citations)
    error = False

    for entry in resolved:
        raw_id = entry.case_id

        # Citations with these ids are flagged and left without a paragraphs_map update.
        unusable_id = (
            "(F)" in raw_id
            or "unknown" in raw_id.lower()
            or "UNIDENTIFIABLE" in raw_id
        )
        if unusable_id:
            error = True
            continue

        case_id = clean_case_id(raw_id)

        retrieved_case = case_id not in cases_map
        if retrieved_case:
            # we attempt to retrieve the paragraphs from the web and update our map
            paragraphs = get_paragraphs_for_case_id(case_id)
            if not paragraphs:
                print(f"Failed retrieving paragraphs: https://hudoc.echr.coe.int/app/conversion/docx/html/body?library=ECHR&id={case_id}")
                error = True

            # we save it anyway as we don't want to keep trying to retrieve it
            cases_map[case_id] = paragraphs
        else:
            paragraphs = cases_map[case_id]

        found = {}
        for paragraph_number in entry.paragraph_numbers:
            if paragraph_number in paragraphs:
                found[paragraph_number] = paragraphs[paragraph_number]
                continue

            # The first missing paragraph ends the lookup for this citation;
            # the paragraphs found before it are still kept.
            print(f'Paragraph {paragraph_number} not found for{" (retrieved)" if retrieved_case else ""}: https://hudoc.echr.coe.int/eng?i={case_id}')
            error = True
            break

        entry.paragraphs_map = found

    return resolved, error

In [137]:
# ! only run if needed !
# Recomputes the "usable" flag: a sentence is usable when retrieve_citations
# reports no error for its citations. Cases missing from cases_map are fetched
# from the web, so this cell can take a long time.

usable = []

for i, row in sentences_df.iterrows():
    citations = [Citation(**c) for c in json.loads(row["citations"])]
    citations, error = retrieve_citations(citations)
    usable.append(not error)

sentences_df["usable"] = usable

sentences_df

Paragraph 45 not found for (retrieved): https://hudoc.echr.coe.int/eng?i=001-141152
Paragraph 22 not found for (retrieved): https://hudoc.echr.coe.int/eng?i=001-186258
Paragraph 25 not found for (retrieved): https://hudoc.echr.coe.int/eng?i=001-208054
Paragraph 84 not found for (retrieved): https://hudoc.echr.coe.int/eng?i=001-49227
Paragraph 133 not found for (retrieved): https://hudoc.echr.coe.int/eng?i=001-168350
Paragraph 243 not found for: https://hudoc.echr.coe.int/eng?i=001-172518
Paragraph 206 not found for: https://hudoc.echr.coe.int/eng?i=001-172518
Paragraph 243 not found for: https://hudoc.echr.coe.int/eng?i=001-172518
Paragraph 38 not found for: https://hudoc.echr.coe.int/eng?i=001-229897
Paragraph 64 not found for (retrieved): https://hudoc.echr.coe.int/eng?i=001-145055
Paragraph 171 not found for (retrieved): https://hudoc.echr.coe.int/eng?i=001-108383
Paragraph 65 not found for: https://hudoc.echr.coe.int/eng?i=001-145055
Paragraph 63 not found for: https://hudoc.echr.c

Unnamed: 0,guide_id,paragraph,sentence,citations,usable
0,guide_art_1_eng,1,"As provided by Article 1, the engagement under...",[],True
1,guide_art_1_eng,1,“Jurisdiction” within the meaning of Article 1...,[],True
2,guide_art_1_eng,1,The exercise of jurisdiction is a necessary co...,"[{""case_name"": ""catan and others v. the republ...",True
3,guide_art_1_eng,2,"In the Convention context, the term jurisdicti...",[],True
4,guide_art_1_eng,2,The first corresponds to the Court’s own juris...,[],True
...,...,...,...,...,...
20586,guide_terrorism_eng,130,It transpires from the Court’s case-law that t...,"[{""case_name"": ""lawless v. ireland (no. 3)"", ""...",True
20587,guide_terrorism_eng,130,That danger must be ongoing or imminent.,[],True
20588,guide_terrorism_eng,130,A crisis affecting only one region of a State ...,"[{""case_name"": ""ireland v. the united kingdom""...",False
20589,guide_terrorism_eng,131,The Court had ruled that terrorism in Northern...,[],True


In [10]:
# Count how many guide sentences are usable (True) and how many are not (False).
print(sentences_df["usable"].value_counts())

usable
True     17657
False     2934
Name: count, dtype: int64


In [11]:
def get_citations(sentences: list[str]):
    """Collect, merge and resolve the citations behind an answer's sentences.

    Every sentence is looked up verbatim in the module-level ``sentences_df``.
    The pair is rejected (``None`` is returned, with a printed reason) when:

    - a sentence is not found in the guides,
    - a sentence is marked as not usable,
    - the sentences carry no citations at all,
    - a merged citation has no paragraph numbers, or
    - ``retrieve_citations`` reports an error.

    Args:
        sentences: the answer split into guide sentences.

    Returns:
        The merged citations with ``paragraphs_map`` filled in, or ``None``
        when the pair is invalid.

    Raises:
        ValueError: propagated from ``merge_citations`` for unusable case ids.
    """
    citations = []
    for sentence in sentences:
        find_sentence_df = sentences_df[sentences_df['sentence'] == sentence]

        if len(find_sentence_df) == 0:
            print("Sentence not found in guide, removing pair...")
            return None

        # If the same sentence text occurs several times, the first match is used.
        sentence_row = find_sentence_df.iloc[0]

        # Every sentence has to be usable, so the flag is checked per sentence
        # inside the loop rather than once for the last sentence afterwards.
        if not sentence_row["usable"]:
            print("Invalid: Sentence not usable")
            return None

        sentence_citations = json.loads(sentence_row["citations"])
        citations.extend(Citation(**c) for c in sentence_citations)

    merged_citations = merge_citations(citations)

    if not merged_citations:
        print("Invalid: No citations found")
        return None

    citations_without_paragraphs = [c for c in merged_citations if len(c.paragraph_numbers) == 0]
    if citations_without_paragraphs:
        print("Invalid: Found citation without paragraph")
        return None

    merged_citations, error = retrieve_citations(merged_citations)
    if error:
        print("Invalid: Error retrieving citations")
        return None

    return merged_citations

In [12]:
# Build the filtered QA dataset. Rows are collected as plain dicts and turned
# into a DataFrame once at the end, instead of appending row by row through
# the private DataFrame._append.
qa_records = []

for i, row in echr_qa_dataset_df.iterrows():
    answer = row['answer']
    if answer[0] == "[":
        # A raw answer is a JSON list of guide sentences.
        sentences = json.loads(answer)
    else:
        # Anything else is treated as already processed and copied over; only
        # its citations are re-serialised with indentation.
        print("Already processed")
        qa_records.append({
            "question": row['question'],
            "answer": row['answer'],
            "guide": row['guide'],
            "paragraphs": row['paragraphs'],
            "citations": json.dumps(json.loads(row['citations']), indent=4),
            "prompt_id": row['prompt_id']
        })
        continue

    # "v." indicates a case name in the question; ".?" a malformed question.
    if "v." in row['question'] or ".?" in row['question']:
        print("Invalid: Question contains v. or is not a question (.?)")
        continue

    try:
        merged_citations = get_citations(sentences)
        if not merged_citations:
            continue

        print("Found valid QA pair")
        qa_records.append({
            "question": row['question'],
            "answer": " ".join(sentences),
            "guide": row['guide'],
            "paragraphs": row['paragraphs'],
            "citations": json.dumps([c.model_dump() for c in merged_citations], indent=4),
            "prompt_id": row['prompt_id']
        })
    except Exception as e:
        # Best effort: a pair that fails for any reason is reported and skipped.
        print(f"Invalid: {e}")

# Passing the columns explicitly keeps their order and gives an empty frame
# the same columns when no pair survives.
df = pd.DataFrame(
    qa_records,
    columns=["question", "answer", "guide", "paragraphs", "citations", "prompt_id"],
)

print(len(df))

Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already processed
Already pr

In [13]:
# Persist the filtered QA pairs; later cells reload and overwrite this same file.
df.to_csv('data/echr_qa_dataset_v4_with_citations.csv', index=False)

In [14]:
# Reload the saved QA pairs from disk, replacing the in-memory frame.
df = pd.read_csv('data/echr_qa_dataset_v4_with_citations.csv')
df

Unnamed: 0,question,answer,guide,paragraphs,citations,prompt_id
0,How does the Court determine whether a surveil...,Having regard to the structure of this provisi...,guide_terrorism_eng,"[2, 3, 4, 5, 8, 12]","[\n {\n ""case_name"": ""murray v. the ...",legal-sentence-level-cot-with-search-v2
1,How does the margin of appreciation apply in t...,The Court went on to analyse the powers in que...,guide_terrorism_eng,"[36, 37, 38, 39, 41, 29]","[\n {\n ""case_name"": ""Mehmet Hasan A...",legal-sentence-level-cot-with-search-v1
2,How does the Court determine the applicability...,Noting that no remedial measures had been take...,guide_terrorism_eng,"[67, 68, 36, 70, 71, 72, 77]","[\n {\n ""case_name"": ""mehmet duman v...",legal-sentence-level-cot-with-search-v2
3,"In a case involving security concerns, where t...",The total or partial exclusion of the public f...,guide_terrorism_eng,"[71, 73, 74, 75, 76, 126]","[\n {\n ""case_name"": ""krestovskiy v....",legal-sentence-level-cot-with-search-v2
4,"Based on the Tysiac case, how does the absence...",The Court has held that a timely procedure sho...,guide_social_rights_eng,"[18, 19, 20, 21, 22, 25]","[\n {\n ""case_name"": ""tysi\u0105c v....",legal-sentence-level-cot-with-search-v2
...,...,...,...,...,...,...
1131,How does the Court ensure that domestic measur...,As regards the implementation of the right to ...,guide_art_3_protocol_4_eng,"[99, 100, 101, 75, 81, 83, 84]","[\n {\n ""case_name"": ""h.f. and other...",legal-sentence-level-cot-with-search-v2
1132,How does the Court ensure that the best intere...,"In other words, in order to assess the existen...",guide_art_3_protocol_4_eng,"[101, 102, 103, 82, 84, 94]","[\n {\n ""case_name"": ""h.f. and other...",legal-sentence-level-cot-with-search-v2
1133,"In the context of repatriation requests, what ...",There was no evidence that the refusals to rep...,guide_art_3_protocol_4_eng,"[103, 104, 105, 106, 82, 83, 84, 21]","[\n {\n ""case_name"": ""h.f. and other...",legal-sentence-level-cot-with-search-v2
1134,Under what circumstances does the extraterrito...,"Hirsi Jamaa and Others v. Italy [GC], 2012, co...",guide_art_4_protocol_4_eng,"[5, 6, 7, 8, 10, 11]","[\n {\n ""case_name"": ""hirsi jamaa an...",legal-sentence-level-cot-with-search-v2


In [15]:
# Remove every row that has a NaN in any column, then report how many rows remain.
df = df.dropna()
print(len(df))

1136


In [16]:
# Drop every QA pair in which at least one citation refers to one of the first
# five paragraphs (1-5) of a case, then overwrite the saved dataset.
drop_indices = []
for i, row in df.iterrows():
    citations = json.loads(row["citations"])
    cites_early_paragraph = any(
        not {1, 2, 3, 4, 5}.isdisjoint(c["paragraph_numbers"]) for c in citations
    )
    if cites_early_paragraph:
        drop_indices.append(i)

df = df.drop(drop_indices)
print(len(df))
df.to_csv('data/echr_qa_dataset_v4_with_citations.csv', index=False)

1130


In [17]:
# let's check for very similar questions
from langchain_openai import OpenAIEmbeddings
# Work on a copy so the embedding column is not added to df itself.
questions_with_embeddings_df = df.copy()

# Calls the OpenAI embeddings API for every question.
# presumably the API key is taken from the environment — TODO confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

question_embeddings = embeddings.embed_documents(questions_with_embeddings_df["question"].tolist())

# Name of the column that holds one embedding vector per question.
OPENAI_EMBEDDINGS = "openai_embeddings"
questions_with_embeddings_df[OPENAI_EMBEDDINGS] = question_embeddings


# NOTE(review): the file name says "v3" while the rest of the notebook works on
# the v4 dataset — confirm this is the intended output path.
questions_with_embeddings_df.to_csv('data/echr_qa_dataset_v3_with_embeddings.csv', index=False)

In [18]:
from scipy.spatial.distance import cosine

def get_top_n_similarities(q_embedding, n: int = 5):
    """Return the ``n`` questions most similar to ``q_embedding``.

    Similarity is ``1 - cosine distance`` between ``q_embedding`` and each
    stored embedding in ``questions_with_embeddings_df``.

    Args:
        q_embedding: embedding vector of the query question.
        n: number of rows to return.

    Returns:
        A copy of the ``n`` best-scoring rows with an extra "similarity"
        column, ordered from most to least similar. The module-level frame is
        not modified.
    """
    def _similarity_to_query(candidate):
        return 1 - cosine(candidate, q_embedding)

    scored = questions_with_embeddings_df.copy()
    scored["similarity"] = scored[OPENAI_EMBEDDINGS].apply(_similarity_to_query)
    return scored.nlargest(n, "similarity")

In [20]:
# Flag near-duplicate questions. For every question the most similar other
# question is looked up; when their similarity exceeds 0.85 the current
# question is marked for removal.
drop_indices = []
# Questions that were the closest match of an earlier question. They are
# skipped entirely, so they are neither printed nor ever marked for removal.
do_not_drop = []
for i, row in questions_with_embeddings_df.iterrows():
    if row["question"] in do_not_drop:
        continue
    print("Q:", row["question"])
    # Take the second-best hit; presumably the best hit (position 0) is the
    # question itself, since it is part of the searched frame — verify.
    top_similar = get_top_n_similarities(row[OPENAI_EMBEDDINGS], n=2)
    top_similar_question = top_similar["question"].values[1]
    top_similar_similarity = top_similar["similarity"].values[1]
    print("MS:", top_similar_question)
    print("Sim:", top_similar_similarity)
    # The match is protected regardless of how similar it actually is.
    do_not_drop.append(top_similar_question)
    if top_similar_similarity > 0.85:
        drop_indices.append(i)
    print("\n\n")

print("Dropping:", len(drop_indices))

Q: How does the Court determine whether a surveillance measure falls within the scope of Article 8 of the ECHR, and what conditions must be met for such interference to be considered "necessary in a democratic society"?
MS: How does the Court determine whether a measure of secret surveillance is strictly necessary for safeguarding democratic institutions and obtaining vital intelligence in an individual operation, and what conditions must be met for an applicant to claim to be the victim of a violation of Article 8 without having to prove that secret surveillance measures had been applied to them?
Sim: 0.8298632217963157



Q: How does the margin of appreciation apply in the context of police powers to stop, search, and question persons suspected of terrorist acts under Schedule 7 of the counter-terrorism legislation?
MS: How does the margin of appreciation apply in cases where the police use force to arrest assembly participants who are not engaged in any acts of violence?
Sim: 0.6890

In [262]:
# Remove the near-duplicate questions collected in drop_indices and overwrite
# the saved dataset. The labels come from questions_with_embeddings_df, which
# was created as a copy of df and therefore shares its index.
# NOTE(review): not idempotent — a second run raises KeyError because the
# labels have already been dropped.
df = df.drop(drop_indices)
print(len(df))
df.to_csv('data/echr_qa_dataset_v4_with_citations.csv', index=False)

939


In [25]:
# cases map to df
# ! I should collect case name and year for each case
# Cases with fewer paragraphs than this are reported as suspiciously short.
# The threshold is named so the printed message cannot drift from the check.
MIN_EXPECTED_PARAGRAPHS = 6

# Flatten cases_map into one record per (case, paragraph).
as_list = []

for case_id, paragraphs in cases_map.items():
    if len(paragraphs) < MIN_EXPECTED_PARAGRAPHS:
        print(f"Case id: {case_id} has less than {MIN_EXPECTED_PARAGRAPHS} paragraphs")
    as_list.extend(
        {'case_id': case_id, 'paragraph_number': paragraph_number, 'paragraph_text': paragraph_text}
        for paragraph_number, paragraph_text in paragraphs.items()
    )

cases_map_df = pd.DataFrame(as_list)
cases_map_df.to_csv('data/cases.csv', index=False)
print(len(cases_map_df))

Case id: 001-83454 has less than 3 paragraphs
Case id: 001-219777 has less than 3 paragraphs
Case id: 001-223291 has less than 3 paragraphs
Case id: 001-222653 has less than 3 paragraphs
Case id: 001-80416 has less than 3 paragraphs
Case id: 001-219043 has less than 3 paragraphs
Case id: 001-214042 has less than 3 paragraphs
Case id: 001-83396 has less than 3 paragraphs
Case id: 001-90278 has less than 3 paragraphs
Case id: 001-216170 has less than 3 paragraphs
Case id: 001-93077 has less than 3 paragraphs
Case id: 001-92349 has less than 3 paragraphs
Case id: 001-223376 has less than 3 paragraphs
Case id: 001-215349 has less than 3 paragraphs
Case id: 001-222655 has less than 3 paragraphs
Case id: 001-216473 has less than 3 paragraphs
Case id: 001-215352 has less than 3 paragraphs
Case id: 001-219560 has less than 3 paragraphs
Case id: 001-88899 has less than 3 paragraphs
Case id: 001-222098 has less than 3 paragraphs
Case id: 001-224983 has less than 3 paragraphs
Case id: 001-220365 

In [36]:
final_df = pd.read_csv('data/echr_qa_dataset_with_citations_final.csv')

# Validation pass: parsing fails loudly if any row's citations are not valid
# JSON or do not fit the Citation model. The parsed objects are not kept.
for i, row in final_df.iterrows():
    citations = [Citation(**c) for c in json.loads(row['citations'])]

print(len(final_df))

# Count the questions that contain "v.", i.e. that appear to name a case.
questions_with_cases = sum("v." in question for question in final_df['question'])

print(questions_with_cases)

1000
305


In [232]:
# Count how many answer sentences (as split by spaCy) do not occur verbatim
# among the guide sentences in sentences_df.
counter = 0
# NOTE(review): unlike every other path in this notebook, this one has no
# 'data/' prefix — confirm the file location.
df = pd.read_csv('echr_qa_dataset_final.csv')
print(len(df))

# Build the lookup once; comparing against the whole sentence column for
# every single sentence is needlessly slow.
known_sentences = set(sentences_df['sentence'])

for i, row in df.iterrows():
    answer = row['answer']
    sentences = get_sentences_spacy(answer)
    counter += sum(sentence not in known_sentences for sentence in sentences)

print(counter)

1040
170
