## Dataset Filtering

This notebook is used to apply additional filtering criteria to our ECHR-QA Dataset.
Each ECHR-QA pair must satisfy the following criteria:
- The answer has at least one citation
- All cited passages are retrievable (and are saved with the citation)
- The question does not mention a case

Final Result:
- Each ECHR-QA pair entry consists of (question, answer, citations), where the citations are all cases cited in the answer.

General strategy:
- We parse the case law guides on a sentence level and save the citations for each sentence (guide_id, paragraph, sentence, citations)
- We attempt to retrieve the citations for each sentence so we know which sentences are usable
- We attempt to create QA pairs based on the entire case law guides
- We filter out generated QA pairs that do not meet all of the criteria mentioned above

In [None]:
import pandas as pd

# Sentence-level citation table; later cells read its 'sentence', 'citations'
# and 'usable' columns.
sentences_df = pd.read_csv('data/sentences_with_citations_usable.csv')
# Bare expression: displays the frame when run as a notebook cell.
sentences_df

In [None]:
from pydantic import BaseModel


class Citation(BaseModel):
    """A cited case together with the paragraph numbers cited from it."""
    # Name of the cited case.
    case_name: str
    # Case identifier, e.g. "001-61886"; may carry a parenthesised suffix that
    # clean_case_id strips.
    case_id: str
    # Numbers of the cited paragraphs.
    paragraph_numbers: list[int]
    # Paragraph number -> paragraph text; filled in by retrieve_citations.
    paragraphs_map: dict[int, str] | None = {}

def clean_case_id(case_id: str):
    """Validate a raw case id and drop any parenthesised suffix.

    Returns the part of ``case_id`` before the first "(" (the whole string
    when there is no "(").

    Raises:
        ValueError: if the id contains "(F)" or, case-insensitively, "unknown".
    """
    if "(F)" in case_id:
        raise ValueError(f"Case id contains (F): {case_id}")
    if "unknown" in case_id.lower():
        raise ValueError(f"Case id unknown: {case_id}")
    # partition leaves ids without "(" untouched, so no separate check is needed.
    base_id, _, _ = case_id.partition("(")
    return base_id

def merge_citations(list_of_citations: list[Citation]):
    """Merge citations that point at the same (cleaned) case id.

    Returns one new Citation per distinct cleaned case id, in order of first
    appearance. Its paragraph numbers are the sorted, de-duplicated union of
    all paragraph numbers cited for that case; its name is taken from the last
    citation of that case in the input.

    Raises:
        ValueError: propagated from clean_case_id for unusable case ids.
    """
    names_by_id = {}
    numbers_by_id = {}
    for citation in list_of_citations:
        key = clean_case_id(citation.case_id)
        # Re-assigning keeps the key's original position but lets the latest name win.
        names_by_id[key] = citation.case_name
        numbers_by_id.setdefault(key, set()).update(citation.paragraph_numbers)

    return [
        Citation(case_name=name, case_id=key, paragraph_numbers=sorted(numbers_by_id[key]))
        for key, name in names_by_id.items()
    ]

In [None]:
# test merge citations
import json


# Citations of a first sentence: three different cases.
list_of_citations_1 = [
    Citation(**c)
    for c in [
        {
            "case_name": "ila\u015fcu and others v. moldova and russia",
            "case_id": "001-61886",
            "paragraph_numbers": [348, 349, 350, 351, 352],
        },
        {
            "case_name": "ivan\u0163oc and others v. moldova and russia",
            "case_id": "001-107480",
            "paragraph_numbers": [111],
        },
        {
            "case_name": "Catan and Others v. the Republic of Moldova and Russia",
            "case_id": "001-114082",
            "paragraph_numbers": [148],
        },
    ]
]
# Citations of a second sentence: one case, paragraphs 398 to 410.
list_of_citations_2 = [
    Citation(**c)
    for c in [
        {
            "case_name": "mamasakhlisi and others v. georgia and russia",
            "case_id": "001-223361",
            "paragraph_numbers": list(range(398, 411)),
        },
    ]
]
list_of_citations = [*list_of_citations_1, *list_of_citations_2]
merged = merge_citations(list_of_citations)
print(json.dumps([m.model_dump() for m in merged], indent=2))

In [None]:
import spacy


# Load the spaCy pipeline "en_core_web_trf" once; get_sentences_spacy uses it
# for sentence splitting.
nlp = spacy.load("en_core_web_trf")


def get_sentences_spacy(text: str):
    """Split ``text`` into sentences with the module-level spaCy pipeline ``nlp``.

    Returns the text of each sentence, in document order.
    """
    return [span.text for span in nlp(text).sents]

In [None]:
# Generated QA pairs to be filtered; later cells read the 'question', 'answer',
# 'guide', 'paragraphs', 'citations' and 'prompt_id' columns.
echr_qa_dataset_df = pd.read_csv('data/echr_qa_dataset_v4.csv')

In [None]:
import re

def get_paragraph_number(text):
    """Return the leading paragraph number of ``text``, or None.

    A paragraph number is a run of digits at the very start of the string that
    is immediately followed by a dot (e.g. "12. The applicant ..." -> 12).
    """
    if (found := re.match(r"^(\d+)\.", text)) is None:
        return None
    return int(found.group(1))

def get_paragraphs(data: list[dict]):
    """Collect the numbered paragraphs of a case document.

    Args:
        data: top-level nodes of the document tree. Every node is a dict with
            a "content" string and an "elements" list of child nodes.

    Returns:
        A dict mapping paragraph number to the node's full "content" string.
        The tree is walked depth-first in document order and the first
        occurrence of a number wins. Nodes whose content has no leading
        "<number>." (or whose number is 0) are not recorded.
    """
    paragraphs = {}

    def collect(node):
        # The previous version threaded a return value through the recursion,
        # but it was always None; the walk is purely for its side effect.
        number = get_paragraph_number(node["content"])
        if number and number not in paragraphs:
            paragraphs[number] = node["content"]
        for child in node["elements"]:
            collect(child)

    for root in data:
        collect(root)

    return paragraphs

import json
import sqlite3


# Load every row of the 'case' table from the local SQLite database and index
# the numbered paragraphs of each case.
db_path = "data/echr_2_0_0.db"
conn = sqlite3.connect(db_path)
# NOTE(review): the connection is never closed in the visible code.
c = conn.cursor()

cases = c.execute("SELECT * FROM 'case'")
cases = cases.fetchall()

cases_map = {} # maps case_id to paragraph_number to paragraph_text

for case in cases:
    # SELECT * rows are positional: the first column is used as the case id
    # and the last column as the JSON document (presumably matches the table
    # schema; verify against the database).
    case_id = case[0]
    data = case[-1]
    data = json.loads(data)
    paragraphs = get_paragraphs(data)
    # print(json.dumps(paragraphs, indent=4))
    cases_map[case_id] = paragraphs

In [None]:
import requests

from bs4 import BeautifulSoup

def available_paragraphs(text: str):
    """Return the largest n such that each of "\\n1", "\\n2", ..., "\\nn" occurs in ``text``.

    This is a plain substring test, so "\\n1" is also satisfied by "\\n10".
    Returns 0 when "\\n1" does not occur at all.
    """
    count = 0
    while f"\n{count + 1}" in text:
        count += 1
    return count

def get_paragraphs_for_case_id(case_id: str):
    """Download a case from HUDOC and split its text into numbered paragraphs.

    The HTML body is flattened to text and cut at the markers "\\n1", "\\n2",
    ... found by available_paragraphs.

    Args:
        case_id: HUDOC item id, e.g. "001-98238".

    Returns:
        A dict mapping paragraph number to whitespace-normalised paragraph
        text. The dict is empty when the request is not successful or no
        paragraph marker is found, so callers can detect the failure with
        ``if not paragraphs``.

    Raises:
        requests.RequestException: on connection errors or timeout.
    """
    url = f"https://hudoc.echr.coe.int/app/conversion/docx/html/body?library=ECHR&id={case_id}"
    # Without a timeout a stalled connection would block the notebook forever.
    res = requests.get(url, timeout=30)
    if not res.ok:
        return {}

    soup = BeautifulSoup(res.text, "html.parser")

    text = soup.get_text(separator="\n")
    n = available_paragraphs(text)
    if n == 0:
        # Previously this fell through and returned {0: text[:600]}, which is
        # truthy and therefore hid the failure from the caller.
        return {}

    paragraphs = {}
    for i in range(1, n):
        _, _, after = text.partition(f"\n{i}")
        paragraph, _, text = after.partition(f"\n{i+1}")
        # Put the marker back so the next iteration can find it again.
        text = f"\n{i+1}" + text
        paragraphs[i] = re.sub(r'\s+', ' ', paragraph).strip()

    # The end of the last paragraph cannot be detected, so it is capped at 600
    # characters. It is cut after its marker and normalised like the others.
    _, _, last = text.partition(f"\n{n}")
    paragraphs[n] = re.sub(r'\s+', ' ', last[:600]).strip()
    return paragraphs

# Smoke test on a single case id; this performs a live HTTP request.
print(json.dumps(get_paragraphs_for_case_id("001-98238"), indent=4))

In [None]:
import copy

def retrieve_citations(citations: list[Citation]):
    """Attach paragraph texts to a deep copy of ``citations``.

    Paragraphs are looked up in the module-level ``cases_map``. A case that is
    missing there is fetched with get_paragraphs_for_case_id and the result is
    stored in ``cases_map`` (even when empty, so it is not fetched again).

    Returns:
        (citations, error): the copied citations with ``paragraphs_map``
        filled in, and a flag that is True when any case id was unusable, any
        case could not be retrieved, or any cited paragraph was missing. For a
        citation with a missing paragraph, ``paragraphs_map`` only holds the
        paragraphs found before the first miss.
    """
    citations = copy.deepcopy(citations)
    error = False

    for citation in citations:
        raw_id = citation.case_id
        unusable_id = (
            "(F)" in raw_id
            or "unknown" in raw_id.lower()
            or "UNIDENTIFIABLE" in raw_id
        )
        if unusable_id:
            error = True
            continue

        case_id = clean_case_id(raw_id)

        fetched_now = case_id not in cases_map
        if fetched_now:
            # Not in the local database: try the web and remember the outcome.
            paragraphs = get_paragraphs_for_case_id(case_id)
            if not paragraphs:
                print(f"Failed retrieving paragraphs: https://hudoc.echr.coe.int/app/conversion/docx/html/body?library=ECHR&id={case_id}")
                error = True
            # Stored even on failure so the download is not attempted again.
            cases_map[case_id] = paragraphs
        else:
            paragraphs = cases_map[case_id]

        found = {}
        for number in citation.paragraph_numbers:
            if number not in paragraphs:
                print(f'Paragraph {number} not found for{" (retrieved)" if fetched_now else ""}: https://hudoc.echr.coe.int/eng?i={case_id}')
                error = True
                break
            found[number] = paragraphs[number]

        citation.paragraphs_map = found

    return citations, error

In [None]:
# ! only run if needed !

# One flag per sentence: True when retrieve_citations reported no error for
# the sentence's citations.
usable = []

for i, row in sentences_df.iterrows():
    citations = [Citation(**c) for c in json.loads(row["citations"])]
    citations, error = retrieve_citations(citations)
    usable.append(not error)

sentences_df["usable"] = usable

sentences_df

In [None]:
# Count the True and False values in the 'usable' column.
print(sentences_df["usable"].value_counts())

In [None]:
def get_citations(sentences: list[str]):
    """Return the merged, retrieved citations for an answer, or None if invalid.

    Every sentence is looked up by exact text in the module-level
    ``sentences_df``. The answer is rejected (None is returned and a reason is
    printed) when:
      - a sentence is not found in ``sentences_df``,
      - a sentence is flagged as not usable,
      - the sentences carry no citations at all,
      - a merged citation has no paragraph numbers, or
      - retrieve_citations reports an error.

    Args:
        sentences: the sentences that make up the answer.

    Returns:
        The merged citations with their ``paragraphs_map`` filled in, or None.

    Raises:
        ValueError: propagated from merge_citations for unusable case ids.
    """
    citations = []
    for sentence in sentences:
        find_sentence_df = sentences_df[sentences_df['sentence'] == sentence]

        if len(find_sentence_df) == 0:
            print("Sentence not found in guide, removing pair...")
            return None

        sentence_row = find_sentence_df.iloc[0]

        # This check used to sit after the loop, so it only looked at the last
        # sentence and raised NameError for an empty sentence list.
        if not sentence_row["usable"]:
            print("Invalid: Sentence not usable")
            return None

        citations.extend(Citation(**c) for c in json.loads(sentence_row["citations"]))

    merged_citations = merge_citations(citations)

    if not merged_citations:
        print("Invalid: No citations found")
        return None

    if any(len(c.paragraph_numbers) == 0 for c in merged_citations):
        print("Invalid: Found citation without paragraph")
        return None

    merged_citations, error = retrieve_citations(merged_citations)
    if error:
        print("Invalid: Error retrieving citations")
        return None

    return merged_citations

In [None]:
# Build the filtered dataset. Rows are collected in a list and turned into a
# DataFrame once at the end, instead of growing the frame row by row through
# the private DataFrame._append.
rows = []

for i, row in echr_qa_dataset_df.iterrows():
    answer = row['answer']
    if not answer.startswith("["):
        # The answer is already plain text rather than a JSON list of
        # sentences: keep the row, only re-serialising its citations.
        print("Already processed")
        rows.append({
            "question": row['question'],
            "answer": row['answer'],
            "guide": row['guide'],
            "paragraphs": row['paragraphs'],
            "citations": json.dumps(json.loads(row['citations']), indent=4),
            "prompt_id": row['prompt_id']
        })
        continue

    sentences = json.loads(answer)

    # "v." indicates a case name in the question; ".?" indicates a statement
    # that was merely turned into a question.
    if "v." in row['question'] or ".?" in row['question']:
        print("Invalid: Question contains v. or is not a question (.?)")
        continue

    try:
        merged_citations = get_citations(sentences)
        if not merged_citations:
            continue

        print("Found valid QA pair")
        rows.append({
            "question": row['question'],
            "answer": " ".join(sentences),
            "guide": row['guide'],
            "paragraphs": row['paragraphs'],
            "citations": json.dumps([c.model_dump() for c in merged_citations], indent=4),
            "prompt_id": row['prompt_id']
        })
    except Exception as e:
        # Deliberately broad: any failure just disqualifies this QA pair.
        print(f"Invalid: {e}")

df = pd.DataFrame(rows, columns=["question", "answer", "guide", "paragraphs", "citations", "prompt_id"])

print(len(df))

In [None]:
# Save the filtered QA pairs (without the index column).
df.to_csv('data/echr_qa_dataset_v4_with_citations.csv', index=False)

In [None]:
# Reload the saved file and check that every 'citations' cell is valid JSON
# (json.loads raises otherwise).
df = pd.read_csv('data/echr_qa_dataset_v4_with_citations.csv')
for i, row in df.iterrows():
    json.loads(row['citations'])


# Inspect the question at index label 5 (the label listed in MANUAL_DROP in
# the next cell).
print(df.at[5, 'question'])

In [None]:
# Index labels picked for manual removal.
# NOTE(review): nothing in the visible code reads MANUAL_DROP, so these rows
# are not actually dropped; confirm whether a `df.drop(MANUAL_DROP)` step is
# missing.
MANUAL_DROP = [5]

In [None]:
# Remove every row that has a NaN in any column, then report how many remain.
df = df.dropna()
print(len(df))

In [None]:
# Drop every QA pair in which at least one citation refers to a paragraph
# numbered 1 to 5, then overwrite the saved dataset.
drop_indices = []
for i, row in df.iterrows():
    citations = json.loads(row["citations"])
    for c in citations:
        if not {1, 2, 3, 4, 5}.isdisjoint(c["paragraph_numbers"]):
            drop_indices.append(i)
            break

df = df.drop(drop_indices)
print(len(df))
df.to_csv('data/echr_qa_dataset_v4_with_citations.csv', index=False)

In [None]:
# let's check for very similar questions
from langchain_openai import OpenAIEmbeddings
# Work on a copy so the embedding column is not added to df itself.
questions_with_embeddings_df = df.copy()

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# One embedding per question, in row order (calls the external embeddings API).
question_embeddings = embeddings.embed_documents(questions_with_embeddings_df["question"].tolist())

# Name of the column that holds the embedding vectors.
OPENAI_EMBEDDINGS = "openai_embeddings"
questions_with_embeddings_df[OPENAI_EMBEDDINGS] = question_embeddings


# NOTE(review): the file name says "v3" while every other dataset file in this
# notebook is "v4"; confirm this is intended.
questions_with_embeddings_df.to_csv('data/echr_qa_dataset_v3_with_embeddings.csv', index=False)

In [None]:
from scipy.spatial.distance import cosine

def get_top_n_similarities(q_embedding, n: int = 5):
    """Return the ``n`` questions whose embeddings are most similar to ``q_embedding``.

    Similarity is cosine similarity, computed as 1 minus scipy's cosine
    distance. The result is a copy of the matching rows of
    ``questions_with_embeddings_df`` with an added "similarity" column, most
    similar first.
    """
    def similarity_to_query(vector):
        return 1 - cosine(vector, q_embedding)

    scored = questions_with_embeddings_df.copy()
    scored["similarity"] = scored[OPENAI_EMBEDDINGS].map(similarity_to_query)
    return scored.nlargest(n, "similarity")

In [None]:
# Find near-duplicate questions: for each question, look up its closest other
# question and mark the current row for dropping when they are too similar.
drop_indices = []
# Questions already seen as someone's nearest neighbour; they are skipped when
# the loop reaches them, so they are never added to drop_indices.
do_not_drop = []
for i, row in questions_with_embeddings_df.iterrows():
    if row["question"] in do_not_drop:
        continue
    print("Q:", row["question"])
    top_similar = get_top_n_similarities(row[OPENAI_EMBEDDINGS], n=2)
    # Position 1 is the second-best match; this presumably relies on position 0
    # being the question itself (similarity 1). Needs at least two rows.
    top_similar_question = top_similar["question"].values[1]
    top_similar_similarity = top_similar["similarity"].values[1]
    print("MS:", top_similar_question)
    print("Sim:", top_similar_similarity)
    # NOTE(review): the neighbour is protected regardless of the similarity
    # value, not only when it is above the threshold; confirm this is intended.
    do_not_drop.append(top_similar_question)
    # Cosine similarity above 0.85 counts as a near-duplicate.
    if top_similar_similarity > 0.85:
        drop_indices.append(i)
    print("\n\n")

print("Dropping:", len(drop_indices))

In [None]:
# Remove the rows collected in drop_indices (index labels shared with
# questions_with_embeddings_df, which was copied from df) and overwrite the
# saved dataset.
df = df.drop(drop_indices)
print(len(df))
df.to_csv('data/echr_qa_dataset_v4_with_citations.csv', index=False)

In [None]:
# Flatten cases_map into one row per (case, paragraph) and save it as CSV.
# ! I should collect case name and year for each case

# Cases with fewer paragraphs than this are reported as suspicious. The check
# has always used 6; the message used to say 3, so it is now derived from the
# constant. NOTE(review): confirm 6 is the intended threshold.
MIN_PARAGRAPHS = 6

as_list = []

for case_id, paragraphs in cases_map.items():
    if len(paragraphs) < MIN_PARAGRAPHS:
        print(f"Case id: {case_id} has fewer than {MIN_PARAGRAPHS} paragraphs")
    for paragraph_number, paragraph_text in paragraphs.items():
        as_list.append({'case_id': case_id, 'paragraph_number': paragraph_number, 'paragraph_text': paragraph_text})

cases_map_df = pd.DataFrame(as_list)
cases_map_df.to_csv('data/cases.csv', index=False)
print(len(cases_map_df))

In [None]:
# Sanity checks on the final dataset.
final_df = pd.read_csv('data/echr_qa_dataset_with_citations_final.csv')

# Every 'citations' cell must parse as JSON and validate as Citation objects;
# a failure raises here.
for i, row in final_df.iterrows():
    citations = [Citation(**c) for c in json.loads(row['citations'])]

print(len(final_df))

# Count the questions that still contain "v." (i.e. mention a case).
questions_with_cases = 0

for i, row in final_df.iterrows():
    question = row['question']
    questions_with_cases += "v." in question

print(questions_with_cases)

In [None]:
# Count the answer sentences (as split by spaCy) that have no exact match in
# sentences_df.
counter = 0
df = pd.read_csv('echr_qa_dataset_final.csv')
print(len(df))
for i, row in df.iterrows():
    answer = row['answer']
    sentences = get_sentences_spacy(answer)
    for sentence in sentences:
        find_sentence_df = sentences_df[sentences_df['sentence'] == sentence]
        if find_sentence_df.empty:
            counter += 1

print(counter)