In [7]:
import json
import pandas as pd

# Identifier of the case-law guide processed by this notebook.
GUIDE_ID = "guide_art_6_civil_eng"

# Read the guide paragraphs and keep only the rows whose guide_id equals
# GUIDE_ID; the filter is applied directly on the freshly read frame.
df = (
    pd.read_csv('data/echr_case_law_guides_with_possible_eng_citations.csv')
    .loc[lambda frame: frame['guide_id'] == GUIDE_ID]
)
df

Unnamed: 0,paragraph,guide_id,paragraph_id,possible_citations,possible_eng_citations
719,The concept of “civil rights and obligations” ...,guide_art_6_civil_eng,1,"{""001-216400"": ""CASE OF GRZ\u0118DA v. POLAND""...","{""001-216400"": ""CASE OF GRZ\u0118DA v. POLAND""..."
720,"However, the principle that the autonomous con...",guide_art_6_civil_eng,2,"{""001-216400"": ""CASE OF GRZ\u0118DA v. POLAND""...","{""001-216400"": ""CASE OF GRZ\u0118DA v. POLAND""..."
721,"The judgment in Grzęda v. Poland [GC], 2022, r...",guide_art_6_civil_eng,3,"{""001-216400"": ""CASE OF GRZ\u0118DA v. POLAND""...","{""001-216400"": ""CASE OF GRZ\u0118DA v. POLAND""..."
722,"The two aspects, civil and criminal, of Articl...",guide_art_6_civil_eng,4,"{""001-177070"": ""CASE OF K\u00c1ROLY NAGY v. HU...","{""001-177070"": ""CASE OF K\u00c1ROLY NAGY v. HU..."
723,The word “dispute” must be given a substantive...,guide_art_6_civil_eng,5,"{""001-177070"": ""CASE OF K\u00c1ROLY NAGY v. HU...","{""001-177070"": ""CASE OF K\u00c1ROLY NAGY v. HU..."
...,...,...,...,...,...
822,While Article 6 § 1 offers a procedural safegu...,guide_art_6_civil_eng,104,"{""001-186438"": ""CASE OF KO\u017dEMIAKINA v. LI...","{""001-186438"": ""CASE OF KO\u017dEMIAKINA v. LI..."
823,The Court has emphasised the special nature of...,guide_art_6_civil_eng,105,"{""001-186438"": ""CASE OF KO\u017dEMIAKINA v. LI...","{""001-186438"": ""CASE OF KO\u017dEMIAKINA v. LI..."
824,"In López Ribalda and Others v. Spain [GC], 201...",guide_art_6_civil_eng,106,"{""001-186438"": ""CASE OF KO\u017dEMIAKINA v. LI...","{""001-186438"": ""CASE OF KO\u017dEMIAKINA v. LI..."
825,The Court has dealt with cases in which the ap...,guide_art_6_civil_eng,107,"{""001-186438"": ""CASE OF KO\u017dEMIAKINA v. LI...","{""001-186438"": ""CASE OF KO\u017dEMIAKINA v. LI..."


In [8]:
# Union of the JSON mappings stored in the "possible_eng_citations" column;
# when a key repeats, the mapping parsed later overwrites the earlier value.
possible_citations = {}
for raw_mapping in df["possible_eng_citations"]:
    possible_citations.update(json.loads(raw_mapping))

# Keep only the entries whose value contains "v.".
possible_citations = {
    key: value
    for key, value in possible_citations.items()
    if "v." in value
}

print(json.dumps(possible_citations, indent=4))

ValueError: too many values to unpack (expected 2)

In [6]:
from citation_extraction.src.citation_extractor import CitationNotIdentifiableException, Sentence, extract_citations
# Carry-over state for extract_citations: every call receives the two values
# returned by the previous call, starting from empty strings.
last_cited_case = ""
last_cited_case_id = ""

# Manual mappings are passed unchanged to every extract_citations call.
# The encoding is explicit so the JSON file is decoded identically on every
# platform instead of with the locale's default encoding.
with open("data/manual_mappings.json", "r", encoding="utf-8") as f:
    manual_mappings = json.load(f)

# Parallel lists: position k in each list describes the same extracted sentence.
sentences = []
citations = []
guide_ids = []
paragraphs = []

sentences_with_citations: list[Sentence]

for i, row in df.iterrows():
    possible_citations = json.loads(row['possible_eng_citations'])
    sentences_with_citations, last_cited_case, last_cited_case_id = extract_citations(
        row['paragraph'],
        possible_citations,
        last_cited_case,
        last_cited_case_id,
        manual_mappings,
    )
    # One entry per returned sentence in each of the four lists, so they
    # always stay the same length.
    for s in sentences_with_citations:
        sentences.append(s.sentence)
        # Citations are stored as a JSON string so they survive the CSV round trip.
        citations.append(json.dumps([c.model_dump() for c in s.citations], indent=4))
        guide_ids.append(row['guide_id'])
        paragraphs.append(row['paragraph_id'])


In [3]:
# Print the length of each of the four lists that become the columns of
# sentences_df (pandas requires them to have the same length).
for column_values in (sentences, citations, guide_ids, paragraphs):
    print(len(column_values))

sentences_df = pd.DataFrame(
    {
        'sentence': sentences,
        'citations': citations,
        'guide_id': guide_ids,
        'paragraph_id': paragraphs,
    }
)
sentences_df.to_csv('data/extracted_citations.csv', index=False)

120
120
120
120


In [4]:
from citation_extraction.src.citation_extractor import Citation

def clean_case_id(case_id: str):
    """Return ``case_id`` truncated just before its first "(".

    An id without "(" is returned unchanged.

    Raises:
        ValueError: if ``case_id`` contains the substring "(F)".
    """
    if "(F)" in case_id:
        raise ValueError("Case id contains (F)")
    # partition() yields the whole string as the head when "(" is absent.
    head, _, _ = case_id.partition("(")
    return head

def merge_citations(list_of_citations: list[Citation]):
    """Merge citations that share the same cleaned case id.

    Citations are grouped by ``clean_case_id(c.case_id)``. Each group yields
    one new ``Citation`` whose ``case_name`` and ``best_match`` are taken from
    the last citation of the group and whose ``paragraph_numbers`` are the
    de-duplicated paragraph numbers of the whole group, in first-seen order
    (so the result is stable from run to run). Groups are returned in the
    order in which their case id first appears in ``list_of_citations``.

    Args:
        list_of_citations: citations to merge; may be empty.

    Returns:
        A list with one merged ``Citation`` per distinct cleaned case id.

    Raises:
        ValueError: propagated from ``clean_case_id`` when a case id
            contains "(F)".
    """
    latest_by_case = {}
    # case id -> dict used as an insertion-ordered set of paragraph numbers
    paragraphs_by_case = {}
    for c in list_of_citations:
        case_id = clean_case_id(c.case_id)
        # Re-assigning keeps the key's original position but the newest value.
        latest_by_case[case_id] = c
        paragraphs_by_case.setdefault(case_id, {}).update(dict.fromkeys(c.paragraph_numbers))

    return [
        Citation(
            case_name=c.case_name,
            best_match=c.best_match,
            case_id=case_id,
            paragraph_numbers=list(paragraphs_by_case[case_id]),
        )
        for case_id, c in latest_by_case.items()
    ]

In [5]:
import spacy


# Load the "en_core_web_trf" spaCy pipeline once at notebook level;
# get_sentences_spacy reads this `nlp` object on every call.
nlp = spacy.load("en_core_web_trf")


def get_sentences_spacy(text: str):
    """Run ``text`` through the notebook-level ``nlp`` pipeline and return
    the text of each sentence it yields, in document order."""
    sentence_texts = []
    for span in nlp(text).sents:
        sentence_texts.append(span.text)
    return sentence_texts

In [6]:
# Read the QA dataset and keep only the rows whose 'guide' column equals GUIDE_ID.
echr_qa_dataset_df = (
    pd.read_csv('data/echr_qa_dataset.csv')
    .loc[lambda frame: frame['guide'] == GUIDE_ID]
)

# Number of QA pairs available for this guide.
print(len(echr_qa_dataset_df))

37


In [7]:
# Parallel result lists: position k in each list describes the same kept QA pair.
all_citations = []
all_questions = []
all_answers = []
all_guides = []
all_paragraphs = []

for i, row in echr_qa_dataset_df.iterrows():
    # Deliberately NOT named `sentences` / `citations`: those notebook-level
    # lists are the columns `sentences_df` is built from, and rebinding them
    # here would corrupt any later re-run of the cell that builds it.
    answer_sentences = get_sentences_spacy(row['answer'])
    answer_citations = []
    try:
        for sentence in answer_sentences:
            find_sentence_df = sentences_df[sentences_df['sentence'] == sentence]

            if find_sentence_df.empty:
                # A sentence containing "v." that has no exact match in
                # sentences_df aborts this QA pair.
                if "v." in sentence:
                    raise ValueError(f"Sentence not found: {sentence}")
                continue

            # When the sentence occurs more than once, the first matching row is used.
            sentence_citations = json.loads(find_sentence_df['citations'].iloc[0])
            answer_citations.extend(Citation(**c) for c in sentence_citations)

        merged_citations = merge_citations(answer_citations)
        # Keep the QA pair only if every merged citation has at least one
        # paragraph number; all() is True for an empty list, so pairs without
        # any citation are kept as well.
        if all(len(c.paragraph_numbers) > 0 for c in merged_citations):
            all_citations.append(merged_citations)
            all_questions.append(row['question'])
            all_answers.append(row['answer'])
            all_guides.append(row['guide'])
            all_paragraphs.append(row['paragraphs'])
    except Exception as e:
        # Best effort: a QA pair that cannot be processed is reported and
        # skipped; the remaining pairs are still handled.
        print(e)



Case id contains (F)
Case id contains (F)
Case id contains (F)
Case id contains (F)
Sentence not found: In the Court’s view the work required did not go beyond what is “ordinary” in this context since it was calculated to assist him in reintegrating himself into society and had as its legal basis provisions which find an equivalent in certain other member States of the Council of Europe (ibid.; Stummer v. Austria [GC], 2011, § 121; De Wilde, Ooms and Versyp v. Belgium, 1971, § 90). In a case where the applicant complained about the obligation on prisoners to perform work in prison after they had reached retirement age, the Court, having regard to the aim of the work imposed, its nature, its extent and the manner in which it was to be performed as well as noting the absence of consensus among the Council of Europe member States on the issue
Case id contains (F)
Case id contains (F)
Case id contains (F)
Case id contains (F)
Sentence not found: In other words, the Court is not concerned w

In [10]:
# Number of entries in all_citations, i.e. one per QA pair kept by the matching loop.
print(len(all_citations))

14


In [9]:
# Serialise each QA pair's merged citations to a JSON string for the CSV.
# The result gets its own name: rebinding `all_citations` would replace the
# Citation objects with strings and make a second run of this cell fail on
# `c.model_dump()`.
all_citations_json = [
    json.dumps([c.model_dump() for c in cs], indent=4)
    for cs in all_citations
]

qa_pairs_with_citations_df = pd.DataFrame(
    {
        'question': all_questions,
        'answer': all_answers,
        'guide': all_guides,
        'paragraphs': all_paragraphs,
        'citations': all_citations_json,
    }
)
qa_pairs_with_citations_df.to_csv('data/echr_qa_dataset_with_citations.csv', index=False)