In [None]:
import json
import requests
from string import Template

# In-memory cache of HUDOC metadata dicts keyed by case (item) id, so that
# repeated lookups of the same case do not hit the network again.
cases_cache = {}

def get_metadata_for_case_id(case_id: str):
    """Return the HUDOC metadata columns for a single case.

    The result is cached in ``cases_cache``; both a cache hit and a fresh
    fetch return the same kind of object (the full ``columns`` dict of the
    first search result), so callers can always index it by field name
    (``"docname"``, ``"languageisocode"``, ``"appno"``, ...).

    Args:
        case_id: HUDOC item id, e.g. ``"001-222891"``.

    Returns:
        The ``columns`` dict of the first result returned by HUDOC.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error.
        IndexError: if HUDOC returns no result for ``case_id``.
    """
    if case_id in cases_cache:
        return cases_cache[case_id]

    url_template = Template('https://hudoc.echr.coe.int/app/query/results?query=((itemid%3A\"$case_id\"))&select=sharepointid,rank,echrranking,languagenumber,itemid,docname,doctype,application,appno,conclusion,importance,originatingbody,typedescription,kpdate,kpdateastext,documentcollectionid,documentcollectionid2,languageisocode,extractedappno,isplaceholder,doctypebranch,respondent,advopidentifier,advopstatus,ecli,appnoparts,sclappnos,ECHRConcepts&sort=&start=0&length=20&rankingModelId=11111111-0000-0000-0000-000000000000')
    url = url_template.substitute(case_id=case_id)
    # A timeout keeps one stalled request from hanging a long scraping run.
    res = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    res.raise_for_status()
    data = res.json()
    columns = data["results"][0]["columns"]
    # Only successful lookups are cached, so transient failures are retried.
    cases_cache[case_id] = columns
    return columns


# Smoke test: look up one known case id on HUDOC and pretty-print the result.
metadata = get_metadata_for_case_id("001-222891")
print(json.dumps(metadata, indent=2))

In [None]:
def find_eng_version(metadata):
    """Find the English-language document for the case described by ``metadata``.

    The first application number in ``metadata["appno"]`` is used to query
    HUDOC for all documents sharing that application number. The first result
    that is in English and whose item id starts with ``001-`` is taken as the
    English version, and its document name is fetched via
    ``get_metadata_for_case_id``.

    Args:
        metadata: HUDOC metadata dict of the source document. Reads
            ``"appno"`` (``;``-separated entries of the form
            ``number/year``) and ``"docname"``.

    Returns:
        ``(itemid, docname)`` of the English version, or ``("NaN", "NaN")``
        when no English version is found or resolving it fails.

    Raises:
        requests.RequestException: if the HUDOC search request itself fails.
    """
    not_found = ("NaN", "NaN")

    # Only the first application number is used for the search.
    first_app_number = (metadata.get("appno") or "").split(";")[0]
    parts = first_app_number.split("/")
    if len(parts) < 2:
        # No "number/year" application number to search with.
        return not_found
    fst = parts[0]
    snd = parts[1]

    url = f"https://hudoc.echr.coe.int/app/query/results?query=(contentsitename=ECHR)%20AND%20((appno.keyword%3A%22{fst}%2F{snd}%22)%20OR%20((advopidentifier.keyword%3A%22{fst}%2F{snd}%22)%20AND%20doctype%3AADV*)%20OR%20((extractedappno.keyword%3A%22{fst}%2F{snd}%22)%20AND%20doctype%3APR))&select=itemid,appno,extractedappno,documentcollectionid,kpdate,languageisocode,isplaceholder,advopidentifier&sort=&start=0&length=500"
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    data = res.json()
    # Match the "001-" id prefix rather than a substring: the query can also
    # return other document types, and an id such as "003-7001-123" contains
    # "001-" without being a "001-" document.
    english_items = [
        item for item in data["results"]
        if item["columns"].get("languageisocode") == "ENG"
        and str(item["columns"].get("itemid", "")).startswith("001-")
    ]

    print("Data:\n", json.dumps(english_items, indent=2))
    print("App nr", fst, snd)
    if not english_items:
        return not_found

    try:
        eng_id = english_items[0]["columns"]["itemid"]
        new_metadata = get_metadata_for_case_id(eng_id)
        doc_name = new_metadata["docname"]
        print(f"English version found: {doc_name}")
        print(f"French version: {metadata['docname']}")
        return eng_id, doc_name
    except Exception as exc:
        # Best effort: a failed lookup is reported as "not found". Catching
        # Exception (not a bare except) keeps Ctrl-C able to stop a long run.
        print(f"Could not resolve English version for {fst}/{snd}: {exc!r}")
        return not_found

In [None]:
import json
import pandas as pd

from utils.fetch_pdf import fetch_pdf_content
import PyPDF2

def get_associated_cases_df(guide_url: str):
    """Extract, page by page, the cases hyperlinked from a guide PDF.

    The PDF is fetched with ``fetch_pdf_content`` and every link annotation
    (``/Annots`` entry carrying an ``/A`` action with a ``/URI``) is read. The
    text after the last ``=`` of the link target is taken as the case id and
    its document name is looked up with ``get_metadata_for_case_id``.

    Cases are recorded under the id found in the link; no French-to-English
    mapping is attempted here.

    Args:
        guide_url: URL of the guide PDF.

    Returns:
        A DataFrame with one row per PDF page and the columns
        ``page_number`` (0-based), ``citations`` (JSON object mapping
        case id -> document name) and ``page_text`` (extracted page text).
    """
    key = '/Annots'
    uri = '/URI'
    ank = '/A'
    columns = ["page_number", "citations", "page_text"]

    content = fetch_pdf_content(guide_url)
    reader = PyPDF2.PdfReader(content)

    # Collect plain records and build the frame once, instead of growing a
    # DataFrame row by row through the private ``DataFrame._append``.
    rows = []
    for i, page in enumerate(reader.pages):
        page_object = page.get_object()
        page_text = page.extract_text()
        page_citations = {}

        annotations = page_object[key] if key in page_object else []
        for annotation in annotations:
            try:
                annotation_object = annotation.get_object()
                # Annotations without a URI action are not external links.
                if ank not in annotation_object:
                    continue
                action = annotation_object[ank]
                if uri not in action.keys():
                    continue

                url = action[uri]
                case_id = url.split('=')[-1]
                metadata = get_metadata_for_case_id(case_id)
                docname = metadata["docname"]
                page_citations[case_id] = docname
                print(f"Page {i}: {docname}")
                print()
            except Exception as exc:
                # Best effort: one unresolvable link must not abort the whole
                # guide, but the failure is reported instead of hidden.
                print(f"Page {i}: skipping annotation ({exc!r})")

        rows.append({"page_number": i, "citations": json.dumps(page_citations), "page_text": page_text})

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Extract the per-page citations of the `guide_art_1_eng` guide PDF and display them.
associated_cases_df = get_associated_cases_df("https://ks.echr.coe.int/documents/d/echr-ks/guide_art_1_eng")
associated_cases_df

In [None]:
import re
import pandas as pd

def simplify_text_for_entailment(text: str):
    """Reduce ``text`` to a lowercase string made of word characters only.

    Whitespace runs are collapsed, the text is lowercased and every non-word
    character (whitespace, punctuation, symbols) is stripped. Two renderings
    of the same passage that differ only in spacing, line breaks, case or
    punctuation therefore compare equal. Underscores count as word
    characters and are kept.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    return re.sub(r'\W', '', single_spaced.lower())

def get_possible_citations_for_paragraphs_df(paragraphs_df: pd.DataFrame, associated_cases_df: pd.DataFrame):
    """Attach to every paragraph the citations of the PDF pages it appears on.

    A paragraph is considered to appear on a page when the normalised form
    (see ``simplify_text_for_entailment``) of its first 200 characters or of
    its last 100 characters is contained in the normalised page text. The
    citations of all matching pages are merged.

    Args:
        paragraphs_df: frame with a ``paragraph`` text column; not modified.
        associated_cases_df: frame with ``page_text`` and ``citations``
            (JSON object string) columns, one row per page.

    Returns:
        A copy of ``paragraphs_df`` with an added ``possible_citations``
        column holding a JSON object (case id -> document name) per row.
    """
    pdf = paragraphs_df.copy()

    # Normalise each page and parse its citations once, not once per paragraph.
    pages = [
        (
            simplify_text_for_entailment(page_text) if isinstance(page_text, str) else "",
            json.loads(citations_json),
        )
        for page_text, citations_json in zip(
            associated_cases_df["page_text"], associated_cases_df["citations"]
        )
    ]

    possible_citations_column = []
    for par in pdf["paragraph"]:
        # Missing paragraphs (None / NaN) are treated as empty text.
        text = par if isinstance(par, str) else ""
        # An empty needle is a substring of every page and would attach the
        # citations of the whole guide, so empty needles are dropped.
        needles = [
            needle
            for needle in (
                simplify_text_for_entailment(text[0:200]),
                simplify_text_for_entailment(text[-100:]),
            )
            if needle
        ]

        possible_citations = {}
        for cleaned_page_text, page_citations in pages:
            if any(needle in cleaned_page_text for needle in needles):
                possible_citations.update(page_citations)

        possible_citations_column.append(json.dumps(possible_citations))

    # Assign the column in one go (positional), which also guarantees the
    # column exists when there are no paragraphs.
    pdf["possible_citations"] = possible_citations_column
    return pdf

In [None]:
# Load the guide paragraphs and keep only those of the `guide_art_1_eng` guide.
paragraphs_df = pd.read_csv("data/echr_case_law_guides.csv")
paragraphs_df = paragraphs_df[paragraphs_df["guide_id"] == "guide_art_1_eng"]

# Attach to each paragraph the citations of the PDF pages whose text contains it.
combined_df = get_possible_citations_for_paragraphs_df(paragraphs_df, associated_cases_df)
combined_df

In [None]:
from guide_parser import GuideParser

# Registry of one GuideParser per guide, keyed by guide id.
# Some entries pass `starting_string`, `remove_patterns` or `url` overrides; their
# exact semantics are defined in `guide_parser.GuideParser`, which is not visible
# here. NOTE(review): `starting_string` presumably marks the text at which the
# guide's numbered paragraphs begin, and the whitespace inside those strings looks
# like it must match the extracted PDF text exactly - confirm before editing.
parsers = {
    "guide_art_1_eng": GuideParser(guide_id="guide_art_1_eng", remove_patterns=["Concepts of “jurisdiction” and imputability"]),
    "guide_art_2_eng": GuideParser(guide_id="guide_art_2_eng"),
    "guide_art_3_eng": GuideParser(guide_id="guide_art_3_eng", starting_string="1.  The Court’s approach to the interpretation"),
    "guide_art_4_eng": GuideParser(guide_id="guide_art_4_eng"),
    "guide_art_5_eng": GuideParser(guide_id="guide_art_5_eng"),
    "guide_art_6_civil_eng": GuideParser(guide_id="guide_art_6_civil_eng"),
    "guide_art_6_criminal_eng": GuideParser(guide_id="guide_art_6_criminal_eng"),
    "guide_art_7_eng": GuideParser(guide_id="guide_art_7_eng", starting_string="1.  The guarantee enshrined in Article 7"),
    "guide_art_8_eng": GuideParser(guide_id="guide_art_8_eng"),
    "guide_art_9_eng": GuideParser(guide_id="guide_art_9_eng"),
    "guide_art_10_eng": GuideParser(guide_id="guide_art_10_eng"),
    "guide_art_11_eng": GuideParser(guide_id="guide_art_11_eng"),
    "guide_art_12_eng": GuideParser(guide_id="guide_art_12_eng", starting_string="1.  Article 12 of the Convention guarantees"),
    "guide_art_13_eng": GuideParser(guide_id="guide_art_13_eng"),
    "guide_art_14_art_1_protocol_12_eng": GuideParser(guide_id="guide_art_14_art_1_protocol_12_eng"),
    "guide_art_15_eng": GuideParser(guide_id="guide_art_15_eng"),
    "guide_art_17_eng": GuideParser(guide_id="guide_art_17_eng"),
    "guide_art_18_eng": GuideParser(guide_id="guide_art_18_eng"),
    "Admissibility_guide_ENG": GuideParser(guide_id="Admissibility_guide_ENG", url="https://www.echr.coe.int/documents/d/echr/", starting_string="1.  The  system  of  protection  of  fundamental  rights"),
    "guide_art_46_eng": GuideParser(guide_id="guide_art_46_eng", starting_string="1.  One of the most significant features of the Convention"),
    "guide_art_1_protocol_1_eng": GuideParser(guide_id="guide_art_1_protocol_1_eng"),
    "guide_art_2_protocol_1_eng": GuideParser(guide_id="guide_art_2_protocol_1_eng"),
    "guide_art_3_protocol_1_eng": GuideParser(guide_id="guide_art_3_protocol_1_eng"),
    "guide_art_2_protocol_4_eng": GuideParser(guide_id="guide_art_2_protocol_4_eng", starting_string="1.  Article 2 of Protocol No. 4 guarantees three"),
    "guide_art_3_protocol_4_eng": GuideParser(guide_id="guide_art_3_protocol_4_eng", starting_string="1.  Article 3, Protocol No. 4 guarantees two"),
    "guide_art_4_protocol_4_eng": GuideParser(guide_id="guide_art_4_protocol_4_eng"),
    "guide_art_1_protocol_7_eng": GuideParser(guide_id="guide_art_1_protocol_7_eng", starting_string="1.  Protocol No. 7 to the European Convention"),
    "guide_art_2_protocol_7_eng": GuideParser(guide_id="guide_art_2_protocol_7_eng", starting_string="1.  Article 2  of  Protocol  No.  7  complements  the  guarantees"),
    "guide_art_4_protocol_7_eng": GuideParser(guide_id="guide_art_4_protocol_7_eng", starting_string="1.  Protocol No. 7 to the Convention"),
    "guide_data_protection_eng": GuideParser(guide_id="guide_data_protection_eng", starting_string="1.  Technological progress has led to a quantum"),
    "guide_environment_eng": GuideParser(guide_id="guide_environment_eng", starting_string="1.  The positive obligation on States to take"),
    "guide_immigration_eng": GuideParser(guide_id="guide_immigration_eng", starting_string="1.  The present  document is"),
    "guide_mass_protests_eng": GuideParser(guide_id="guide_mass_protests_eng", starting_string="1.  The  present  Guide  analyses  the  Court’s"),
    "guide_prisoners_rights_eng": GuideParser(guide_id="guide_prisoners_rights_eng", starting_string="1.  The Court is frequently called upon to"),
    "guide_lgbti_rights_eng": GuideParser(guide_id="guide_lgbti_rights_eng", starting_string="1.  The  Convention  is  a  living  instrument  which"),
    "guide_social_rights_eng": GuideParser(guide_id="guide_social_rights_eng", starting_string="1.  The Convention as adopted in 1950 reflected"),
    "guide_terrorism_eng": GuideParser(guide_id="guide_terrorism_eng", starting_string="1.   Since its first ever judgment Lawless v. Ireland")
}

In [None]:
from pathlib import Path

# Single location for the checkpoint file. It is used both for resuming and for
# saving, so that progress written by a previous run is actually picked up.
citations_csv_path = Path("data/echr_case_law_guides_with_possible_citations.csv")

paragraphs_df = pd.read_csv("data/echr_case_law_guides.csv")
# Resume from an earlier run when a checkpoint exists; otherwise start empty.
if citations_csv_path.exists():
    df = pd.read_csv(citations_csv_path)
else:
    df = pd.DataFrame(columns=["guide_id"])

for key, parser in parsers.items():
    # Skip guides whose paragraphs are already present in the checkpoint.
    if key in df["guide_id"].values:
        print(f"Already completed {key}")
        continue

    pdf_copy = paragraphs_df[paragraphs_df["guide_id"] == key]
    associated_cases_df = get_associated_cases_df(parser.url)
    combined_df = get_possible_citations_for_paragraphs_df(pdf_copy, associated_cases_df)

    df = combined_df if df.empty else pd.concat([df, combined_df], ignore_index=True)
    # Drop index columns left over from CSV round trips ("Unnamed: 0", ...).
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
    print("Completed:", key)

    # Save after every guide so an interrupted run can be resumed.
    df.to_csv(citations_csv_path, index=False)
df

In [None]:
# Reload the paragraphs-with-citations table from disk and display it.
df = pd.read_csv("data/echr_case_law_guides_with_possible_citations.csv")
df

In [None]:
# Merge the per-paragraph citation objects into one {case_id: docname} lookup.
citations = {}

for serialized_citations in df["possible_citations"]:
    citations.update(json.loads(serialized_citations))

len(citations)

In [None]:
# For every cited document that is in French, try to find the English version.
# Documents without a resolvable English version get the "NaN" placeholders.
french_to_english = []

for citation, fre_docname in citations.items():
    metadata = get_metadata_for_case_id(citation)
    if metadata["languageisocode"] != "FRE":
        continue

    try:
        eng_id, doc_name = find_eng_version(metadata)
    except Exception:
        # Best effort: a failed lookup is recorded as "no English version".
        # Catching Exception (not a bare except) keeps Ctrl-C able to stop
        # this long, network-bound loop.
        eng_id, doc_name = "NaN", "NaN"

    french_to_english.append({"fre_id": citation, "eng_id": eng_id, "fre_docname": fre_docname, "eng_docname": doc_name})

print(len(french_to_english))

In [None]:
print(len(french_to_english))

# Number of French documents for which no English version was found.
nan_values = sum(1 for entry in french_to_english if entry["eng_id"] == "NaN")
print(nan_values)

# List resolved pairs whose French name (without the "AFFAIRE " prefix) is more
# than 6 characters longer than the English name (without "CASE OF ").
for item in french_to_english:
    if item["eng_id"] == "NaN":
        continue
    english_name = item["eng_docname"].replace("CASE OF ", "")
    french_name = item["fre_docname"].replace("AFFAIRE ", "")
    if len(french_name) - len(english_name) > 6:
        print(f"French: {french_name}")
        print(f"English: {english_name}")
        print()

In [None]:
# Turn the collected French -> English records into a frame, save it and display it.
# NOTE(review): the "NaN" placeholder strings are presumably read back as missing
# values by pandas.read_csv's default NA handling, so a reloaded frame would no
# longer compare equal to the string "NaN" - confirm before reusing the CSV.
french_to_english_df = pd.DataFrame(french_to_english)
french_to_english_df.to_csv("data/french_to_english_citations.csv", index=False)
french_to_english_df

In [None]:
def get_mapping(id: str):
    """Look up the English counterpart of a French document id.

    Searches ``french_to_english_df`` for rows whose ``fre_id`` equals ``id``
    and uses the first match.

    Returns:
        ``(eng_id, eng_docname)`` of the first matching row, or ``None`` when
        there is no matching row or its ``eng_id`` is the ``"NaN"`` placeholder.
    """
    matches = french_to_english_df[french_to_english_df["fre_id"] == id]
    if len(matches) == 0:
        return None

    eng_id = matches["eng_id"].values[0]
    if eng_id == "NaN":
        return None
    return eng_id, matches["eng_docname"].values[0]

# Spot-check the lookup with a few ids; ids without an English mapping print None.
print(get_mapping("001-222891"))
print(get_mapping("001-222892"))
print(get_mapping("001-22293"))

In [None]:
# Rewrite every paragraph's citations so that French documents point at their
# English version whenever one is known; other citations are kept unchanged.
df = pd.read_csv("data/echr_case_law_guides_with_possible_citations.csv")

for row_label, serialized_citations in df["possible_citations"].items():
    mapped_citations = {}
    for source_id, source_docname in json.loads(serialized_citations).items():
        # get_mapping returns None when no English version is recorded.
        target_id, target_docname = get_mapping(source_id) or (source_id, source_docname)
        mapped_citations[target_id] = target_docname

    df.at[row_label, "possible_eng_citations"] = json.dumps(mapped_citations)

df.to_csv("data/echr_case_law_guides_with_possible_eng_citations.csv", index=False)

In [None]:
df = pd.read_csv("data/echr_case_law_guides_with_possible_eng_citations.csv")

# Print, one after the other, the original and the mapped citations of every
# row that the French -> English mapping changed.
for original_json, english_json in zip(df["possible_citations"], df["possible_eng_citations"]):
    if english_json != original_json:
        print(original_json)
        print(english_json)