# Judgement Text collection


### Process:

1. Download judgement documents from the HUDOC API

2. Preprocess judgement documents 

3. Sort texts by Article

In [None]:
import json
import os
import re
import shutil
import zipfile
from ast import literal_eval
from os import listdir, mkdir
from os.path import isfile, join

import pandas as pd
import requests
from docx import Document
from docx.shared import Inches
from docx.text.run import Run

### 1. Downloading documents

- get corresponding judgement documents from HUDOC API

In [None]:
# Read the case metadata table and collect the values of its `itemid` column.
case_info = pd.read_csv('./case_info/case_info.csv')
case_id_lst = list(case_info['itemid'])

In [None]:
output_folder = './raw_documents'  # folder to save raw docs
# exist_ok lets this cell be re-run without failing on an existing folder
os.makedirs(output_folder, exist_ok=True)
failed_case = []  # ids of the cases whose download failed
update = False  # set to True to re-download documents that already exist
# urls
base_url = "http://hudoc.echr.coe.int/app/conversion/docx/?library=ECHR&filename=please_give_me_the_document.docx&id="
perm_url = "http://hudoc.echr.coe.int/eng?i="

# get documents
for i, case_id in enumerate(case_id_lst, start=1):
    ## tracking the process
    print("Document {}/{}: {}".format(i, len(case_id_lst), case_id))
    item_id = case_id.strip()
    filepath = join(output_folder, "%s.docx" % item_id)
    ## download the file if no such a file exists or update == True
    if not update and isfile(filepath):
        print("Skip as document exists already")
        continue

    url = base_url + item_id
    # Stream into a side file first: an interrupted transfer must not leave a
    # truncated .docx that a later run would skip as "already downloaded".
    partial_path = filepath + '.part'
    downloaded = False
    try:
        # The timeout keeps one unresponsive request from blocking the loop.
        res = requests.get(url, stream=True, timeout=60)
        try:
            if res.ok:
                with open(partial_path, 'wb') as f:
                    for block in res.iter_content(1024):
                        f.write(block)
                downloaded = True
        finally:
            res.close()
    except requests.RequestException as err:
        # Network problems are recorded like HTTP errors instead of aborting.
        print("Request error for %s: %s" % (case_id, err))

    if not downloaded:
        if isfile(partial_path):
            os.remove(partial_path)
        print("Failed to fetch document %s" % (case_id))
        failed_case.append(case_id)
        print("URL: %s" % (url))
        print("Permalink: %s" % (perm_url + item_id))
        continue

    os.replace(partial_path, filepath)
    print("Request complete, see %s" % (filepath))

### 2. Preprocessing the documents 

- parse the MS Word document and extract the judgement text, leaving out the following parts of the court decision:
1. Conclusion - the text includes information about labels, i.e., 'violation' or 'no violation'
2. Law -  the text includes arguments and discussions of judges that partly contain the final decision

In [None]:
# Scratch directory path for temporary document files
TMP = '/tmp/echr_tmp_doc'

# Paragraph style names accepted for each kind of section
tags = {
    "SECTION_TITLE_STYLE": ['ECHR_Title_1', 'Ju_H_Head'],
    "HEADING_1_STYLE": ['ECHR_Heading_1', 'Ju_H_I_Roman'],
    "HEADING_2_STYLE": ['ECHR_Heading_2', 'Ju_H_A', 'Ju_H_a'],
    "HEADING_3_STYLE": ['ECHR_Heading_3', 'Ju_H_1.', 'Ju_H_1'],
    "HEADING_PARA": [
        'ECHR_Para', 'ECHR_Para_Quote', 'Ju_List',
        'Ju_List_a', 'Ju_Para', 'Normal', 'Ju_Quot', 'Ju_H_Article',
        'Ju_Para Char Char', 'Ju_Para Char', 'Ju_Para_Last', 'Opi_Para',
    ],
    "DECISION_BODY": ['ECHR_Decision_Body', 'Ju_Judges'],
}

# Depth assigned to each kind of section; -1 marks the decision body
levels = {
    "DECISION_BODY": -1,
    "SECTION_TITLE_STYLE": 1,
    "HEADING_1_STYLE": 2,
    "HEADING_2_STYLE": 3,
    "HEADING_3_STYLE": 4,
    "HEADING_PARA": 5,
}

# Flattened lookup: paragraph style name -> depth of its section kind
tag_to_level = {
    style_name: levels[section_kind]
    for section_kind, style_names in tags.items()
    for style_name in style_names
}

# A document whose paragraphs all use these styles is routed to the old parser
OLD_PARSER_TAGS = [
    'header', 'Normal', 'Body Text 2', 'Body Text Indent 3',
    'OldCommission', 'Heading 6', 'Heading 5', 'Heading 4',
]

# Known heading texts for each internal section name.
# tag_elements() compares a heading against these titles with a
# case-insensitive prefix test and keeps the first section (in the order
# written here) that has a matching title.
# Every title must be followed by a comma: adjacent string literals are
# silently concatenated by Python, which makes both titles unmatchable.
# NOTE(review): "DISSENTING OPINION OF JUDGE SCHEMBRI ORLAND" and
# "FOR THESE REASONS, THE COUR" are listed under 'submission' - presumably
# misplaced; confirm before moving them.
internal_section_reference = {
    'toc': ["Table of contents"],
    'abbreviations': ["ABBREVIATIONS AND ACRONYMS"],
    'introduction': ["INTRODUCTION"],
    'procedure': ["CLAIMS MADE BY THE APPLICANTS", "I.  Locus standı", "PROCEDURE", "PROCEDURE”", "AS TO PROCEDURE",
                  "PROCEDURE AND FACTS", "FACTS AND PROCEDURE", "I.   THE GOVERNMENT’S PRELIMINARY OBJECTION"],
    'facts': ["THE FACTS", "AS TO THE FACTS", "COMPLAINTS", "COMPLAINT", "FACTS",
              "THE FACT", "THE FACTSITMarkFactsComplaintsStart",
              "THE CIRCUMSTANCES OF THE CASE",
              "I.  THE CIRCUMSTANCES OF THE CASE",
              "I. THE PARTICULAR CIRCUMSTANCES OF THE CASE",
              'PROCEEDINGS', "PROCEEDINGS BEFORE THE COMMISSION",
              "II. PROCEEDINGS BEFORE THE COMMISSION",
              "PROCEEDINGS BEFORE THE COMMISSION  17."
              ],
    'law': ["THE LAW",
            "LAW",
            "IV.  COMPLIANCE WITH THE EXHAUSTION RULE",
            "THE LAWS ON THE USE OF LANGUAGES IN EDUCATION IN",
            "AS TO THE LAW",
            "TO THE LAW",
            "III. THE LAW",
            "IN LAW",
            "APPLICATION OF ARTICLE",
            "II.  APPLICATION OF ARTICLE",
            "IV.  COMPLIANCE WITH THE EXHAUSTION RULE",
            "IV.  OTHER COMPLAINTS UNDER ARTICLE",
            "I. ALLEGED LACK OF STANDING AS",
            "ITMarkFactsComplaintsEndTHE LAW",
            "ALLEGED VIOLATION OF ARTICLE",
            "AS TO THE  ALLEGED VIOLATION OF ARTICLE",
            "I.  ALLEGED VIOLATION OF ARTICLE",
            "III.  ALLEGED VIOLATION OF ARTICLE",
            "THE ALLEGED BREACHES OF ARTICLE",
            "II.   ALLEGED VIOLATION OF ARTICLE",
            "MERITS", "II.  MERITS", "III.  MERITS"
            ],
    'conclusion': ["CONCLUSION",
                   "THE COURT UNANIMOUSLY",
                   "REASONS, THE COURT, UNANIMOUSLY,",
                   "FOR THESE REASONS, THE COURT UNANIMOUSLY",
                   "FOR THESE REASONS, THE COURT ,UNANIMOUSLY,",
                   "FOR THESE REASONS, THE COURT, UNANIMOUSLY,",
                   "FOR THESE REASONS, THE COURT UNANIMOUSLY,",
                   "FOR THESE REASONS, THE COURT,UNANIMOUSLY,",
                   "FOR THESE REASONS, THE COURT, UNANIMOUSLY",
                   "FOR THESE REASONS THE COURT UNANIMOUSLY",
                   "FOR THESE REASONS, THE COURT UNANIMOUSLY:",
                   "FOR THESE REASONS, THE COUR, UNANIMOUSLY,",
                   "FOR THESE REASONS THE COURT",
                   "FOR THESE RASONS, THE COURT UNANIMOUSLY",
                   "FOR THESE REASONS, THE COURT:",
                   "FOR THE REASONS, THE COURT",
                   "THE COURT",
                   "FOR THESE REASONS, THE COURT,",
                   "FOR THESE REASONS, THE COURT"],
    'relevant_law': ["RELEVANT DOMESTIC LAW",
                     "II.  RELEVANT DOMESTIC LAW",
                     "RELEVANT DOMESTIC LEGAL FRAMEWORK",
                     "III.  RELEVANT ELEMENTS OF COMPARATIVE LAW",
                     "II. RELEVANT DOMESTIC LAW",
                     "II. RELEVANT DOMESTIC LAW AND PRACTICE",
                     "RELEVANT DOMESTIC LAW AND CASE-LAW",
                     "III.  RELEVANT INTERNATIONAL MATERIALS",
                     "RELEVANT international material",
                     "II.  RELEVANT DOMESTIC LAW AND PRACTICE",
                     "RELEVANT DOMESTIC AND INTERNATIONAL LAW",
                     "III.  RELEVANT INTERNATIONAL MATERIAL",
                     "II.  RELEVANT DOMESTIC LAW AND PRACTICE AND INTERNATIONAL MATERIALS",
                     "RELEVANT DOMESTIC LAW AND PRACTICE",
                     "RELEVANT EUROPEAN UNION LAW",
                     'relevant legal framework',
                     "RELEVANT LEGAL FRAMEWORK AND PRACTICE",
                     "III.  COMPARATIVE LAW AND PRACTICE",
                     "RELEVANT LEGAL FRAMEWORK AND INTERNATIONAL MATERIAL",
                     "RELEVANT LEGAL and factual FRAMEWORK",
                     "RELEVANT LEGAL FRAMEWORK and the council of europe material",
                     "Council of europe material",
                     "LEGAL FRAMEWORK",
                     "III.  RELEVANT INTERNATIONAL LAW",
                     "RELEVANT COUNCIL OF EUROPE DOCUMENTS",
                     "III.  RELEVANT COUNCIL OF EUROPE INSTRUMENTS",
                     "II.  RELEVANT INTERNATIONAL MATERIAL"],
    "opinion": ["STATEMENT OF DISSENT BY JUDGE KŪRIS",
                "JOINT CONCURRING OPINION OF JUDGES YUDKIVSKA, VUČINIĆ, TURKOVIĆ AND HÜSEYNOV",
                "JOINT PARTLY DISSENTING OPINION OF JUDGES RAIMONDI, SICILIANOS, KARAKAS, VUČINIĆ AND HARUTYUNYAN",
                "PARTLY DISSENTING OPINION OF JUDGE DE GAETANO, JOINED BY JUDGE VUČINIĆ",
                "PARTLY DISSENTING OPINION OF JUDGE KŪRIS",
                "PARTLY DISSENTING OPINION OF JUDGE GROZEV",
                "DISSENTING OPINION OF JUDGE KOSKELO",
                "CONCURRING OPINION OF JUDGE PINTO DE ALBUQUERQUE",
                "DISSENTING OPINION OF JUDGE BAKA",
                "PARTLY DISSENTING OPINION OF JUDGE SICILIANOS",
                "PARTLY DISSENTING OPINION OF JUDGE EICKE",
                "PARTLY DISSENTING OPINION OF JUDGE EICKE",
                "CONCURRING OPINION OF JUDGE JEBENS",
                "CONCURRING OPINION OF JUDGE GÖLCÜKLÜ",
                "ConcurRing opinion of Judge Bonello",
                "CONCURRING OPINION OF JUDGE SERGHIDES",
                "DISSENTING OPINION OF JUDGE SERGHIDES",
                "DISSENTING OPINION OF JUDGE ROZAKIS",
                "PARTLY DISSENTING OPINION OF JUDGE GÖLCÜKLÜ",
                "JOINT DISSENTING OPINION OF JUDGES GROZEV AND O’LEARY",
                "JOINT PARTLY DISSENTING OPINION OF JUDGES LOUCAIDES AND TULKENS"],
    "appendix": ['APPENDIX', "APPENDIX: LIST OF APPLICANTS", "APPENDIX 1", "ANNEX",
                 "APPENDIX 2", "ANNEX 1:", "ANNEX 2:", "Annex I", "Annex II", "Appendix to the judgment"],
    "submission": ["FINAL SUBMISSIONS TO THE COURT",
                   "THE GOVERNMENT’S FINAL SUBMISSIONS TO THE COURT",
                   "FINAL SUBMISSIONS BY THE GOVERNMENT TO THE COURT",
                   "FINAL SUBMISSIONS SUBMITTED TO THE COURT BY THE GOVERNMERNT",
                   "DISSENTING OPINION OF JUDGE SCHEMBRI ORLAND",
                   "GOVERNMENT’S FINAL SUBMISSIONS TO THE COURT",
                   "FINAL SUBMISSIONS TO THE COURT BY THE GOVERNMENT",
                   "FINAL SUBMISSIONS MADE TO THE COURT",
                   "FOR THESE REASONS, THE COUR",
                   "SUBMISSIONS OF THE PARTIES",
                   "CONCLUDING SUBMISSIONS MADE TO THE COURT",
                   "CONCLUDING SUBMISSIONS MADE TO THE COURT",
                   "THE GOVERNMENT’S SUBMISSIONS TO THE COURT",
                   "THE GOVERNMENT’S FINAL SUBMISSIONS",
                   "FINAL SUBMISSIONS PRESENTED BY THE GOVERNMENT",
                   "FINAL SUBMISSIONS PRESENTED TO THE COURT",
                   "FINAL SUBMISSIONS AND OBSERVATIONS MADE TO THE COURT",
                   "FINAL SUBMISSIONS AND OBSERVATIONS MADE TO THE COURT",
                   "FINAL SUBMISSIONS MADE TO THE COURT BY THE GOVERNMENT",
                   "FINAL SUBMISSIONS MADE BY THE GOVERNMENT TO THE COURT",
                   "SUBMISSIONS MADE BY THE GOVERNMENT TO THE COURT",
                   "CONCLUDING SUBMISSIONS BY THE GOVERNMENT",
                   "FINAL SUBMISSIONS MADE BY THE GOVERNMENT",
                   "FINAL SUBMISSIONS BY THOSE APPEARING BEFORE THE COURT"],
    'schedule': ["SCHEDULE"]
}


def tag_elements(parsed, section_references=None):
    """
        Tag the top-level elements of a parsed document.
        An element receives a 'section_name' key when its content, stripped
        and upper-cased, starts with one of the known titles of a section.
        Sections are tried in the order of the lookup table and the first
        match wins; elements without a match are left untouched.
        :param parsed: parsed document, modified in place
        :type parsed: dict
        :param section_references: mapping from section name to the list of
            titles announcing it; defaults to internal_section_reference
        :type section_references: dict
        :return: parsed document with internal section references
        :rtype: dict
    """
    if section_references is None:
        section_references = internal_section_reference
    # Upper-case every title once instead of once per element and title.
    prefixes_by_section = [
        (name, tuple(title.upper() for title in titles))
        for name, titles in section_references.items()
    ]
    for element in parsed['elements']:
        heading = element['content'].strip().upper()
        for name, prefixes in prefixes_by_section:
            # str.startswith accepts a tuple and is False for an empty one
            if heading.startswith(prefixes):
                element['section_name'] = name
                break
    return parsed


def format_title(line):
    """Strip a leading "<word>." numbering from a title
        :param line: title line, possibly starting with a numbering
        :type line: str
        :return: text after the numbering, stripped, or the line unchanged
            when it does not start with such a numbering
        :rtype: str
    """
    numbered = re.match(r'(\w+)\.(.+)', line)
    return numbered.group(2).strip() if numbered else line

def parse_body(body):
    """Extract body members
        Each non-empty line yields one member named after the text before
        its first comma. When a line also carries a second comma-separated
        part, that part (lower-cased and stripped) becomes the role of this
        member and of every preceding member that has no role yet.
        Lines without any name, e.g. a lone comma, are ignored.
        :param body: line to extract the body members from
        :type body: str
        :return: list of members with their role
        :rtype: [dict]
    """
    members = []
    first_without_role = 0  # index of the first member still waiting for a role
    lines = body.replace('\nand ', '\n').replace('\t', '').split('\n')
    for line in lines:
        parts = [part for part in line.split(',') if part]
        if not parts:
            # empty line or commas only: there is no name to record
            continue
        members.append({'name': parts[0]})
        if len(parts) > 1:
            role = parts[1].lower().strip()
            for member in members[first_without_role:]:
                member['role'] = role
            first_without_role = len(members)
    return members


class Node:
    """A node of a rooted tree.

    Holds a reference to its parent node, its level, its content and the
    ordered list of its child nodes (``elements``).
    """

    def __init__(self, parent=None, level=0, content=None):
        self.elements = []  # children, filled in by the code building the tree
        self.content = content
        self.level = level
        self.parent = parent


def parse_document(doc):
    """Parse a document object into a nested dictionary
        Paragraphs are arranged in a tree according to the level that
        tag_to_level gives to their style name. Paragraphs of level -1 are
        gathered into the decision body, and paragraphs whose style is
        unknown (level 0) are ignored.
        :param doc: document object
        :type doc: Document
        :return: dictionary with the keys 'elements' (nested dictionaries
            holding 'content' and 'elements', tagged by tag_elements) and
            'decision_body' (result of parse_body, or an empty list)
        :rtype: dict
    """
    root = Node()  # level-0 node under which the sections are attached
    cursor = root  # most recently attached node
    body_parts = []
    for paragraph in doc.paragraphs:
        text = paragraph.text
        if not text.strip():
            continue
        level = tag_to_level.get(paragraph.style.name, 0)
        if level == -1:
            # NOTE(review): the parts are joined without a separator while
            # parse_body splits on newlines - confirm that the decision
            # body never spans several paragraphs.
            body_parts.append(text)
            continue
        if level <= 0:
            continue
        if cursor is root and not root.elements and level > 1:
            # Nothing is attached before the first section title (level 1).
            continue
        # A child always has a strictly greater level than its parent, so
        # climbing while the level is not smaller covers both a sibling and
        # a shallower heading. The root has level 0 and stops the climb.
        while cursor.level >= level:
            cursor = cursor.parent
        node = Node(parent=cursor, level=level, content=text)
        cursor.elements.append(node)
        cursor = node

    def tree_to_json(node):
        """Recursively convert a tree into nested dictionaries
            :param node: root of the (sub)tree
            :type node: Node
            :return: dictionary with the keys 'content' and 'elements'
            :rtype: dict
        """
        return {
            'content': node.content,
            'elements': [tree_to_json(child) for child in node.elements]
        }

    decision_body = ''.join(body_parts)
    parsed = {
        'elements': tree_to_json(root)['elements'],
        'decision_body': parse_body(decision_body) if decision_body else []
    }
    return tag_elements(parsed)

# Names of the available parsers, keyed by document generation
PARSER = dict(old='OLD', new='NEW')

def format_paragraph(p):
    """Format paragraph
        Remove a leading "<word>." numbering (e.g. "12.") from a paragraph.
        The remaining text is kept in full, including any line break.
        :param p: paragraph text
        :type p: str
        :return: text after the numbering, stripped, or the paragraph
            unchanged when it does not start with such a numbering
        :rtype: str
    """
    # DOTALL: without it '.' stops at the first newline and everything after
    # a line break inside the paragraph is silently dropped.
    match = re.match(r'(?:\w+\.)(.+)', p, flags=re.DOTALL)
    if match is None:
        return p
    return match.group(1).strip()
    
def json_to_text_(doc, text_only=True, except_section=None):
    """Collect the text of the leaf elements of a parsed (sub)document
        An element without children contributes its formatted content;
        an element with children contributes only the text of its
        descendants. Children tagged with a section name listed in
        except_section are skipped together with their descendants.
        :param doc: parsed (sub)document with the keys 'content' and 'elements'
        :type doc: dict
        :param text_only: passed down unchanged to the recursive calls;
            it has no other effect
        :type text_only: bool
        :param except_section: section names to discard
        :type except_section: list
        :return: formatted paragraphs in document order
        :rtype: [str]
    """
    # None replaces the former mutable default argument.
    excluded = () if except_section is None else except_section
    if not doc['elements']:
        return [format_paragraph(doc['content'])]
    res = []
    for e in doc['elements']:
        if 'section_name' in e and e['section_name'] in excluded:
            continue
        res.extend(json_to_text_(e, text_only=text_only, except_section=excluded))
    return res

def json_to_text(doc, text_only=True, except_section=None):
    """Format json to text
        :param doc: parsed document
        :type doc: dict
        :param text_only: return only text
        :type text_only: bool
        :param except_section: list of section to discard; None (the
            default) discards nothing
        :type: except_section: list
        :return: textual representation of the document, one paragraph
            per line
        :rtype: str
    """
    # None replaces the former mutable default argument.
    except_section = [] if except_section is None else except_section
    return '\n'.join(json_to_text_(doc, text_only, except_section))

def select_parser(doc):
    """Choose the parser matching the styles used in a document
        The old parser is chosen when every paragraph uses a style listed
        in OLD_PARSER_TAGS, which is also the case for a document without
        paragraphs; the new parser is chosen otherwise.
        :param doc: document
        :type doc: Document
        :return: parser name
        :rtype: str
    """
    only_old_styles = all(p.style.name in OLD_PARSER_TAGS for p in doc.paragraphs)
    return PARSER['old'] if only_old_styles else PARSER['new']
    
def update_docx(docname):
    """Update a docx such that it can be read by docx library.
        MSWord documents are a zip folder containing several XML files.
        As docx library cannot read 'smartTag', it is required to remove them.
        To do so, every member of the archive is copied to a new archive and
        the opening and closing 'w:smartTag' tags are stripped from the main
        XML file on the way. The content wrapped by the tags is kept, and
        so are other elements such as 'w:smartTagPr'.
        :param docname: path to the document
        :type docname: str
        :return: path to the new document
        :rtype: str
    """
    output_file = './_proxy.docx'
    # The lookahead keeps elements like <w:smartTagPr> out of the match.
    # Working on bytes avoids depending on the locale's default encoding.
    smart_tag = re.compile(rb'</?w:smartTag(?=[\s/>])[^>]*>')

    # Read everything before writing so that the source is closed first,
    # even if docname happens to be the output path.
    with zipfile.ZipFile(docname, 'r') as source:
        members = [(info.filename, source.read(info.filename))
                   for info in source.infolist()]

    # Mode 'w' replaces a proxy document left over from a previous call.
    with zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED) as target:
        for name, data in members:
            if name == 'word/document.xml':
                data = smart_tag.sub(b'', data)
            target.writestr(name, data)

    return output_file


In [None]:
input_folder = './raw_documents'
output_folder = './preprocessed_documents'
# Create the folder only after its name is set, and tolerate re-runs.
os.makedirs(output_folder, exist_ok=True)
case_id_lst = list(case_info.itemid)

# stat of parser type used
stats = {
    'parser_type': {
        'OLD': 0,
        'NEW': 0
    }
}

update = False  # set to True to re-process documents that were already parsed
correctly_parsed = 0
failed = []  # (document id, exception) for every document that failed
# endswith rather than "in": names that merely contain '.docx' are skipped
files = [join(input_folder, f) for f in listdir(input_folder)
         if isfile(join(input_folder, f)) and f.endswith('.docx')]
# process documents
for i, f in enumerate(files, start=1):
    id_doc = os.path.splitext(os.path.basename(f))[0]
    print('Process document {} {}/{}'.format(id_doc, i, len(files)))
    filename_parsed = os.path.join(output_folder, '{}_Judgement_text.txt'.format(id_doc))
    if not update and os.path.isfile(filename_parsed):
        print('Skip document because it is already processed')
        correctly_parsed += 1
        continue
    # One failing document must not stop the batch: record it and go on.
    try:
        doc = Document(update_docx(f))
        parser = select_parser(doc)
        stats['parser_type'][parser] += 1
        if parser != 'NEW':
            raise Exception("OLD parser is not available yet.")
        parsed = parse_document(doc)
        # Build the text before opening the output file, so that a failure
        # cannot leave an empty file that a later run counts as processed.
        text = json_to_text(parsed, True, ['law', 'conclusion'])
        with open(filename_parsed, 'wb') as toutfile:
            toutfile.write(text.encode('utf-8'))
        correctly_parsed += 1
    except Exception as e:
        failed.append((id_doc, e))
        print("{} {}".format(f, e))

# Summary, printed once after all documents have been handled
if files:
    print('Correctly parsed: {}/{} ({}%)'.format(correctly_parsed, len(files), (100. * correctly_parsed) / len(files)))
print('List of failed documents:')
for e in failed:
    print('{}: {}'.format(e[0], e[1]))

 ### 3. Sorting the texts by Article 
 
 - allocate parsed texts to corresponding outcome and Article folders.

In [None]:
def remove_dup(duplicate):
    """Return the items of `duplicate` without repetitions.

    The first occurrence of each item is kept and the original order is
    preserved. Items are compared with ``==``, so they need not be hashable.
    """
    unique = []
    for candidate in duplicate:
        if candidate in unique:
            continue
        unique.append(candidate)
    return unique


def sort_docs(doc_folder, violation_folder, violation):
    """Copy the judgement texts of the given cases into a folder.

    For every id in `violation`, the file '<id>_Judgement_text.txt' is copied
    from `doc_folder` to `violation_folder` when it exists in `doc_folder`;
    ids without such a file are skipped.

    :param doc_folder: folder holding the preprocessed judgement texts
    :param violation_folder: existing destination folder
    :param violation: iterable of case ids to copy
    :raises FileNotFoundError: if `violation_folder` does not exist
    """
    available = set(listdir(doc_folder))  # O(1) membership tests
    for item_id in violation:
        item_path = '{}_Judgement_text.txt'.format(item_id)
        if item_path not in available:
            continue
        # Copy to an explicit file path: with the bare folder as destination
        # a missing folder would silently become a file of that name.
        output_path = join(violation_folder, item_path)
        shutil.copy(join(doc_folder, item_path), output_path)

In [None]:
input_folder = './case_info'
doc_folder = './preprocessed_documents'
output_folder = './docs_per_article'

# Presumably the per-article files are named case_info_<article>.csv; the
# article is whatever follows this prefix in the file name - TODO confirm.
ARTICLE_FILE_PREFIX = 'case_info_'

# exist_ok lets this cell be re-run; files from a previous run are kept
os.makedirs(output_folder, exist_ok=True)

# get article list from file names stored in case_info folder
files = [join(input_folder, f) for f in listdir(input_folder)
         if isfile(join(input_folder, f)) and f.endswith('.csv')]
article_lst = [f for f in files if f.find('info_') > 0]

for a in article_lst:
    # Take the article from the file name, not from fixed path offsets,
    # so that it no longer depends on the length of input_folder.
    file_stem = os.path.splitext(os.path.basename(a))[0]
    article = file_stem[len(ARTICLE_FILE_PREFIX):]
    output_path = join(output_folder, 'article_{}'.format(article))
    os.makedirs(output_path, exist_ok=True)

    ### iterate through rows in conclusion column and find the case outcome
    violation = []
    no_violation = []
    df = pd.read_csv(a)  ### read df for specific article
    # the conclusion column holds the repr of a list; convert it back
    conclusions = df.conclusion.apply(literal_eval)
    for item_id, conclusion in zip(df.itemid, conclusions):
        for c in conclusion:
            # assumes each entry is a dict - TODO confirm.
            # Entries without 'base_article' are skipped; they no longer
            # inherit the article of the previously visited entry.
            if c.get('base_article') != article:
                continue
            outcome = c.get('type')
            if outcome == 'violation':
                violation.append(item_id)
            elif outcome == 'no-violation':
                no_violation.append(item_id)
    ### remove duplicated cases once, keeping the first occurrence
    violation = remove_dup(violation)
    no_violation = remove_dup(no_violation)

    ### make a df and save in each article folder
    concat_id_lst = violation + no_violation  ### case id list
    concat_outcome_lst = ['violation'] * len(violation) + ['no-violation'] * len(no_violation)  ### outcome list
    df_outcome = pd.DataFrame(list(zip(concat_id_lst, concat_outcome_lst)), columns=['Itemid', 'Judgement'])
    df_outcome.to_csv(join(output_path, 'case_outcome.csv'), index=False)

    # sort judgement documents
    violation_folder = join(output_path, 'violation')
    no_violation_folder = join(output_path, 'no-violation')
    os.makedirs(violation_folder, exist_ok=True)
    os.makedirs(no_violation_folder, exist_ok=True)
    sort_docs(doc_folder, violation_folder, violation)
    sort_docs(doc_folder, no_violation_folder, no_violation)

### References 

[1] A. Quemy and R. Wrembel, "On Integrating and Classifying Legal Text Documents", International Conference on Database and Expert Systems Applications (DEXA), 2020.