# Case information collection


### Process:

1. parse case information from HUDOC API

2. filter case information 

3. clean case information

4. sort case information by article

5. store in PostgreSQL database

In [1]:
from selenium import webdriver 
from time import sleep
import requests
import os
import shutil
import json
import pandas as pd
import numpy as np
import copy
import psycopg2
from sqlalchemy import create_engine
from psycopg2.extras import Json

### 1. Retrieving the case information

In [None]:
'''pulls maximum number of documents from HUDOC.'''

# use safari as a webdriver 
driver = webdriver.Safari()

# get the number of available docs in HUDOC 
url = "https://hudoc.echr.coe.int/eng#%20"
driver.implicitly_wait(10)
driver.get(url)
result = driver.find_element_by_class_name('resultNumber')

In [39]:
## text to int
max_docs = int(result.text)
max_docs

172819

In [None]:
'''pulls case information from HUDOC API.'''

# Fields to retrieve 
fields = [
    "sharepointid", 
    "Rank", 
    "itemid", 
    "docname", 
    "doctype", 
    "application", 
    "appno", 
    "conclusion", 
    "importance", 
    "originatingbody", 
    "typedescription", 
    "kpdate", 
    "kpdateAsText", 
    "documentcollectionid", 
    "documentcollectionid2", 
    "languageisocode", 
    "extractedappno", 
    "isplaceholder", 
    "doctypebranch", 
    "respondent", 
    "respondentOrderEng", 
    "ecli", 
    "article", 
    "applicability", 
    "decisiondate", 
    "externalsources", 
    "introductiondate", 
    "issue", 
    "judgementdate", 
    "kpthesaurus", 
    "meetingnumber", 
    "representedby", 
    "separateopinion", 
    "scl"  
]

base_url = "http://hudoc.echr.coe.int/app/query/results?query=((((((((((((((((((((%20contentsitename%3AECHR%20AND%20(NOT%20(doctype%3DPR%20OR%20doctype%3DHFCOMOLD%20OR%20doctype%3DHECOMOLD)))%20XRANK(cb%3D14)%20doctypebranch%3AGRANDCHAMBER)%20XRANK(cb%3D13)%20doctypebranch%3ADECGRANDCHAMBER)%20XRANK(cb%3D12)%20doctypebranch%3ACHAMBER)%20XRANK(cb%3D11)%20doctypebranch%3AADMISSIBILITY)%20XRANK(cb%3D10)%20doctypebranch%3ACOMMITTEE)%20XRANK(cb%3D9)%20doctypebranch%3AADMISSIBILITYCOM)%20XRANK(cb%3D8)%20doctypebranch%3ADECCOMMISSION)%20XRANK(cb%3D7)%20doctypebranch%3ACOMMUNICATEDCASES)%20XRANK(cb%3D6)%20doctypebranch%3ACLIN)%20XRANK(cb%3D5)%20doctypebranch%3AADVISORYOPINIONS)%20XRANK(cb%3D4)%20doctypebranch%3AREPORTS)%20XRANK(cb%3D3)%20doctypebranch%3AEXECUTION)%20XRANK(cb%3D2)%20doctypebranch%3AMERITS)%20XRANK(cb%3D1)%20doctypebranch%3ASCREENINGPANEL)%20XRANK(cb%3D4)%20importance%3A1)%20XRANK(cb%3D3)%20importance%3A2)%20XRANK(cb%3D2)%20importance%3A3)%20XRANK(cb%3D1)%20importance%3A4)%20XRANK(cb%3D2)%20languageisocode%3AENG)%20XRANK(cb%3D1)%20languageisocode%3AFRE&select={}&sort=&rankingModelId=4180000c-8692-45ca-ad63-74bc4163871b".format(','.join(fields))
length = 500 ## max items per request

# make a directory to save the request output
output_folder = './raw_cases_info'
if os.path.exists(output_folder):
    shutil.rmtree(output_folder)
os.mkdir(output_folder)

# request case information from the HUDOC api
## tracking of failed cases
number_failed = 0 
output_folder = './raw_cases_info'

for start in range(0, max_docs, length):
    with open(os.path.join(output_folder, "%d.json"%(start)), 'wb') as f:
        url = base_url + "&start=%d&length=%d"%(start, length)
        res = requests.get(url, stream = True)
        if not res.ok:
            print('failed to fetch information {} to {}'.format(start, start + length))
            number_failed += 1
            continue
        for chunk in res.iter_content(chunk_size = 1024):
            f.write(chunk)

### 2. Filtering the case information
For this project, we keep only cases

- in English
- with an attatched judgement document
- with a clear conclusion (clearly stating "violation" or "no violation" at least once) 

In [2]:
# get all case info 
cases = []
input_folder = './raw_cases_info'
## get all the directories
files = [os.path.join(input_folder, f) for f in os.listdir(input_folder) if os.path.isfile(os.path.join(input_folder, f)) if '.json' in f]
for i in files:
    with open(i, 'r') as f:
        content = f.read()
        index = json.loads(content)
        cases.extend(index['results'])
cases = [c['columns'] for c in cases]

In [3]:
# filter cases
total = len(cases)
total
## remove non-english cases
cases = [c for c in cases if c["languageisocode"] == "ENG"]
## remove cases with no judgement document
cases = [c for c in cases if c["doctype"] == "HEJUD"]
## remove cases without an attached judgement document
cases = [c for c in cases if c["application"].startswith("MS WORD")]
## remove cases without a clear conclusion
cases = [c for c in cases if "No-violation" in c["conclusion"] or "No violation" in c["conclusion"] or "Violation" in c["conclusion"] or "violation" in c["conclusion"]]
## remove a specific list of cases hard to process
cases = [c for c in cases if c['itemid'] not in ["001-154354", "001-108395", "001-79411"]]
total

171831

### 3. Cleaning the case information

- parse and format some raw information.

In [4]:
def parties(docname):
    ''' Parse the case title(docname) and return the list of parties.
    
    parameter
    ---------
    docname : str 
              string containing the parties name 
    '''
    if docname.startswith('CASE OF '):
        docname = docname[len('CASE OF '):]
    if docname[-1] == ')':
        docname = docname.split('(')[0]
    parties = docname.split(' v.')
    parties = [p.strip() for p in parties]
    
    return parties

def split_article(article):
    ''' Parse string containing the list of articles into 
        the list of articles. 
    
    parameter
    ---------
    article : str 
              string containing the list of articles 
    '''

    parts = article.split('+')
    articles = [parts[-1]]
    for k, e in enumerate(parts[:-1]):
        if not parts[k + 1].startswith(e):
            articles.append(e)
    return articles

def base_article(articles):
    ''' Find the base articles from a list of articles 
    
    parameter
    ---------
    articles : str 
               list of articles 
    '''
    base_articles = []
    for a in articles:
        a = a.split('+')[0]
        if 'p' not in a.lower():
            base_articles.append(a.split('-', 1)[0])
        else:
            base_articles.append('-'.join(a.split('-')[0:2]))

    return base_articles

def article(article):
    ''' Format the list of articles.

    parameter
    ---------
    article : str
              string containing the list of articles

    '''
    articles = article.lower().split(';')
    return list(set(base_article(
        [item for sublist in list(map(split_article, articles)) for item in sublist])))


def subarticle(article):
    ''' Format the list of subarticles.

    parameter
    ---------
    article : str
              string containing the list of articles

    '''
    articles = article.split(';')
    articles = [a for sublist in articles for a in sublist.split('+')]
    res = list(set(articles))
    return res

def merge_conclusion_elements(elements):

    ''' Merge similar conclusion elements in a single one, more descriptive
    
    parameter
    ----------
    elements : dict
               conclusion elements
    '''

    final_elements = {}
    for e in elements:
        if 'article' in e and 'base_article' in e:
            key = '{}_{}_{}'.format(e['article'], e['base_article'], e['element'])
        else:
            key = e['element']
        if key not in final_elements:
            final_elements[key] = e
        final_elements[key].update(e)
    return list(final_elements.values())

def get_element_type(l):
    t = 'other'
    if l.startswith('violation'):
        t = 'violation'
    elif l.startswith('no-violation') or l.startswith('no violation'):
        t = 'no-violation'
    return t

def format_conclusion_elements(i, e, final_ccl):
    to_append = []
    l = e['element'].lower().strip()

    # Determine type
    t = get_element_type(l)
    final_ccl[i]['type'] = t
    if t == 'other':
        to_append.append(final_ccl[i])

    # Determine articles
    articles = []
    if 'protocol' in e['element'].lower():
        prot = e['element'].lower().split('protocol no.')
        f1 = prot[0].split()[-2]
        f2 = prot[1].split()[0]
        final_ccl[i]['article'] = f'p{f2}-{f1}'
        articles = split_article(final_ccl[i]['article'])

    if 'article' not in final_ccl[i] and t != 'other':
        art = None
        find_and_replace = [
            (' and art. ', ''),
            (' and of ', '+'),
            (' and ', '+')
        ]
        for p in find_and_replace:
            if p[0] in l:
                l = l.replace(p[0], p[1])

        b = l.split()
        for j, a in enumerate(b):
            if a.startswith('art'):
                if a.lower().startswith('art.') and not a.lower().startswith('art. ') and len(a) > 4:
                    art = a.lower()[4:]
                else:
                    art = b[j + 1]
                break
        if art is not None:
            articles = split_article(art)
            art = art.split('+')
            if '+' in art[0]:
                sart = art[0].split('+')
                t = [sart[-1]]
                for k, e in enumerate(sart[:-1]):
                    if not sart[k + 1].startswith(e):
                        t.append(e)

    base_articles = base_article(articles)
    for k, art in enumerate(articles):
        item = copy.copy(final_ccl[i])
        item['article'] = art
        item['base_article'] = base_articles[k]
        to_append.append(item)

    return to_append


def conclusion(ccl):
    ''' Convert a conclusion string into a list of elements
    
    parameter
    ---------
    ccl : str
          conclusion string 
    
    '''
    final_ccl = []
    chunks = [c for c in ccl.split(')') if len(c)]
    art = []
    for c in chunks:
        if '(' not in c:
            art.extend(c.split(';'))
        else:
            art.append(c)
    art = [a for a in art if len(a) > 0]
    for c in art:
        a = c.split('(')
        b = a[1].split(';') if len(a) > 1 else None
        articles = [d.strip() for d in a[0].split(';')]
        articles = [d for d in articles if len(d) > 0]
        if not len(articles):
            if b:
                if 'mentions' in final_ccl[-1]:
                    final_ccl[-1]['mentions'].extend(b)
                else:
                    final_ccl[-1]['mentions'] = b
            continue
        article = articles[-1] if not articles[-1].startswith(';') else articles[-1][1:]
        conclusion = {'element': article }
        if b:
            conclusion['details'] = b
        if len(article.strip()) == 0:
            if b is not None:
                final_ccl[-1]['mentions'] = b
        else:
            final_ccl.append(conclusion)
    if len(articles) > 1:
        for a in articles[:-1]:
            if len(a) > 0:
                final_ccl.append({'element': a})

    to_append = []
    for i, e in enumerate(final_ccl):
        to_append.extend(format_conclusion_elements(i, e, final_ccl))

    final_ccl = merge_conclusion_elements(to_append)
    
    return final_ccl

# apply parsing and formatting
for i, c in enumerate(cases):
        cases[i]['parties'] = parties(cases[i]['docname'])
        cases[i]['conclusion_detail'] = cases[i]['conclusion']
        cases[i]['conclusion'] = conclusion(c['conclusion_detail'])
        cases[i]['articles_detail'] = cases[i]['article']
        cases[i]['article'] = article(cases[i]['articles_detail'])
        cases[i]['paragraphs'] = subarticle(cases[i]['articles_detail'])
        cases[i]['externalsources'] = cases[i]["externalsources"].split(';') if len(
            cases[i]['externalsources']) > 0 else []
        cases[i]["documentcollectionid"] = cases[i]["documentcollectionid"].split(';') if len(
            cases[i]['documentcollectionid']) > 0 else []
        cases[i]["issue"] = cases[i]["issue"].split(';') if len(cases[i]['issue']) > 0 else []
        cases[i]["representedby"] = cases[i]["representedby"].split(';') if len(
            cases[i]['representedby']) > 0 else []
        cases[i]["extractedappno"] = cases[i]["extractedappno"].split(';')

        cases[i]['externalsources'] = [e.strip() for e in cases[i]['externalsources']]
        cases[i]['documentcollectionid'] = [e.strip() for e in cases[i]['documentcollectionid']]
        cases[i]['issue'] = [e.strip() for e in cases[i]['issue']]
        cases[i]['representedby'] = [e.strip() for e in cases[i]['representedby']]
        cases[i]['extractedappno'] = [e.strip() for e in cases[i]['extractedappno']]

all_cases = cases

### 4. Sorting the case information by article

- sort case information by each unique article with at least 100 associated cases.

In [203]:
# clean cases before sorting
cleaned_cases = []
for c in cases:
    labels = []
    for i in c['conclusion']:
        if i['type'] in ['violation', 'no-violation']:
            if 'article' in i:
                j = i['article']
                labels.append('{}:{}'.format(j, 1 if i['type'] == 'violation' else 0))
    labels = list(set(labels)) 
    # double checking if wrong assignment of label
    opposed_labels = any([i for i in labels if i.split(':')[0] + ':'+ str(abs(1-int(i.split(':')[-1])))in labels])
    if len(labels) > 0 and not opposed_labels:
        cleaned_cases.append(c)

# count number of cases per article
outcomes = {}
cases_per_articles = {}
## iterate through the cleaned cases and count number of judgement outcome
for i, c in enumerate(cleaned_cases):
    ccl = c['conclusion']
    for d in ccl:
        if d['type'] in ['violation', 'no-violation']:
            if d['base_article'] not in outcomes:
                ### dictionary for counting
                outcomes[d['base_article']] = {
                    'violation' : 0,
                    'no-violation' : 0,
                    'total' : 0
                }
            outcomes[d['base_article']][d['type']] += 1
            outcomes[d['base_article']]['total'] += 1
            ### sort cases by articles  
            if d['base_article'] not in cases_per_articles:
                cases_per_articles[d['base_article']] = []
            cases_per_articles[d['base_article']].append(c)

# filter articles with less than 100
for k, v in outcomes.items():
    print('Article {}: {} {}'.format(k, v['total'], '(removed)' if v['total'] < 100 else ''))
outcomes = {k:v for k,v in outcomes.items() if v['total'] > 100}

# generate case info for specific articles
df_lst = []
for k in outcomes.keys():
    ## store it in dataframe
    df = pd.DataFrame(cases_per_articles[k])
    ### drop duplicated cases
    df.drop_duplicates(subset = ['itemid'], inplace = True)
    df_lst.append(df) 

Article 10: 669 
Article p1-1: 1363 
Article 14: 432 
Article 6: 7340 
Article 13: 1865 
Article 5: 2370 
Article 3: 2330 
Article 9: 96 (removed)
Article 8: 1375 
Article 18: 36 (removed)
Article 34: 169 
Article 2: 809 
Article p1-3: 64 (removed)
Article p12-1: 5 (removed)
Article 11: 274 
Article p7-4: 31 (removed)
Article p1-2: 27 (removed)
Article p4-2: 49 (removed)
Article 25: 7 (removed)
Article 38: 41 (removed)
Article 7: 88 (removed)
Article p7-2: 14 (removed)
Article 4: 17 (removed)
Article 12: 19 (removed)
Article p7-3: 1 (removed)
Article p4-4: 9 (removed)
Article p7-1: 5 (removed)
Article 46: 1 (removed)
Article p6-1: 4 (removed)
Article .5: 1 (removed)
Article 17: 1 (removed)


In [284]:
# export dataframe for each article as a csv
## make a folder to save
os.mkdir('./case_info')
## article list
art_lst = list(outcomes.keys())
for i in range(len(df_lst)):
    df_lst[i].to_csv('case_info/case_info_{}.csv'.format(art_lst[i]), index = False)
    df_lst[i]['conclusion'] = df_lst[i]['conclusion'].astype(str) # as postgresql cannot accept 'dict' type

In [11]:
# dataframe for all cases before spliting into specific articles 
df = pd.DataFrame(all_cases)
df.to_csv('case_info/case_info.csv', index = False)
df.conclusion = df.conclusion.astype(str) # as postgresql cannot accept 'dict' type

### 5. Storing in PostgreSQL database 

- store case information data frame for each article 
- store all articles data frame 


In [None]:
# connect to database 
engine = create_engine("postgresql://postgres:xfkLVeMj@localhost/hudoc")
con = engine.connect()

# create table for all cases 
table_name = 'case_info'
df.to_sql(table_name, con)
print(engine.table_names())

# create table for each articles
table_name_lst = ['case_info_{}'.format(a) for a in art_lst]
for i in range(len(df_lst)):
    df_lst[i].to_sql(table_name_lst[i], con)   
print(engine.table_names())

con.close()