# Case Law Citations Extractor for EUR-LEX
Extracts case citations for all cases in EUR-LEX. This is done at the level of detail of the individual paragraph cited. For example, if we are extracting citations for case 62011CJ0488, then the citation 62010CJ0618: N 31 38 - 43 49 57 58 will be decomposed into the individual citations: 62010CJ0618: N31, 62010CJ0618: N38, 62010CJ0618: N39, 62010CJ0618: N40, 62010CJ0618: N41, 62010CJ0618: N42, 62010CJ0618: N43, 62010CJ0618: N49, 62010CJ0618: N57 and 62010CJ0618: N58.

## Define main functions used in this notebook

### Functions: Part 1
Low-level functions for actually extracting metadata of each type for the given source case

In [76]:
# Urllib library used to query a website
from urllib.request import urlopen
# BeautifulSoup webscraping module for python
from bs4 import BeautifulSoup
# CSV parser
import csv
# Regular expressions
import re

#s = "123123STRINGabcabc"

def find_between( s, first, last ):
    """Return the text between the first `first` and the next `last` in `s`.

    The search for `last` starts right after the first occurrence of `first`.
    An empty string is returned when either marker cannot be found.
    """
    marker_pos = s.find(first)
    if marker_pos == -1:
        return ""
    start = marker_pos + len(first)
    end = s.find(last, start)
    if end == -1:
        return ""
    return s[start:end]

def find_between_r( s, first, last ):
    """Return the text between the last `first` and the last `last` in `s`.

    `last` is searched from the right, but only in the part of `s` that
    follows the right-most occurrence of `first`. An empty string is
    returned when either marker cannot be found.
    """
    marker_pos = s.rfind(first)
    if marker_pos == -1:
        return ""
    start = marker_pos + len(first)
    end = s.rfind(last, start)
    if end == -1:
        return ""
    return s[start:end]


#print find_between( s, "123", "abc" ) -> 123STRING
#print find_between_r( s, "123", "abc" ) -> STRINGabc
        
def processProcedure(piece_of_text):
    """Extract the judge and advocate entries from a Procedure section.

    The text is scanned line by line. Each line is split on ': '; the part
    before the separator is the label and the part after it is the value.

    * The first line whose label contains 'JUDGE' or 'RAPPORTEUR'
      (case-insensitive) supplies the judge.
    * The first line whose label contains 'ADVOCATE' supplies the advocate.

    Lines without a ': ' separator are ignored instead of raising
    IndexError, so headings or malformed lines no longer abort the
    extraction.

    Returns:
        A two-element list ``[judge, advocate]``; a missing entry is ''.
    """
    judge = ''
    advocate = ''

    for line in piece_of_text.split('\n'):
        # Skip blank lines and lines made of a single space
        if line in ('', ' '):
            continue

        parts = line.split(': ')
        # No "label: value" structure on this line -> nothing to extract
        if len(parts) < 2:
            continue

        label = parts[0].upper()
        if ('JUDGE' in label or 'RAPPORTEUR' in label) and judge == '':
            judge = parts[1]
        elif 'ADVOCATE' in label and advocate == '':
            advocate = parts[1]

    return [judge, advocate]
            
def processTitle(piece_of_text):
    """Split the text of a Title section into its individual fields.

    The first non-blank line is cut into '.'-separated segments and the
    last non-blank line is taken as the ECLI.

    Returns:
        A list ``[ruling_title, chamber, ruling_name, extra_1, extra_2,
        case_label, ecli]``. ``chamber`` is the text inside the last pair of
        parentheses of the ruling title. When the first line does not have
        exactly five segments, both extra fields are set to
        'Check EUR-LEX webpage'.

    Raises:
        IndexError: if there is no non-blank line, or the first line has
            fewer than two segments.
    """
    blank = ('', ' ')
    rows = [row for row in piece_of_text.split('\n') if row not in blank]
    segments = [seg for seg in rows[0].split('.') if seg not in blank]

    heading = segments[0]
    fields = [heading, find_between_r(heading, '(', ')'), segments[1]]

    if len(segments) == 5:
        # Segments 2 and 3 sit between the ruling name and the case label
        fields.extend(segments[2:4])
    else:
        fields.extend(['Check EUR-LEX webpage'] * 2)

    # Case label is the last segment of the first line; ECLI is the last line
    fields.append(segments[-1])
    fields.append(rows[-1])
    return fields
        
        
def processMisc(piece_of_text):
    """Extract the country entry from a Miscellaneous information section.

    Every line is split on ': '. If the label (the part before the
    separator) contains 'COUNTRY' (case-insensitive), the value after the
    separator is taken as the country. When several lines match, the last
    one wins.

    Lines without a ': ' separator are ignored instead of raising
    IndexError.

    Returns:
        A one-element list ``[country]``; '' when no country was found.
    """
    country = ''

    for line in piece_of_text.split('\n'):
        # Skip blank lines and lines made of a single space
        if line in ('', ' '):
            continue

        parts = line.split(': ')
        # No "label: value" structure on this line -> nothing to extract
        if len(parts) < 2:
            continue

        if 'COUNTRY' in parts[0].upper():
            country = parts[1]

    return [country]
        
        
def processDates(piece_of_text):
    """Extract the lodged date and the document date from a Dates section.

    Every line is split on ': '. A line whose label contains 'LODGED'
    (case-insensitive) supplies the lodged date. Any other "label: value"
    line supplies the document date. In both cases the last matching line
    wins.

    Lines without a ': ' separator (for example a bare heading) are ignored
    instead of raising IndexError.

    Returns:
        A two-element list ``[lodge_date, doc_date]``; a missing date is ''.
    """
    lodge_date = ''
    doc_date = ''

    for line in piece_of_text.split('\n'):
        # Skip blank lines and lines made of a single space
        if line in ('', ' '):
            continue

        parts = line.split(': ')
        # No "label: value" structure on this line -> nothing to extract
        if len(parts) < 2:
            continue

        if 'LODGED' in parts[0].upper():
            lodge_date = parts[1]
        else:
            doc_date = parts[1]

    return [lodge_date, doc_date]

### Functions: Part 2
1) Low-level function for actually extracting the citations for a given source case, 2) function for extracting other subject matters related to a case, and 3) function to write data to file

In [77]:
# Urllib library used to query a website
from urllib.request import urlopen
# BeautifulSoup webscraping module for python
from bs4 import BeautifulSoup
# CSV parser
import csv
# Regular expressions
import re

# Write data (citations, metadata or subjects) to file
def writeToFile(rows, datatype):
    """Append `rows` to '../data/orders/<datatype>/orders_<datatype>.csv'.

    `rows` may be a single flat row or a list of rows: if any element of
    `rows` is itself a list, each element is written as its own CSV row,
    otherwise `rows` is written as one row. The file is opened in append
    mode with UTF-8 encoding; fields are comma-delimited and quoted with
    '|' only when necessary.
    """
    target_path = '../data/orders/' + datatype + '/orders_' + datatype + '.csv'
    with open(target_path, 'a', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        # A nested list means several rows were passed in at once
        contains_rows = any(isinstance(entry, list) for entry in rows)
        if contains_rows:
            out.writerows(rows)
        else:
            out.writerow(rows)

### Functions: Part 3
Main calling functions for extracting the citations and metadata for a given source case 

In [78]:
def findSectionType(result):
    """Return the text of the first title div found inside `result`.

    A div counts as a title when the first entry of its 'class' attribute
    contains 'boxTitle' exactly once. Returns '' when no such div exists.
    """
    for div in result.find_all("div"):
        css_classes = div.get('class')
        if css_classes is None:
            continue
        if css_classes[0].count('boxTitle') == 1:
            return div.text
    return ''

def removeFirstAndLastHTMLTag(text):
    """Strip the outermost opening and closing tag from an HTML fragment.

    The fragment must start with '<' and end with '>'. The opening tag is
    everything up to the first '>', and the closing tag is everything from
    the last '<' onwards, so nested markup between the two is preserved
    (``'<div><b>x</b></div>'`` -> ``'<b>x</b>'``).

    The input is returned unchanged when it is empty, is not wrapped in
    angle brackets, consists of a single tag (e.g. ``'<br/>'``), or has no
    separate closing tag.
    """
    # Empty input or text that is not wrapped in a tag: nothing to strip
    if not text or text[0] != '<' or text[-1] != '>':
        return text

    open_end = text.find('>')      # end of the first (opening) tag
    close_start = text.rfind('<')  # start of the last (closing) tag

    # Single tag, or no distinct closing tag after the opening one
    if open_end == len(text) - 1 or close_start <= open_end:
        return text

    return text[open_end + 1:close_start]

# Extract Metadata for case given the BeautifulSoup format of its HTML page
def extractMetadata(soup_judgement_page, celexNumber):
    """Collect the metadata of one case and append it as a row to the metadata CSV.

    Every div of class 'box' on the page is inspected. Its section type is
    the title returned by findSectionType with all spaces removed. Only the
    sections 'Titleandreference', 'Dates', 'Procedure', 'Classifications'
    and 'Miscellaneousinformation' are handled, and within them only inner
    divs whose id is 'originalTitle' contribute data.

    The resulting row starts with `celexNumber`, followed by the extracted
    fields in page order. Commas inside a field are replaced by ';' before
    the row is handed to writeToFile(..., 'metadata').

    NOTE(review): the id check 'originalTitle' is applied to every section
    type, not only to the title section - confirm this matches the page
    layout. A previous variant of this function selected divs by the class
    'tabContent' and parsed the title with processTitle instead.
    """
    handled_sections = (
        "Titleandreference",
        "Dates",
        "Procedure",
        "Classifications",
        "Miscellaneousinformation",
    )

    # The CELEX number is always the first column of the row
    record = [celexNumber]

    for box in soup_judgement_page.find_all("div", {"class": "box"}):
        inner_divs = box.find_all("div")
        section = findSectionType(box).replace(" ", "")

        if not any(name in section for name in handled_sections):
            continue

        for div in inner_divs:
            if div.get('id') is None or div['id'] != "originalTitle":
                continue

            if "Titleandreference" in section:
                # 1. Title and reference: the second child of the div holds
                # the title markup; drop the <strong> tags and newlines, then
                # split it into its <br/>-separated pieces
                markup = str(div.contents[1])
                for fragment in ("<strong>", "</strong>", "\n"):
                    markup = markup.replace(fragment, "")
                record.extend(filter(None, markup.split("<br/>")))

            elif "Dates" in section:
                # 2. Dates: date lodged, date of document
                record.extend(processDates(div.text))

            elif "Classifications" in section:
                # 3. Classifications: subject matters are not extracted here;
                # only a blank line is printed
                print()

            elif "Miscellaneousinformation" in section:
                # 4. Miscellaneous information: country
                record.extend(processMisc(div.text))

            elif "Procedure" in section:
                # 5. Procedure: Judge-Rapporteur, Advocate General
                record.extend(processProcedure(div.text))

    # Commas would clash with the CSV delimiter, so swap them for ';'
    sanitised = [field.replace(',', ';') for field in record]

    # Write metadata row for source case to file
    writeToFile(sanitised, 'metadata')


### Functions: Part 4
Functions for processing a single case and processing all cases within a main case subject

In [79]:
from math import ceil
import json

# Process individual case given CELEX number
def processCase(celexNumber):
    """Download the EUR-LEX page of one case and extract its metadata.

    Prints 'Source: <celexNumber>', fetches
    'https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:<celexNumber>',
    parses it with the 'lxml' parser and passes the parsed page to
    extractMetadata.

    The HTTP response is closed as soon as the page has been parsed, so
    processing many cases in a row does not accumulate open connections.
    """
    print("Source: " + str(celexNumber))
    # URL prefix for a judgement on EUR-LEX
    result_url_prefix = "https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:"
    # URL for this particular judgement
    judgement_url = result_url_prefix + str(celexNumber)
    # Open the page and convert it to BeautifulSoup format; the response is
    # fully read by the parser, so it can be closed right afterwards
    with urlopen(judgement_url) as judgement_page:
        soup_judgement_page = BeautifulSoup(judgement_page, "lxml")
    # Get Metadata
    extractMetadata(soup_judgement_page, celexNumber)

# Process every case listed in the EUR-LEX judgment search results (the subject matter code filter is currently disabled)
def processCases():
    """Walk through all pages of the EUR-LEX judgment search and process each case.

    The first result page is downloaded and the total number of result pages
    is determined, either from the 'onsubmit' attribute of the form with id
    'pagingForm' or, when that is missing, from the 'search' -> 'count'
    value of an 'application/json' script tag (10 results per page). Every
    result page is then scanned for CELEX numbers and processCase is called
    for each accepted one.

    Fixes compared with the earlier version:
    * processCase is called with the CELEX number only; the former call
      passed an undefined `subjectMatterCode` as an extra argument.
    * A missing paging form or 'onsubmit' attribute no longer raises, so the
      JSON fallback is actually reachable.
    * HTTP responses are closed after parsing.
    * CELEX strings that are too short to inspect are skipped.
    """
    # URL of the judgements search results (the subject matter code filter
    # that used to be appended here is currently disabled)
    url = "https://eur-lex.europa.eu/search.html?searchEq=true&qid=1535705893791&DB_TYPE_OF_ACT=judgment&CASE_LAW_SUMMARY=false&DTS_DOM=EU_LAW&typeOfActStatus=JUDGMENT&type=advanced&lang=en&SUBDOM_INIT=EU_CASE_LAW&DTS_SUBDOM=EU_CASE_LAW"

    # Open the URL and parse the HTML using the 'lxml' parser
    with urlopen(url) as url_page:
        soup_url_page = BeautifulSoup(url_page, "lxml")

    # The search results page displays 10 results at a time.
    # Get the total number of 10-result pages.
    paging_form = soup_url_page.find('form', id='pagingForm')
    j_onsubmit = paging_form.get('onsubmit') if paging_form is not None else None
    j_onsubmit = j_onsubmit.replace(" ", "") if j_onsubmit else ""

    j_number = 1
    if j_onsubmit:
        # The page count is the second comma-separated argument of the call
        j_number = int(j_onsubmit.split(",")[1].replace(")", ""))
    else:
        for script in soup_url_page.find_all('script', type='application/json'):
            print("script: " + script.text)
            script_text = json.loads(script.text)
            if 'search' in script_text and script_text['search']:
                num = script_text['search']['count']
                print("yay!: " + str(num))
                if (num / 10) > 1:
                    j_number = ceil(num / 10)

    print()
    print("total pages: " + str(j_number))
    print()

    print()
    print("Page 1/" + str(j_number))
    print()
    # Process the cases on the first page of the results
    _processResultsPage(soup_url_page)

    # Process the other cases from Page 2 of results onwards
    for x in range(2, j_number + 1):
        print()
        print("Page " + str(x) + "/" + str(j_number))
        print()
        # Open Page x of the results and store it in BeautifulSoup format
        with urlopen(url + '&page=' + str(x)) as current_judgements_result_page:
            soup_current_judgements_page = BeautifulSoup(current_judgements_result_page, "lxml")
        _processResultsPage(soup_current_judgements_page)


def _processResultsPage(soup_results_page):
    """Call processCase for every accepted CELEX number on one result page."""
    # Each 'leftMetadata' cell holds the metadata list of one search result
    for result in soup_results_page.find_all('td', class_='leftMetadata'):
        for ul in result.find_all('ul'):
            for li in ul.find_all('li'):
                if li.text[:13] != "CELEX number:":
                    continue
                celex = li.text[14:]
                # Accept only numbers starting with '6' whose 6th and 7th
                # characters are 'CJ' or 'CO' - presumably Court of Justice
                # judgments and orders; verify against the CELEX scheme
                if (len(celex) > 6 and celex[0] == '6' and celex[5] == 'C'
                        and celex[6] in ('J', 'O')):
                    processCase(celex)
    
    
    

### Main Procedure: Get the list of EUR-LEX subject matter codes from file and process the cases for each
Note: we have a list of all the EUR-LEX subject matters in "../data/SubjectMatterCodes.tsv" 

In [80]:
# CSV parser
import csv

#processCase("62001CJ0164")

# Arrays for celex numbers
judgementsCelexNumbers = []
ordersCelexNumbers = []

# Load the first column (the CELEX number) of the judgements metadata CSV
with open('../data/judgements/judgement_metadata.csv', encoding="utf8") as tsvfile:
    reader = csv.reader(tsvfile, delimiter=',')
    judgementsCelexNumbers.extend(row[0] for row in reader)

# Load the first column (the CELEX number) of the orders metadata CSV
with open('../data/orders/order_metadata.csv', encoding="utf8") as tsvfile:
    reader = csv.reader(tsvfile, delimiter=',')
    ordersCelexNumbers.extend(row[0] for row in reader)

# Drop the first entry of each list (presumably the CSV header row)
del judgementsCelexNumbers[0]
del ordersCelexNumbers[0]

length = len(ordersCelexNumbers)
# Process every order, printing its 1-based position as progress
for index, celexNumber in enumerate(ordersCelexNumbers):
    # Always true as written; presumably the threshold is raised by hand
    # to resume an interrupted run at a later position
    if (index + 1 >= 1):
        print("{}/{}".format(index + 1, length))
        processCase(celexNumber)
    

#     print(subjectMatterCode)
#     processCases(subjectMatterCode)
#     print()


1/2694
Source: 62010CO0028
2/2694
Source: 62007CO0023
3/2694
Source: 62001CO0321
4/2694
Source: 62002CO0296
5/2694
Source: 62000CO0030
6/2694
Source: 61993CO0107(01)
7/2694
Source: 61993CO0107
8/2694
Source: 61986CO0117(01)
9/2694
Source: 61986CO0128
10/2694
Source: 61986CO0117
11/2694
Source: 61986CO0119
12/2694
Source: 61986CO0055
13/2694
Source: 61983CO0114
14/2694
Source: 61981CO0041
15/2694
Source: 62012CO0546(01)
16/2694
Source: 62016CO0176
17/2694
Source: 62016CO0446
18/2694
Source: 62016CO0280
19/2694
Source: 62012CO0546
20/2694
Source: 62016CO0134(01)
21/2694
Source: 62015CO0462
22/2694
Source: 62015CO0227
23/2694
Source: 62015CO0152
24/2694
Source: 62014CO0519
25/2694
Source: 62014CO0517
26/2694
Source: 62014CO0164
27/2694
Source: 62014CO0070
28/2694
Source: 62013CO0304
29/2694
Source: 62013CO0071
30/2694
Source: 62013CO0643
31/2694
Source: 62012CO0552
32/2694
Source: 62012CO0248
33/2694
Source: 62013CO0024
34/2694
Source: 62013CO0167
35/2694
Source: 62009CO0038
36/2694
Sourc