In [None]:
import os
# Raw string: "\D" and "\T" are not valid escape sequences, so the plain literal only worked by accident
# (and triggers an invalid-escape warning on recent Python versions). The resulting path is unchanged.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
os.chdir(r"D:\DSS D-schijf\Thesis")

# Functions

## Read csv

In [None]:
#ADAPTED FROM THE rechtspraak_extractor package of Maastricht University 

import requests, glob, time
from pathlib import Path
import pandas as pd


# Check whether the API is working or not and return with the response code
def check_api(url):
    """Send a GET request to `url` and return the HTTP status code of the response."""
    status = requests.get(f"{url}").status_code
    return status


# Reads all the CSV files in a folder and returns the list of files
# It also has an optional parameter "exclude". By default, it's None. If you want to exclude files having a certain
# word in the file name, you may give a value
# It also only grabs data if it has rechtspraak in it
# As that was causing issues with other csv data present
def read_csv(dir_name, exclude=None):
    """Return the paths of the Rechtspraak CSV files directly inside `dir_name`.

    Only files whose *file name* contains "rechtspraak" are returned. When `exclude` is given,
    files whose file name contains that word are skipped as well. The checks are done on the
    file name rather than on the full path, so the name of the directory cannot accidentally
    include or exclude every file.

    :param dir_name: directory to search (not recursive)
    :param exclude: optional substring; file names containing it are left out
    :return: list of matching paths, as produced by glob
    """
    files = []
    for csv_path in glob.glob(dir_name + "/*.csv"):
        file_name = Path(csv_path).name
        if "rechtspraak" not in file_name:
            continue
        if exclude is not None and exclude in file_name:
            continue
        files.append(csv_path)

    print("Found " + str(len(files)) + " CSV file(s)\n")
    return files


# Get total execution time
def get_exe_time(start_time):
    """Print the time elapsed since `start_time` (a time.time() value) as hours:minutes:seconds."""
    elapsed = time.time() - start_time
    total_minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(total_minutes, 60)
    print("Total execution time: {0}:{1}:{2}".format(int(hours), int(minutes), round(seconds, 2)))
    print("\n")

## Get rechtspraak  

In [None]:
#ADAPTED FROM THE rechtspraak_extractor package of Maastricht University

# This file is used to get all the Rechtspraak ECLIs from an API.
# It takes two required arguments and one optional argument
# 1. max - Maximum number of ECLIs to retrieve
# 2. starting-date (yyyy-mm-dd) - Start date of ECLI publication
# 3. ending-date (yyyy-mm-dd) - It's an optional parameter. If not given, current date will be automatically chosen
# File is stored in data/rechtspraak folder

import json
import xmltodict
import os
from datetime import date, datetime


# Base URL of the Rechtspraak search endpoint; get_rechtspraak appends the query parameters to it
RECHTSPRAAK_API_BASE_URL = "https://data.rechtspraak.nl/uitspraken/zoeken?"


def get_data_from_url(url):
    """Download the XML feed at `url` and return its entries as a list of plain dicts.

    The result is always a list: an empty list when the feed contains no `entry` element, and a
    one-element list when it contains exactly one (the parser yields a bare dict in that case,
    which callers would otherwise iterate key by key).

    :param url: fully built search URL
    :return: list of entry dicts (possibly empty)
    :raises KeyError: when the parsed document has no top-level `feed` element
    """
    res = requests.get(url)
    res.raw.decode_content = True

    # Round-trip through JSON so the parsed XML consists of plain dicts, lists and strings
    parsed = json.loads(json.dumps(xmltodict.parse(res.text)))

    feed = parsed['feed'] or {}
    entries = feed.get('entry')
    if entries is None:
        return []
    if isinstance(entries, dict):
        return [entries]
    return entries


def save_csv(json_object, file_name, save_file):
    """Turn the feed entries in `json_object` into a dataframe and optionally write it to data/<file_name>.csv.

    :param json_object: iterable of entry dicts with 'id', 'title', 'summary', 'updated' and 'link' keys
    :param file_name: name of the CSV file, without directory or extension
    :param save_file: 'y' to write the CSV file; any other value only builds the dataframe
    :return: dataframe with the columns id, title, summary, updated and link
    """
    column_names = ['id', 'title', 'summary', 'updated', 'link']
    collected = {column: [] for column in column_names}

    # Gather the values column by column; entries without summary text get a placeholder
    for entry in json_object:
        collected['id'].append(entry['id'])
        collected['title'].append(entry['title']['#text'])
        summary_node = entry['summary']
        collected['summary'].append(summary_node['#text'] if '#text' in summary_node else "No summary available")
        collected['updated'].append(entry['updated'])
        collected['link'].append(entry['link']['@href'])

    df = pd.DataFrame(columns=column_names)
    for column in column_names:
        df[column] = collected[column]

    if save_file == 'y':
        # Make sure the output directory exists before writing
        Path('data').mkdir(parents=True, exist_ok=True)
        df.to_csv('data/' + file_name + '.csv', index=False, encoding='utf8')
        print("Data saved to CSV file successfully.")
    return df

def get_rechtspraak(max_ecli=100, sd='1900-01-01', ed=None, save_file='y', from_value=0, instantie='', rechtsgebied=''):
    """Query the Rechtspraak search endpoint and return or save the entries found.

    :param max_ecli: maximum number of entries to request
    :param sd: start date (yyyy-mm-dd)
    :param ed: end date (yyyy-mm-dd); today's date when omitted
    :param save_file: 'n' returns the entries; any other value hands them to save_csv and returns None
    :param from_value: value of the `from` query parameter (offset of the first result)
    :param instantie: value of the `creator` query parameter
    :param rechtsgebied: optional value of the `subject` query parameter; previously this argument
        was accepted but never sent. It is URL-encoded because such identifiers typically contain '#'.
    :return: the list of entries when save_file == 'n'; otherwise None (also when the endpoint does
        not answer with status 200 or no entries are found)
    """
    # Local import so the function does not depend on an import in another cell
    from urllib.parse import quote

    print("Rechtspraak dump downloader API")

    amount = max_ecli
    starting_date = sd

    # If the end date is not entered, the current date is taken
    ending_date = ed if ed else date.today().strftime("%Y-%m-%d")

    # Used to calculate total execution time
    start_time = time.time()

    # Build the URL after getting all the arguments
    url = (RECHTSPRAAK_API_BASE_URL + 'max=' + str(amount) + '&date=' + starting_date + '&date=' + ending_date
           + '&from=' + str(from_value) + '&return=DOC' + '&creator=' + instantie)
    if rechtsgebied:
        url += '&subject=' + quote(rechtsgebied, safe='')

    print("Checking the API")
    response_code = check_api(url)
    if response_code != 200:
        print(f"URL returned with a {response_code} error code")
        return None

    print("API is working fine!")
    print("Getting " + str(amount) + " documents from " + starting_date + " till " + ending_date)

    json_object = get_data_from_url(url)
    print(f"Found {len(json_object)} cases!")
    if not json_object:
        return None

    # The file name carries the date range plus the current time
    current_time = datetime.now().strftime("%H-%M-%S")
    file_name = 'rechtspraak_' + starting_date + '_' + ending_date + '_' + current_time

    get_exe_time(start_time)

    if save_file == 'n':
        return json_object
    save_csv(json_object, file_name, save_file)
    return None

## Get metadata


In [None]:
#ADAPTED FROM THE rechtspraak_extractor package of Maastricht University

# This file is used for getting the metadata of the ECLIs obtained using the rechtspraak_api file. It takes all the
# CSV files created by rechtspraak_api, picks up the ECLI and link columns, and using an API gets the metadata and saves it
# in another CSV file with metadata suffix.
# This happens in async manner.
import pathlib
import os
import urllib
import multiprocessing
from bs4 import BeautifulSoup
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
import platform
import shutil
from tqdm import tqdm
from functools import partial
# Base URL of the content endpoint; get_data_from_api appends the ECLI and `return_type` to it.
# Old one = "https://uitspraken.rechtspraak.nl/#!/details?id="
RECHTSPRAAK_METADATA_API_BASE_URL = "http://data.rechtspraak.nl/uitspraken/content?id=" # old one = "https://uitspraken.rechtspraak.nl/#!/details?id="
# Query-string suffix appended after the ECLI
return_type = "&return=DOC"

# Module-level result lists, one per output column. get_data_from_api and save_data_when_crashed
# append one value to every list per ECLI; get_rechtspraak_metadata turns them into dataframe
# columns and then rebinds them to empty lists.
ecli_df = []
full_text_df = []
creator_df = []
date_decision_df = []
issued_df = []
zaaknummer_df = []
type_df = []
relations_df = []
references_df = []
subject_df = []
procedure_df = []
inhoudsindicatie_df = []
hasVersion_df = []

# Futures submitted to the thread pool by get_rechtspraak_metadata
threads = []
# Thread-pool size; set by get_cores()
max_workers = 0


def get_cores():
    """Set the module-level `max_workers` to the number of logical CPU cores minus one (at least 1).

    One core is left free for the main process. The lower bound of 1 matters on single-core
    machines, where "cores minus one" would be 0 and a thread pool cannot be created with
    zero workers.
    """
    global max_workers
    n_cores = multiprocessing.cpu_count()
    max_workers = max(1, n_cores - 1)

    print("Maximum " + str(max_workers) + " threads supported by your machine.")


def extract_data_from_xml(url):
    """Open `url` and return the complete response body as bytes."""
    response = urllib.request.urlopen(url)
    with response:
        payload = response.read()
    return payload



def check_if_df_empty(df):
    """Return True when the dataframe `df` is empty, otherwise False."""
    return True if df.empty else False


def get_text_if_exists(el):
    """Return `el.text`, or an empty string when `el` has no usable text (for example when it is None).

    Only ordinary exceptions are swallowed; KeyboardInterrupt and SystemExit propagate so a
    running download can still be interrupted.
    """
    try:
        return el.text
    except Exception:
        return ''

def update_bar(bar, *args):
    """Advance the progress bar `bar` by one step; any extra positional arguments are ignored."""
    step = 1
    bar.update(step)


import threading

# Guards the module-level result lists. get_data_from_api runs in several worker threads at once;
# without the lock the thirteen appends that make up one row can interleave with another thread's
# appends, leaving the lists misaligned (values of one ECLI next to the identifier of another).
_RESULT_LISTS_LOCK = threading.Lock()


def _store_metadata_row(ecli, full_text="", creator="", date_decision="", issued="", zaaknummer="", rs_type="",
                        relations="", references="", subject="", procedure="", inhoudsindicatie="", hasVersion=""):
    """Append one complete row to the module-level result lists while holding the lock."""
    with _RESULT_LISTS_LOCK:
        ecli_df.append(ecli)
        full_text_df.append(full_text)
        creator_df.append(creator)
        date_decision_df.append(date_decision)
        issued_df.append(issued)
        zaaknummer_df.append(zaaknummer)
        type_df.append(rs_type)
        relations_df.append(relations)
        references_df.append(references)
        subject_df.append(subject)
        procedure_df.append(procedure)
        inhoudsindicatie_df.append(inhoudsindicatie)
        hasVersion_df.append(hasVersion)


def save_data_when_crashed(ecli):
    """Record `ecli` with empty strings for every metadata field (used when fetching or parsing failed)."""
    _store_metadata_row(ecli)


def _join_element_texts(elements):
    """Concatenate the non-empty texts of `elements`, each followed by a newline."""
    return ''.join(text + "\n" for text in map(get_text_if_exists, elements) if text != '')


def get_data_from_api(ecli_id):
    """Fetch the document for `ecli_id`, extract its metadata and append it to the result lists.

    Exactly one row is appended per call: the extracted values on success, or a row of empty
    strings when the request fails, the status code is not 200, or parsing raises.

    NOTE(review): the URL is requested twice, once by check_api and once by extract_data_from_xml.

    :param ecli_id: ECLI identifier appended to RECHTSPRAAK_METADATA_API_BASE_URL
    """
    url = RECHTSPRAAK_METADATA_API_BASE_URL + ecli_id + return_type
    try:
        if check_api(url) != 200:
            save_data_when_crashed(ecli_id)
            return

        # Extract data from xml
        soup = BeautifulSoup(extract_data_from_xml(url), features='xml')

        # Collect every field first so a failure half-way cannot leave a partial row behind
        row = dict(
            creator=get_text_if_exists(soup.find("dcterms:creator")),
            date_decision=get_text_if_exists(soup.find("dcterms:date")),
            issued=get_text_if_exists(soup.find("dcterms:issued")),
            zaaknummer=get_text_if_exists(soup.find("psi:zaaknummer")),
            rs_type=get_text_if_exists(soup.find("dcterms:type")),
            subject=get_text_if_exists(soup.find("dcterms:subject")),
            relations=_join_element_texts(soup.find_all("dcterms:relation")),
            references=_join_element_texts(soup.find_all("dcterms:references")),
            procedure=get_text_if_exists(soup.find("psi:procedure")),
            inhoudsindicatie=get_text_if_exists(soup.find("inhoudsindicatie")),
            hasVersion=get_text_if_exists(soup.find("dcterms:hasVersion")),
            full_text=get_text_if_exists(soup.find("uitspraak")),
        )
    except Exception:
        save_data_when_crashed(ecli_id)
        return

    _store_metadata_row(ecli_id, **row)
    print(ecli_id)

    # Kept from the original implementation; presumably a precaution against leftover urllib state
    urllib.request.urlcleanup()

# Output columns, in the same order as the module-level result lists they are filled from
_METADATA_COLUMNS = ['ecli', 'full_text', 'creator', 'date_decision', 'issued', 'zaaknummer', 'type',
                     'relations', 'references', 'subject', 'procedure', 'inhoudsindicatie', 'hasVersion']

_METADATA_NOT_FOUND_MESSAGE = ("Metadata not found. Please check the API response; either API is under maintenance, "
                               "experiencing problems, or has changed. Please try again after some time or contact the "
                               "administrator.\n")


def _reset_metadata_buffers():
    """Rebind every module-level result list to a fresh empty list."""
    global ecli_df, full_text_df, creator_df, date_decision_df, issued_df, zaaknummer_df, \
        type_df, relations_df, references_df, subject_df, \
        procedure_df, inhoudsindicatie_df, hasVersion_df
    ecli_df = []
    full_text_df = []
    creator_df = []
    date_decision_df = []
    issued_df = []
    zaaknummer_df = []
    type_df = []
    relations_df = []
    references_df = []
    subject_df = []
    procedure_df = []
    inhoudsindicatie_df = []
    hasVersion_df = []


def _metadata_stem(filename):
    """Return the part of `filename` after the last '/' with its last four characters (".csv") removed."""
    base_name = filename.split('/')[-1]
    return base_name[:len(base_name) - 4]


def _download_metadata(ecli_list, source, show_progress):
    """Fetch the metadata of every ECLI in `ecli_list` on a thread pool and return it as a dataframe.

    The 'summary' column of `source` is joined on the ECLI when `source` has one.
    """
    # Start from empty buffers so leftovers of an interrupted earlier run cannot end up in this result
    _reset_metadata_buffers()

    time.sleep(1)  # short pause kept from the original implementation
    bar = None
    if show_progress:
        bar = tqdm(total=len(ecli_list), colour="GREEN", position=0, leave=True,
                   miniters=int(len(ecli_list) / 100), maxinterval=10000)
    # max(1, ...) because a thread pool cannot be created with zero workers
    with ThreadPoolExecutor(max_workers=max(1, max_workers)) as executor:
        for ecli in ecli_list:
            future = executor.submit(get_data_from_api, ecli)
            if bar is not None:
                future.add_done_callback(partial(update_bar, bar))
    if bar is not None:
        bar.close()

    rsm_df = pd.DataFrame(columns=_METADATA_COLUMNS)
    buffers = [ecli_df, full_text_df, creator_df, date_decision_df, issued_df, zaaknummer_df, type_df,
               relations_df, references_df, subject_df, procedure_df, inhoudsindicatie_df, hasVersion_df]
    for column, values in zip(_METADATA_COLUMNS, buffers):
        rsm_df[column] = values
    _reset_metadata_buffers()

    # Only 'id' and 'link' are validated up front, so do not throw the downloads away
    # when the source happens to have no 'summary' column
    if 'summary' in source:
        addition = source[['id', 'summary']]
        rsm_df = rsm_df.merge(addition, how='left', left_on='ecli', right_on='id').drop(['id'], axis=1)
    return rsm_df


def _save_metadata_csv(rsm_df, stem, success_message):
    """Write `rsm_df` to data/<stem>_metadata.csv, or report that no metadata was found when it is empty."""
    # Create directory if not exists
    Path('data').mkdir(parents=True, exist_ok=True)
    if check_if_df_empty(rsm_df):
        print(_METADATA_NOT_FOUND_MESSAGE)
        return
    print("Creating CSV file...")
    rsm_df.to_csv("data/" + stem + "_metadata.csv", index=False, encoding='utf8')
    print(success_message)


def _process_all_metadata_files():
    """Create a metadata CSV for every Rechtspraak CSV in the data folder that does not have one yet."""
    print("No dataframe or file name is provided. Getting the metadata of all the files present in the "
          "data folder")
    print("Reading all CSV files in the data folder...")
    csv_files = read_csv('data', "metadata")
    if len(csv_files) == 0:
        print("No CSV files found in the data folder.")
        return False

    for f in csv_files:
        # Path handles both '/' and '\\' separators, whichever glob returned
        temp_file_name = Path(f).stem

        # Check if file already exists
        if Path("data/" + temp_file_name + "_metadata.csv").is_file():
            print("Metadata for " + temp_file_name + ".csv already exists.")
            continue

        df = pd.read_csv(f)
        print("Getting metadata of " + str(df.shape[0]) + " ECLIs from " + temp_file_name + ".csv")
        print("Working. Please wait...")

        rsm_df = _download_metadata(list(df.loc[:, 'id']), df, show_progress=False)
        _save_metadata_csv(rsm_df, temp_file_name,
                           "CSV file " + temp_file_name + "_metadata.csv  successfully created.\n")
    return True


def get_rechtspraak_metadata(save_file='n', dataframe=None, filename=None):
    """Download the metadata of the ECLIs in a dataframe, in one CSV file, or in every CSV file in data/.

    :param save_file: 'y' writes data/<name>_metadata.csv; 'n' returns the metadata dataframe
    :param dataframe: dataframe with at least the columns 'id' and 'link' (mutually exclusive with `filename`)
    :param filename: name of a CSV file inside the data folder (mutually exclusive with `dataframe`)
    :return: the metadata dataframe when save_file == 'n'; True when files were processed;
        False when the arguments or inputs are unusable or the metadata file already exists
    """
    if dataframe is not None and filename is not None:
        print(f"Please provide either a dataframe or a filename, but not both")
        return False

    # Without a dataframe or file name the only supported mode is processing the whole data folder
    if dataframe is None and filename is None and save_file != 'y':
        print(f"Please provide at least a dataframe of filename when the save_file is \"n\"")
        return False

    print("Rechtspraak metadata API")

    start_time = time.time()  # Get start time

    rs_data = None

    # Check if dataframe is provided and is correct
    if dataframe is not None:
        if 'id' in dataframe and 'link' in dataframe:
            rs_data = dataframe
        else:
            print("Dataframe is corrupted or does not contain necessary information to get the metadata.")
            return False

    # Check if filename is provided and is correct
    if filename is not None:
        print("Reading " + filename + " from data folder")
        if not Path("data/" + filename).is_file():
            print("File not found. Please check the file name.")
            return False
        print("File found. Checking if metadata already exists")
        stem = _metadata_stem(filename)
        if Path("data/" + stem + "_metadata.csv").is_file():
            print("Metadata for " + stem + ".csv already exists.")
            return False
        rs_data = pd.read_csv('data/' + filename)
        if not ('id' in rs_data and 'link' in rs_data):
            print("File is corrupted or does not contain necessary information to get the metadata.")
            return False

    get_cores()  # Get number of cores supported by the CPU

    if rs_data is None:
        return _process_all_metadata_files()

    print("Getting metadata of " + str(rs_data.shape[0]) + " ECLIs")
    print("Working. Please wait...")
    rsm_df = _download_metadata(list(rs_data.loc[:, 'id']), rs_data, show_progress=True)

    if save_file == 'y':
        if filename is None or filename == '':
            filename = "custom_rechtspraak_" + datetime.now().strftime("%H-%M-%S") + ".csv"
        stem = _metadata_stem(filename)
        _save_metadata_csv(rsm_df, stem, "CSV file " + stem + "_metadata.csv" + " successfully created.\n")

    get_exe_time(start_time)

    if save_file == 'n':
        return rsm_df

    return True


## Get citations

In [None]:
#ADAPTED FROM THE rechtspraak_citation_extractor package of Maastricht University

import requests
from lxml import etree
import urllib.request
import rdflib
import threading
import json
import pandas as pd
from dotenv import load_dotenv
from requests.auth import HTTPBasicAuth
from tqdm import tqdm
# Presumably loads settings from a local .env file into the environment; nothing in this cell reads them directly
load_dotenv()

# LIDO link service endpoint queried by find_citations_for_case and get_citations
LIDO_ENDPOINT = "http://linkeddata.overheid.nl/service/get-links"

# Dictionary keys used for the citation records built below.
# NOTE: `type` shadows the builtin of the same name for the rest of this module.
target_ecli = 'target_ecli'
label = 'label'
type = 'type'
ecli = 'ecli'
# Field names of a case-law citation record and of a legislation citation record
case_citations_fieldnames = [target_ecli, label, type]
legislation_citations_fieldnames = ['legal_provision_url_lido', 'legal_provision_url', 'legal_provision']


def remove_spaces_from_ecli(ecli):
    """Return `ecli` with every space character removed."""
    return "".join(ecli.split(" "))


def write_incremental_rows(filename, data):
    """Append `data` as CSV rows to `filename`, writing the header only when the file is still empty.

    The file is opened with newline='' as pandas requires for file objects (otherwise every row
    is followed by an empty line on Windows) and as UTF-8, like the other CSV files written here.

    :param filename: path of the CSV file to append to (created when missing)
    :param data: anything accepted by the pandas DataFrame constructor
    """
    with open(filename, 'a', newline='', encoding='utf8') as f:
        # In append mode the position is the current file size, so 0 means "nothing written yet"
        pd.DataFrame(data).to_csv(f, mode='a', header=not f.tell(), index=False)


# Code to execute LIDO API call
def get_lido_response(url, username, password):
    """Request `url` with HTTP basic authentication and return the response body as text.

    :raises Exception: when the status code of the response is not 200
    """
    response = requests.get(url, auth=HTTPBasicAuth(username, password))
    if response.status_code != 200:
        raise Exception('LinkedData responded with code {}: {}. {}'.format(response.status_code, response.reason, url))
    return response.text


# Extract the ECLI code from the LIDO identifier of the cited case law from the XML response from LIDO API
def get_ecli(sub_ref):
    """Return the part of the element's 'idref' attribute that follows the last '/'."""
    identifier = sub_ref.attrib['idref']
    return identifier.rsplit('/', 1)[-1]


# Extract the LIDO identifier of the cited legislation from the XML response from LIDO API
def get_legislation_identifier(sub_ref):
    """Return the value of the element's 'idref' attribute unchanged."""
    identifier = sub_ref.attrib['idref']
    return identifier


# Find the webpage expressing, in writing, the legislation referred to by the input LIDO identifier
def get_legislation_webpage(identifier):
    """Look up the dcterms identifier published at the LIDO URL `identifier`.

    The RDF document behind `identifier` is downloaded and scanned for
    http://purl.org/dc/terms/identifier values. A value containing the last path component of
    `identifier` is returned as soon as one is seen; otherwise the last value encountered is
    returned, or "" when there is none.

    :param identifier: LIDO identifier, which is also the URL that is fetched
    :return: the matching identifier object from the graph, or ""
    """
    version_date = identifier.split("/")[-1]

    g = rdflib.Graph()
    # Close the HTTP response once the graph has been parsed; it used to be left open
    with urllib.request.urlopen(identifier) as page:
        g.parse(page, format="xml")

    article = ""
    for s, p, o in g:
        if str(p) == "http://purl.org/dc/terms/identifier":
            article = o
            if version_date in str(o):
                return o

    return article


def get_legislation_name(url, username, password):
    """Fetch the LIDO document at `url` and return the text of its dcterms title element.

    Only grandchildren of the XML root are inspected. When several title elements are present
    the last one wins; "" is returned when there is none.
    """
    raw_xml = get_lido_response(url, username, password)
    root = etree.fromstring(bytes(raw_xml, encoding='utf8'))

    title_tag = "{http://purl.org/dc/terms/}title"
    name = ""
    for description in root.iterchildren():
        for node in description.iterchildren():
            if node.tag == title_tag:
                name = node.text

    return name


# Check if outgoing links in the XML response from the LIDO API are of type "Jurisprudentie" (case law)
def is_case_law(sub_ref):
    """Return True when the element's 'groep' attribute equals 'Jurisprudentie'."""
    group = sub_ref.attrib['groep']
    return group == 'Jurisprudentie'


# Check if outgoing links in the XML response from the LIDO API are of type "Wet" (legislation)
def is_legislation(sub_ref):
    """Return True when the element's 'groep' attribute is 'Wet' or 'Artikel'."""
    group = sub_ref.attrib['groep']
    return group in ('Wet', 'Artikel')


# Build the LIDO identifier (linked-data URI) of a case from its ECLI code.
# Example of a LIDO identifier (for legislation): "https://linkeddata.overheid.nl/terms/bwb/id/BWBR0020368/8655654/2016-08-11/2016-08-11"
def get_lido_id(ecli):
    """Return the LIDO case-law identifier for `ecli`: the jurisprudentie base URI followed by the ECLI."""
    base_uri = "http://linkeddata.overheid.nl/terms/jurisprudentie/id/"
    return "".join((base_uri, ecli))


# Method written by Marion
"""
These methods are used to write the citations incrementally to the csv file (in case it crashes or times out).
It allows us to stop the script whenever we want without losing our data, and without having to start from the beginning the next time.
"""


# Main method to execute LIDO API call on a list of ECLIs from a CSV file and extract the citations of each
# Add the implementation of the incremental writing of rows
def find_citations_for_cases(dataframe, username, password):
    """Collect the citations of every non-null ECLI in `dataframe['ecli']`, one case after the other.

    :return: three dataframes: incoming case citations, outgoing case citations, cited legislation
    """
    eclis = list(dataframe.reset_index(drop=True)['ecli'].dropna())
    # Accumulators in the order incoming, outgoing, legislation
    totals = ([], [], [])

    for ecli in eclis:
        incoming, outgoing, legislations = find_citations_for_case(
            remove_spaces_from_ecli(ecli), case_citations_fieldnames, legislation_citations_fieldnames, username,
            password)
        for total, batch in zip(totals, (incoming, outgoing, legislations)):
            if batch:
                total.extend(batch)

    df_incoming, df_outgoing, df_legislations = (pd.DataFrame(total) for total in totals)
    return df_incoming, df_outgoing, df_legislations


def citations_multithread_single(big_incoming, big_outgoing, big_legislations, ecli, username, password, current_index,bar):
    """Worker body: look up the citations for one chunk of ECLIs.

    Every non-empty result is stored as a JSON string under the label `current_index` plus the
    position of the ECLI inside the chunk. The three resulting series are appended to the shared
    lists `big_incoming`, `big_outgoing` and `big_legislations`, and `bar` is advanced once per ECLI.
    """
    incoming_df = pd.Series([], dtype='string')
    outgoing_df = pd.Series([], dtype='string')
    legislations_df = pd.Series([], dtype='string')

    for offset, single_ecli in enumerate(ecli):
        index = current_index + offset
        case_citations_incoming, case_citations_outgoing, legislation_citations = find_citations_for_case(
            remove_spaces_from_ecli(single_ecli), case_citations_fieldnames, legislation_citations_fieldnames,
            username, password)
        results = (
            (incoming_df, case_citations_incoming),
            (outgoing_df, case_citations_outgoing),
            (legislations_df, legislation_citations),
        )
        for series, citations in results:
            if citations:
                series[index] = json.dumps(citations)
        bar.update(1)

    big_incoming.append(incoming_df)
    big_outgoing.append(outgoing_df)
    big_legislations.append(legislations_df)


def add_column_frow_list(data, name, list):
    """Combine the series in `list` into one column and insert it into `data` as column `name` at position 1.

    The series are concatenated in a single pd.concat call (public API, one pass) instead of
    repeatedly calling the private Series._append. Values are matched to the rows of `data` by
    index label; rows without a value stay missing.

    :param data: dataframe that is modified in place
    :param name: name of the new column; must not exist in `data` yet
    :param list: iterable of string series (may be empty)
    """
    column = pd.concat([pd.Series([], dtype='string'), *list]).sort_index()
    data.insert(1, name, column)


def find_citations_for_cases_multithread(dataframe, username, password, threads):
    """Look up the citations of every non-null ECLI in `dataframe` using several threads.

    The ECLIs are split into at most `threads` chunks and each chunk is handled by its own
    thread. The JSON-encoded results are added to `dataframe` as the columns
    'citations_incoming', 'citations_outgoing' and 'legislations_cited'.

    The workers label their results by position in the list of non-null ECLIs. Those positions
    are translated back to the index labels of `dataframe` before the columns are inserted, so
    the results land on the right rows even when some ECLIs are missing or the index is not 0..n-1.

    :param dataframe: dataframe with an 'ecli' column; modified in place and returned
    :param username: LIDO user name
    :param password: LIDO password
    :param threads: requested number of threads (values below 1 are treated as 1)
    :return: `dataframe` with the three citation columns inserted
    :raises ValueError: when the index of `dataframe` contains duplicate labels
    """
    if not dataframe.index.is_unique:
        raise ValueError("The dataframe index must be unique to attach citations to the right rows; "
                         "call reset_index(drop=True) first.")

    valid_ecli = dataframe['ecli'].dropna()
    row_labels = valid_ecli.index
    ecli = valid_ecli.reset_index(drop=True)
    length = ecli.size

    # Ceiling division gives at most `threads` chunks; never 0, which range() rejects as a step
    n_threads = max(1, int(threads))
    chunk_size = max(1, -(-length // n_threads))

    # Kept as module-level names, as before, so the raw per-thread results can still be inspected
    global big_incoming, big_outgoing, big_legislations

    big_incoming = []
    big_outgoing = []
    big_legislations = []
    workers = []
    bar = tqdm(total=length, colour="GREEN",position=0, leave=True,miniters=int(length/100),maxinterval=10000)
    for i in range(0, length, chunk_size):
        curr_ecli = ecli[i:(i + chunk_size)]
        t = threading.Thread(target=citations_multithread_single,
                             args=[big_incoming, big_outgoing, big_legislations, curr_ecli, username, password, i,bar])
        workers.append(t)
    for t in workers:
        t.start()
    for t in workers:
        t.join()
    bar.close()

    def _relabel(chunks):
        # Replace the positional labels written by the workers with the dataframe's own index labels
        relabelled = []
        for chunk in chunks:
            positions = chunk.index.to_numpy(dtype='int64')
            chunk = chunk.copy()
            chunk.index = row_labels.take(positions)
            relabelled.append(chunk)
        return relabelled

    add_column_frow_list(dataframe, 'citations_incoming', _relabel(big_incoming))
    add_column_frow_list(dataframe, 'citations_outgoing', _relabel(big_outgoing))
    add_column_frow_list(dataframe, 'legislations_cited', _relabel(big_legislations))
    return dataframe


def add_citations_no_duplicates(already_existing_list, element):
    """Append the citation described by `element` unless its ECLI is already in the list.

    :return: True when a new citation was appended, False when it was a duplicate
    """
    new_ecli = get_ecli(element)
    if any(stored[target_ecli] == new_ecli for stored in already_existing_list):
        return False

    already_existing_list.append({target_ecli: new_ecli,
                                  label: element.attrib['label'],
                                  type: element.attrib['type'].split('/id/')[1]})
    return True


def add_legislations_no_duplicates(list, element):
    """Append the legislation identifier of `element` to `list` unless it is already present.

    :return: True when the identifier was appended, False when it was a duplicate
    """
    new_legislation = get_legislation_identifier(element)
    if new_legislation in list:
        return False
    list.append(get_legislation_identifier(element))
    return True


# Main method to execute LIDO API call on the ECLI code of the input case and extract the citations
def find_citations_for_case(ecli, case_citations_fieldnames, legislation_citations_fieldnames, username, password):
    """Query the LIDO link service for `ecli` and return its citations.

    Result pages are requested until a page contributes nothing new or 16 pages have been
    fetched. A page counts as contributing when *any* of its references is new; previously only
    the last reference processed decided this, so paging could stop while new links were still
    arriving. Each page is parsed once instead of re-parsing all earlier pages on every round.

    NOTE(review): `start` is increased by 1 per request while `rows` is 100; confirm whether the
    service treats `start` as a page number or as a row offset.

    :param ecli: ECLI of the case, without spaces
    :param case_citations_fieldnames: field names passed on to extract_results_citations
    :param legislation_citations_fieldnames: field names passed on to extract_results_legislations
    :param username: LIDO user name
    :param password: LIDO password
    :return: tuple (incoming case citations, outgoing case citations, legislation citations)
    :raises Exception: propagated from get_lido_response when the service does not answer with 200
    """
    case_law_citations_outgoing = []
    legislation_citations = []
    case_law_citations_incoming = []
    start_page = 0
    end_of_pages = False
    outgoing = "uitgaande-links"
    incoming = "inkomende-links"

    while not end_of_pages:
        added_sth_new = False
        url = "{}?id={}&start={}&rows={}&output=xml".format(LIDO_ENDPOINT, get_lido_id(ecli), start_page, 100)
        start_page += 1

        xml_text = get_lido_response(url, username, password)
        page = etree.fromstring(xml_text.encode('utf8'))

        for sub in page.iterchildren('subject'):

            for the_citations in sub.iterchildren(outgoing):
                for sub_ref in the_citations.iterchildren():
                    if is_case_law(sub_ref):
                        if add_citations_no_duplicates(case_law_citations_outgoing, sub_ref):
                            added_sth_new = True
                    elif is_legislation(sub_ref):
                        if add_legislations_no_duplicates(legislation_citations, sub_ref):
                            added_sth_new = True

            for the_citations in sub.iterchildren(incoming):
                for sub_ref in the_citations.iterchildren():
                    if is_case_law(sub_ref):
                        if add_citations_no_duplicates(case_law_citations_incoming, sub_ref):
                            added_sth_new = True

        if not added_sth_new or start_page > 15:
            end_of_pages = True

    # Drop citations without an ECLI and the case's own ECLI (for some reason a case can cite itself...).
    # New lists are built because removing items from a list while iterating over it skips elements.
    own_ecli = remove_spaces_from_ecli(ecli)
    case_law_citations_incoming = [item for item in case_law_citations_incoming
                                   if item[target_ecli] != "" and item[target_ecli] != own_ecli]
    case_law_citations_outgoing = [item for item in case_law_citations_outgoing
                                   if item[target_ecli] != "" and item[target_ecli] != own_ecli]

    case_law_result_outgoing = extract_results_citations(case_law_citations_outgoing, ecli, case_citations_fieldnames)
    case_law_results_incoming = extract_results_citations(case_law_citations_incoming, ecli, case_citations_fieldnames)
    legislation_results = extract_results_legislations(legislation_citations, ecli, legislation_citations_fieldnames,
                                                       username, password)

    return case_law_results_incoming, case_law_result_outgoing, legislation_results


def extract_results_citations(list, ecli, fields):
    """Convert raw case-law citation records into result rows keyed by ``fields``.

    Args:
        list: iterable of citation records; each one is indexed with the
            module-level ``target_ecli`` key and with ``'label'`` and ``'type'``.
        ecli: accepted for signature compatibility; not used in the body.
        fields: output column names. Every name becomes a key initialised to
            None; ``fields[0]`` receives the space-stripped target ECLI,
            ``fields[1]`` the label and ``fields[2]`` the type.

    Returns:
        A list with one dict per citation record, in input order.
    """
    rows = []

    for citation in list:
        # Start from an all-None row so unused columns are still present.
        row = dict.fromkeys(fields)
        cleaned_ecli = remove_spaces_from_ecli(citation[target_ecli])
        row[fields[0]] = cleaned_ecli  # target ECLI
        row[fields[1]] = citation['label']  # citation label
        row[fields[2]] = citation['type']  # citation type
        rows.append(row)

    return rows


def extract_results_legislations(list, ecli, fields, username, password):
    """Convert cited legislation references into result rows keyed by ``fields``.

    Args:
        list: iterable of legislation references.
        ecli: accepted for signature compatibility; not used in the body.
        fields: output column names. Every name becomes a key initialised to
            None; ``fields[0]`` receives the reference itself, ``fields[1]`` the
            value of ``get_legislation_webpage`` and ``fields[2]`` the value of
            ``get_legislation_name``.
        username: forwarded to ``get_legislation_name``.
        password: forwarded to ``get_legislation_name``.

    Returns:
        A list with one dict per legislation reference, in input order.
    """

    def to_row(reference):
        # Start from an all-None row so unused columns are still present.
        row = dict.fromkeys(fields)
        row[fields[0]] = reference  # target article
        row[fields[1]] = get_legislation_webpage(reference)  # target article webpage
        row[fields[2]] = get_legislation_name(reference, username, password)  # article name
        return row

    return [to_row(reference) for reference in list]


def get_citations(dataframe=None, username="", password="", threads=1):
    """Run the citation extraction for every case in ``dataframe``.

    Args:
        dataframe: the cases to process; passed on unchanged to
            ``find_citations_for_cases_multithread``.
        username: LIDO user name; must be non-empty.
        password: LIDO password; must be non-empty.
        threads: forwarded to ``find_citations_for_cases_multithread``.

    Returns:
        False when an argument is missing or the LIDO check call raises;
        otherwise whatever ``find_citations_for_cases_multithread`` returns.
    """
    if dataframe is None or not username or not password:
        print("Incorrect arguments passed. Returning...")
        return False
    try:
        # Probe the endpoint once up front so bad credentials fail fast.
        get_lido_response(LIDO_ENDPOINT, username, password)
    except Exception:
        # Catch Exception rather than using a bare except, so Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the run.
        print('LIDO cannot be accessed with these login details. Returning...')
        return False
    print('\n--- START OF RS CITATIONS EXTRACTIONS ---\n')

    # Delegate the actual extraction to the multithreaded helper.
    df = find_citations_for_cases_multithread(dataframe, username, password, threads)

    print("\n--- DONE ---")
    return df

In [None]:
import rechtspraak_citations_extractor as rex_citations


## Get ECLIs HR

In [None]:
# Start offsets used to page through the API: 1, 1001, ..., 43001 (44 values, step 1000).
from_values = range(1,44001,1000)


In [None]:
# Collect every downloaded page of Hoge Raad records into one flat list.
json_rechtspraak = []

for from_value in from_values:
    new_rechtspraak = get_rechtspraak(max_ecli=1000, sd='1900-01-01', ed='2023-12-31', save_file='n', from_value=from_value, instantie="http://standaarden.overheid.nl/owms/terms/Hoge_Raad_der_Nederlanden")
    if not new_rechtspraak:
        # An empty page has no last record; indexing [-1] would raise IndexError.
        print(from_value, "no results")
        continue
    # Progress log: page offset and the id of the last record on that page.
    print(from_value, new_rechtspraak[-1]['id'])
    json_rechtspraak.extend(new_rechtspraak)




In [None]:
df = save_csv(json_rechtspraak, 'x', 'n')

In [None]:
# to csv
save_csv(json_rechtspraak, 'HR_rechtspraak', 'y')

In [None]:
#to pickle
df.to_pickle("HR_rechtspraak.pkl")

## Get metadata HR

In [None]:
get_rechtspraak_metadata(save_file='y', dataframe=df)

## Get citations HR

In [None]:
# Split the frame into chunks of 10,000 rows so citations can be fetched chunk by chunk.
df_part1 = df.iloc[:10000, :]
df_part2 = df.iloc[10000:20000, :]
df_part3 = df.iloc[20000:30000, :]
df_part4 = df.iloc[30000:40000, :]
# Open-ended slice: the last chunk takes every remaining row instead of stopping
# at a hard-coded 44718, which would silently drop rows from a longer frame.
df_part5 = df.iloc[40000:, :]


In [None]:
print(df_part1.shape)
print(df_part2.shape) 
print(df_part3.shape)
print(df_part4.shape) 
print(df_part5.shape)

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations1 = pd.read_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part1.csv"))

In [None]:
df_citations1 = df_citations1.drop(df_citations1.columns[0], axis=1)

In [None]:
df_citations1.shape

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations = get_citations(dataframe=df_part1, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=3)

In [None]:
df_citations

In [None]:
df_citations["citations_incoming"].nunique()

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part1.csv"))

In [None]:
df_part2_1 = df_part2.iloc[:2500,:].reset_index(drop=True)

In [None]:
df_part2_1 = df_part2.iloc[:2500,:].reset_index(drop=True)
df_part2_2 = df_part2.iloc[2500:5000,:].reset_index(drop=True)
df_part2_3 = df_part2.iloc[5000:7500,:].reset_index(drop=True)
df_part2_4 = df_part2.iloc[7500:10000,:].reset_index(drop=True)


In [None]:
df_part2_1

In [None]:
print(df_part2.shape)
print(df_part2_1.shape)
print(df_part2_2.shape)
print(df_part2_3.shape)
print(df_part2_4.shape)

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations2_1 = get_citations(dataframe=df_part2_1, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations2_1

In [None]:
df_citations2_1["citations_outgoing"].nunique()

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations2_1.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part2_1.csv"))

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations2_2 = get_citations(dataframe=df_part2_2, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations2_2

In [None]:
df_citations2_2["legislations_cited"].nunique()

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations2_2.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part2_2.csv"))

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations2_3 = get_citations(dataframe=df_part2_3, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations2_3

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations2_3.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part2_3.csv"))

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations2_4 = get_citations(dataframe=df_part2_4, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations2_4

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations2_4.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part2_4.csv"))

In [None]:
list_df_citations2 = [df_citations2_1, df_citations2_2, df_citations2_3, df_citations2_4]

In [None]:
df_citations2 = pd.concat(list_df_citations2, ignore_index=True)

In [None]:
print(df_citations2.shape)

In [None]:
df_part2.shape

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations2.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part2.csv"))

In [None]:
df_part3_1 = df_part3.iloc[:2500,:].reset_index(drop=True)
df_part3_2 = df_part3.iloc[2500:5000,:].reset_index(drop=True)
df_part3_3 = df_part3.iloc[5000:7500,:].reset_index(drop=True)
df_part3_4 = df_part3.iloc[7500:10000,:].reset_index(drop=True)

In [None]:
print(df_part3.shape)
print(df_part3_1.shape)
print(df_part3_2.shape)
print(df_part3_3.shape)
print(df_part3_4.shape)

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations3_1 = get_citations(dataframe=df_part3_1, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations3_1

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations3_1.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part3_1.csv"))

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations3_2 = get_citations(dataframe=df_part3_2, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations3_2

In [None]:
df_citations3_2["citations_incoming"].nunique()

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations3_2.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part3_2.csv"))

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations3_3 = get_citations(dataframe=df_part3_3, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations3_3

In [None]:
df_citations3_3["citations_incoming"].nunique()

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations3_3.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part3_3.csv"))

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations3_4 = get_citations(dataframe=df_part3_4, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations3_4

In [None]:
df_citations3_4["citations_incoming"].nunique()

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations3_4.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part3_4.csv"))

In [None]:
list_df_citations3 = [df_citations3_1, df_citations3_2, df_citations3_3, df_citations3_4]

In [None]:
df_citations3 = pd.concat(list_df_citations3, ignore_index=True)

In [None]:
print(df_citations3.shape)

In [None]:
df_part3.shape

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations3.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part3.csv"))

In [None]:
df_part4_1 = df_part4.iloc[:2500,:].reset_index(drop=True)

In [None]:
df_part4_1 = df_part4.iloc[:2500,:].reset_index(drop=True)
df_part4_2 = df_part4.iloc[2500:5000,:].reset_index(drop=True)
df_part4_3 = df_part4.iloc[5000:7500,:].reset_index(drop=True)
df_part4_4 = df_part4.iloc[7500:10000,:].reset_index(drop=True)

In [None]:
print(df_part4.shape)
print(df_part4_1.shape)
print(df_part4_2.shape)
print(df_part4_3.shape)
print(df_part4_4.shape)

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations4_1 = get_citations(dataframe=df_part4_1, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations4_1

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations4_1.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part4_1.csv"))

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations4_2 = get_citations(dataframe=df_part4_2, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
# NOTE(review): this rebinds df_citations4_2 to the input frame df_part4_2, discarding whatever
# get_citations returned in the cell above. The CSV written below only contains citation data
# if get_citations filled df_part4_2 in place — confirm before relying on the part4_2 file.
df_citations4_2 = df_part4_2

In [None]:
df_citations4_2

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations4_2.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part4_2.csv"))

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations4_3 = get_citations(dataframe=df_part4_3, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
# NOTE(review): this rebinds df_citations4_3 to the input frame df_part4_3, discarding whatever
# get_citations returned in the cell above. The CSV written below only contains citation data
# if get_citations filled df_part4_3 in place — confirm before relying on the part4_3 file.
df_citations4_3 = df_part4_3

In [None]:
df_citations4_3

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations4_3.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part4_3.csv"))

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations4_4 = get_citations(dataframe=df_part4_4, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations4_4

In [None]:
df_part4

In [None]:
df_part4

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations4_4.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part4_4.csv"))

In [None]:
list_df_citations4 = [df_citations4_1, df_citations4_2, df_citations4_3, df_citations4_4]


In [None]:
df_citations4 = pd.concat(list_df_citations4, ignore_index=True)


In [None]:
print(df_citations4.shape)

In [None]:
df_part4.shape


In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations4.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part4.csv"))

In [None]:
df_part5

In [None]:
# Split part 5 into two sub-chunks with fresh 0-based indexes.
df_part5_1 = df_part5.iloc[:2500, :].reset_index(drop=True)
# Open-ended slice: the previous stop of 4717 dropped the last row of a 4718-row chunk.
df_part5_2 = df_part5.iloc[2500:, :].reset_index(drop=True)

In [None]:
print(df_part5.shape)
print(df_part5_1.shape)
print(df_part5_2.shape)

In [None]:
df_part5

In [None]:
df_part5_2

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations5_1 = get_citations(dataframe=df_part5_1, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations5_1

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations5_1.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part5_1.csv"))

In [None]:
# Credentials come from the LIDO_USERNAME / LIDO_PASSWORD environment variables, not from literals in the notebook.
df_citations5_2 = get_citations(dataframe=df_part5_2, username=os.environ["LIDO_USERNAME"], password=os.environ["LIDO_PASSWORD"], threads=2)

In [None]:
df_citations5_2

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations5_2.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_part5_2.csv"))

In [None]:
list_df_citations5 = [df_citations5_1, df_citations5_2]


In [None]:
df_citations5 = pd.concat(list_df_citations5, ignore_index=True)


In [None]:
df_part5.shape

In [None]:
df_citations5.shape

In [None]:
list_df_full = [df_citations1, df_citations2, df_citations3, df_citations4, df_citations5]


In [None]:
df_citations_full = pd.concat(list_df_full, ignore_index=True)


In [None]:
df_citations_full

In [None]:
df_citations_full.shape

In [None]:
df_citations_full.dtypes

In [None]:
df_citations_full["ecli"].nunique()

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations_full.to_csv(os.path.join("data", "HR_rechtspraak_metadata_citations_full.csv"), index=False)

In [None]:
# os.path.join avoids the invalid "\H" escape sequence of the backslash string literal.
df_citations_full.to_pickle(os.path.join("data", "HR_rechtspraak_metadata_citations_full.pkl"))