### Downloads all laws as XML, parses them, and prints them to the console

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
# Index page that links to every law.
url = 'https://www.riigiteataja.ee/lyhendid.html'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from scraping an HTTP error page as if it were the index.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
links = soup.find_all('a')

law_url = 'https://www.riigiteataja.ee/akt/'
# .get() instead of ['href']: anchors without an href attribute would raise KeyError.
law_links = [link for link in links if law_url in link.get('href', '')]
# Consecutive anchors are paired up: the href and text of the first, the text of the second.
# NOTE(review): presumably each law is listed as (title link, abbreviation link) - confirm against the page.
zipped_law_links = [[x['href'], x.text, y.text] for x, y in zip(law_links[::2], law_links[1::2])]

total = len(zipped_law_links)
for position, link in enumerate(zipped_law_links):
    # The XML version of a law is requested by appending '.xml' to its link.
    url = link[0] + '.xml'
    response = requests.get(url, timeout=30)
    response.encoding = 'utf-8'
    law_text = response.text
    link.append(law_text)
    # enumerate() gives the real position; list.index() searched by equality on every iteration.
    print(position, '/', total, link[1], 'from:', url)

In [None]:
import pandas as pd

# One row per law: the row lists built by the download cell become the table rows.
df = pd.DataFrame(data=zipped_law_links)
# Persist the table so the download does not have to be repeated.
df.to_csv(path_or_buf='law_links.csv')

In [None]:
import pandas as pd

# Reload the saved table, replacing the stored header row with readable column names.
df = pd.read_csv(
    'law_links.csv',
    header=0,
    index_col=0,
    names=['link', 'title', 'short-title', 'xml'],
)
df

In [None]:
%pip install xmltodict

In [None]:
import xmltodict
import re

In [None]:
def xml_preprocess(xml):
    """Flatten inline markup in a law's XML text and parse it with xmltodict.

    xml: the XML document as a string.
    Returns whatever ``xmltodict.parse`` returns for the cleaned-up text.

    Rewrites applied before parsing:
      * <viide ...> / </viide>  -> <tavatekst> / </tavatekst>
      * <i ...> / </i>          -> removed
      * <sup ...>               -> '_' ; </sup> -> removed
      * <reavahetus ...>        -> removed
    """
    def tag(name):
        # The lookahead forces the tag name to end right after `name`, so 'i'
        # matches <i> and <i class="x"> but no longer <item> or any other
        # element whose name merely starts with the same letters.
        return fr'<{name}(?=[\s/>])[^>]*>'

    # replace <viide> with <tavatekst> to preserve order as it's used for links in text
    # A self-closing <viide/> is handled first so it cannot turn into an
    # unclosed <tavatekst> that would make the document malformed.
    xml = re.sub(r'<viide(?=[\s/>])[^>]*/>', '<tavatekst/>', xml)
    xml = re.sub(tag('viide'), '<tavatekst>', xml)
    xml = re.sub(tag('/viide'), '</tavatekst>', xml)

    # remove <i> and </i> tags
    xml = re.sub(tag('i'), '', xml)
    xml = re.sub(tag('/i'), '', xml)

    # replace <sup> with underscore and remove </sup> tags
    xml = re.sub(tag('sup'), '_', xml)
    xml = re.sub(tag('/sup'), '', xml)

    # remove <reavahetus/>
    xml = re.sub(tag('reavahetus'), '', xml)

    return xmltodict.parse(xml)

In [None]:
# with open('./109082022006.akt', 'r') as file:
#     xml = file.read()
#     # replace <viide> with <tavatekst> to preserve order as it's used for links in text
#     xml = xml.replace('<viide>', '<tavatekst>')
#     xml = xml.replace('</viide>', '</tavatekst>')
#     # remove <i> and </i> tags
#     xml = xml.replace('<i>', '')
#     xml = xml.replace('</i>', '')
#     # replace <sup> with underscore and remove </sup> tags
#     xml = xml.replace('<sup>', '_')
#     xml = xml.replace('</sup>', '')
#     # remove <reavahetus/>
#     xml = xml.replace('<reavahetus/>', '')
#     print(xml)
#     data_dict = xmltodict.parse(xml)

In [None]:
# data_dict['oigusakt']['metaandmed']['lyhend']
# data_dict['oigusakt']['metaandmed']['kehtivus']['kehtivuseAlgus']
# data_dict['oigusakt']['aktinimi']['nimi']['pealkiri']

In [None]:
from collections import namedtuple

def get_list(d, key):
    """Return ``d[key]`` as a list.

    d: a parsed element; anything that is not a dict is treated as having no children.
    key: child name to look up.

    Returns [] when the key is absent (or ``d`` is not a dict), the value
    itself when it is already a list, and a one-item list otherwise.
    """
    # Guard against strings/None: `key in "some text"` is a substring test and
    # the following d[key] would raise TypeError.
    if not isinstance(d, dict) or key not in d:
        return []
    value = d[key]
    return value if isinstance(value, list) else [value]

def get_or_default(d, key, default):
    """Return ``d[key]`` when ``d`` is a dict holding a non-None value for ``key``.

    In every other case (``d`` is not a dict, the key is missing, or the
    stored value is None) ``default`` is returned.
    """
    # isinstance rather than `type(d) is dict`, so dict subclasses such as
    # OrderedDict are looked into as well.
    if isinstance(d, dict):
        value = d.get(key)
        if value is not None:
            return value
    return default

def get_text(v):
    """Return the '#text' entry of ``v`` when it has one, otherwise ``v`` unchanged."""
    fallback = v
    return get_or_default(v, '#text', fallback)
    
def get_text_arr(value):
    """Concatenate the text of a list of parsed text elements.

    value: list of elements; each may be a plain string or a dict whose text
    sits under 'kuvatavTekst' and/or '#text'. Anything that is not a list
    yields ''.

    Returns the joined text ('' for an empty list).
    """
    if not isinstance(value, list):
        return ''
    texts = (get_text(get_or_default(item, 'kuvatavTekst', item)) for item in value)
    # Only real strings are joined: None (empty element) and dicts without any
    # text would otherwise make ''.join raise TypeError.
    return ''.join(text for text in texts if isinstance(text, str))

NUMBERING = namedtuple('Numbering', ['inactive', 'nr', 'idx'])


def get_numbering(dict, key, ofList):
    """Work out the number of an element among its siblings.

    dict: the parsed element (the name shadows the builtin; it is kept so
        existing callers are unaffected).
    key: name of the child that holds the element's number.
    ofList: the sibling list ``dict`` belongs to.

    Returns a NUMBERING tuple:
      inactive - True when the number's '@kehtiv' attribute is '0'
      nr       - the number's text, or the element's 1-based position among
                 its siblings when no usable number is present
      idx      - the number's '@ylaIndeks' attribute, '' when absent
    """
    from collections.abc import Mapping

    def position():
        # Identity match first: list.index() compares by equality and would
        # give identical siblings the position of the first one.
        for place, candidate in enumerate(ofList, start=1):
            if candidate is dict:
                return place
        return ofList.index(dict) + 1

    # The positional fallback is only computed when it is actually needed.
    number = get_or_default(dict, key, None)
    if number is None:
        number = position()
    active = get_or_default(number, '@kehtiv', '1')
    numberComplex = get_or_default(number, '#text', number)
    numberIndex = get_or_default(number, '@ylaIndeks', '')
    # A number element carrying attributes but no text is still a mapping here;
    # `numberComplex is dict` compared against the parameter and never matched.
    if isinstance(numberComplex, Mapping):
        numberComplex = position()
    return NUMBERING(active == '0', numberComplex, numberIndex)

def print_line(depth = 0, group = '', nr = '', nridx = ' ', text = ''):
    """Print one outline line: ``depth`` tab characters, then group, nr, nridx and text, space-separated."""
    indent = '\t' * depth
    fields = (indent, group, nr, nridx, text)
    print(' '.join(str(field) for field in fields))


In [None]:
def parse_subpoints(subpoints):
    """Print each subpoint at depth 6 and descend into those that are not inactive."""
    for item in subpoints:
        numbering = get_numbering(item, 'alampunktNr', subpoints)
        content = get_or_default(item, 'sisuTekst', {})
        body = get_text_arr(get_list(content, 'tavatekst'))
        print_line(6, 'subpoint', numbering.nr, numbering.idx, body)
        # Inactive entries are still printed, but their children are skipped.
        if not numbering.inactive:
            get_next(item)
        
def parse_subparagraphs(subparagraphs):
    """Print each subparagraph at depth 5 and descend into those that are not inactive."""
    for item in subparagraphs:
        numbering = get_numbering(item, 'loigeNr', subparagraphs)
        content = get_or_default(item, 'sisuTekst', {})
        body = get_text_arr(get_list(content, 'tavatekst'))
        print_line(5, 'subparagraph', numbering.nr, numbering.idx, body)
        # Inactive entries are still printed, but their children are skipped.
        if not numbering.inactive:
            get_next(item)

def parse_paragraphs(paragraphs):
    """Print each paragraph heading at depth 4 and descend into those that are not inactive."""
    for item in paragraphs:
        numbering = get_numbering(item, 'paragrahvNr', paragraphs)
        # 'none' is printed when the paragraph carries no title.
        heading = get_text(get_or_default(item, 'paragrahvPealkiri', 'none'))
        print_line(4, 'paragraph', numbering.nr, numbering.idx, heading)
        # Inactive entries are still printed, but their children are skipped.
        if not numbering.inactive:
            get_next(item)
    
def parse_sections(sections):
    """Print each section heading at depth 3 and descend into those that are not inactive."""
    for item in sections:
        numbering = get_numbering(item, 'jaguNr', sections)
        # 'none' is printed when the section carries no title.
        heading = get_text(get_or_default(item, 'jaguPealkiri', 'none'))
        print_line(3, 'section', numbering.nr, numbering.idx, heading)
        # Inactive entries are still printed, but their children are skipped.
        if not numbering.inactive:
            get_next(item)
        
def parse_chapters(chapters):
    """Print each chapter heading at depth 2 and descend into those that are not inactive."""
    for item in chapters:
        numbering = get_numbering(item, 'peatykkNr', chapters)
        # 'none' is printed when the chapter carries no title.
        heading = get_text(get_or_default(item, 'peatykkPealkiri', 'none'))
        print_line(2, 'chapter', numbering.nr, numbering.idx, heading)
        # Inactive entries are still printed, but their children are skipped.
        if not numbering.inactive:
            get_next(item)

def parse_parts(parts):
    """Print each part heading at depth 1 and descend into those that are not inactive."""
    for item in parts:
        numbering = get_numbering(item, 'osaNr', parts)
        # 'none' is printed when the part carries no title.
        heading = get_text(get_or_default(item, 'osaPealkiri', 'none'))
        print_line(1, 'part', numbering.nr, numbering.idx, heading)
        # Inactive entries are still printed, but their children are skipped.
        if not numbering.inactive:
            get_next(item)

# Child element names, outermost level first, and the parser handling each one.
# The two lists are parallel: PARSERS[i] handles KEYS[i].
KEYS = ['osa', 'peatykk', 'jagu', 'paragrahv', 'loige', 'alampunkt']
PARSERS = [parse_parts, parse_chapters, parse_sections, parse_paragraphs, parse_subparagraphs, parse_subpoints]

def get_next(d):
    """Dispatch every known child level found in ``d`` to its parser.

    d: a parsed element. Elements that are not dicts (plain text or empty
    elements) have no children and are ignored.
    """
    # Without this guard a text-only element turns `key in d` into a substring
    # test (and d[key] into a TypeError), and an empty element (None) raises.
    if not isinstance(d, dict):
        return
    for key, parser in zip(KEYS, PARSERS):
        if key in d:
            parser(get_list(d, key))

In [None]:
# Prints every law to the console; nothing is stored or returned.
# Laws whose XML cannot be parsed are reported and skipped instead of aborting
# the whole run. Known case: Riiklike elatusrahade seadus (lühend - RERS),
# which doesn't have proper xml formatting.
from xml.parsers.expat import ExpatError

for _, row in df.iterrows():
    print('--------------------------------------')
    print(row['title'])
    try:
        # Only the parse step is guarded, so errors in the printing code still surface.
        law = xml_preprocess(row['xml'])
    except ExpatError as err:
        print('skipped, malformed XML:', err)
        continue
    get_next(law['oigusakt']['sisu'])