### downloads all laws as xml and parses them and prints to console

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
url = 'https://www.riigiteataja.ee/lyhendid.html'

r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
links = soup.find_all('a')

law_url = 'https://www.riigiteataja.ee/akt/'
law_links = [link for link in links if law_url in link['href']]
zipped_law_links = [[x['href'], x.text, y.text] for x, y in zip(law_links[::2], law_links[1::2])]
zipped_law_links
for link in zipped_law_links:
    url = link[0] + '.xml'
    response = requests.get(url)
    response.encoding = 'utf-8'
    law_text = response.text
    link.append(law_text)
    print(zipped_law_links.index(link), '/', len(zipped_law_links), link[1], 'from:', url)

In [None]:
import pandas as pd

df = pd.DataFrame(zipped_law_links)
df.to_csv('law_links.csv')

In [1]:
import pandas as pd

df = pd.read_csv('law_links.csv', names=['link', 'title', 'short-title', 'xml'], header=0, index_col=0)
df

Unnamed: 0,link,title,short-title,xml
0,https://www.riigiteataja.ee/akt/113032019027,Abieluvararegistri seadus,AVRS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
1,https://www.riigiteataja.ee/akt/127052022029,Abipolitseiniku seadus,APolS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
2,https://www.riigiteataja.ee/akt/105052022005,Advokatuuriseadus,AdvS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
3,https://www.riigiteataja.ee/akt/116122022023,"Alkoholi-, tubaka-, kütuse- ja elektriaktsiisi...",ATKEAS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
4,https://www.riigiteataja.ee/akt/104012021006,Alkoholiseadus,AS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
...,...,...,...,...
366,https://www.riigiteataja.ee/akt/123122022024,Äriregistri seadus,ÄRS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
367,https://www.riigiteataja.ee/akt/123122022033,Äriseadustik,ÄS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
368,https://www.riigiteataja.ee/akt/122032022010,Ühistranspordiseadus,ÜTS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
369,https://www.riigiteataja.ee/akt/130122021020,Ühisveevärgi ja -kanalisatsiooni seadus,ÜVVKS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."


In [None]:
%pip install xmltodict

In [2]:
import xmltodict
import re

In [3]:
def xml_preprocess(xml):
    tag = lambda tag: fr'<({tag}[^>]*)>'
    # replace <viide> with <tavatekst> to preserve order as it's used for links in text
    # xml = xml.replace('<viide>', '<tavatekst>')
    # xml = xml.replace('</viide>', '</tavatekst>')
    xml = re.sub(tag('viide'), '<tavatekst>', xml)
    xml = re.sub(tag('/viide'), '</tavatekst>', xml)

    # remove <i> and </i> tags
    # xml = xml.replace('<i>', '')
    # xml = xml.replace('</i>', '')
    xml = re.sub(tag('i'), '', xml)
    xml = re.sub(tag('/i'), '', xml)

    # replace <sup> with underscore and remove </sup> tags
    # xml = xml.replace('<sup>', '_')
    # xml = xml.replace('</sup>', '')
    xml = re.sub(tag('sup'), '_', xml)
    xml = re.sub(tag('/sup'), '', xml)

    # remove <reavahetus/>
    # xml = xml.replace('<reavahetus/>', '')
    xml = re.sub(tag('reavahetus'), '', xml)

    return xmltodict.parse(xml)

In [None]:
# data_dict['oigusakt']['metaandmed']['lyhend']
# data_dict['oigusakt']['metaandmed']['kehtivus']['kehtivuseAlgus']
# data_dict['oigusakt']['aktinimi']['nimi']['pealkiri']

In [50]:
from collections import namedtuple

def get_list(d, key):
    if key not in d:
        return []
    if isinstance(d[key], list):
        return d[key]
    return [d[key]]

def get_or_default(d, key, default):
    if type(d) is dict:
        if key in d:
            if d[key] is not None:
                return d[key]
    return default

def get_text(v):
    return get_or_default(v, '#text', v)
    
def get_text_from_arr(value):
    if isinstance(value, list) and len(value) > 0:
        values = [get_or_default(v, 'kuvatavTekst', v) for v in value]
        values = [get_text(v) for v in values]
        values = [v for v in values if v is not None]
        if values is not None:
            return ''.join(values)
    return ''

NUMBERING = namedtuple('Numbering', ['inactive', 'num', 'idx'])
def get_numbering(dict, key, ofList):
    number = get_or_default(dict, key, None)
    active = get_or_default(number, '@kehtiv', '1')
    numberComplex = get_text(number)
    numberIndex = get_or_default(number, '@ylaIndeks', None)
    if numberComplex is dict:
        numberComplex = ofList.index(dict) + 1
    return NUMBERING(active == '0', numberComplex, numberIndex)

def print_line(depth = 0, group = '', nr = '', nridx = ' ', text = ''):
    print('\t'*depth, group, nr, nridx, text)


In [51]:
from __future__ import annotations
from dataclasses import dataclass, field

TEXTREF = namedtuple('TextReference', ['text', 'reference'])

@dataclass
class Element:
    number: int = None
    index: int = None
    text: str = None
    children: list[Element] = field(default_factory=list)

    def add(self, child: Element):
        self.children.append(child)

    def get_text(self, max_length: int):
        # Collect text from this element and its children.
        texts = [TEXTREF(self.text, self)]
        for child in self.children:
            texts.extend(child.get_text(max_length))
        return texts

class Document(Element):
    pass

class Part(Element):
    def add(self, child):
        assert type(child) not in [Document, Part], f"Invalid child element {child}"
        super().add(child)

class Chapter(Element):
    def add(self, child):
        assert type(child) not in [Document, Part, Chapter], f"Invalid child element {child}"
        super().add(child)

class Section(Element):
    def add(self, child):
        assert type(child) not in [Document, Part, Chapter, Section], f"Invalid child element {child}"
        super().add(child)

class Paragraph(Element):
    def add(self, child):
        assert type(child) not in [Document, Part, Chapter, Section, Paragraph], f"Invalid child element {child}"
        super().add(child)

    def get_text(self, max_length):
        # Split the paragraph into multiple strings if it exceeds max_length.
        texts = []
        current_text = ""
        for child in self.children:
            if len(current_text) + len(child.text) > max_length:
                texts.append(TEXTREF(current_text, self))  # Attach a reference to the Paragrahv object.
                current_text = child.text
            else:
                current_text += " " + child.text
        texts.append(TEXTREF(current_text.strip(), self))  # Attach a reference to the Paragrahv object.
        return texts

class Subparagraph(Element):
    def add(self, child):
        assert type(child) not in [Document, Part, Chapter, Section, Paragraph, Subparagraph], f"Invalid child element {child}"
        super().add(child)

class Point(Element):
    def add(self, child):
        assert type(child) not in [Document, Part, Chapter, Section, Paragraph, Subparagraph, Point], f"Invalid child element {child}"
        super().add(child)

In [52]:
def parse_points(points, parent: Element):
    for point in points:
        numbering = get_numbering(point, 'alampunktNr', points)
        texts = get_or_default(point, 'sisuTekst', {})
        text = get_text_from_arr(get_list(texts, 'tavatekst'))
        # print_line(6, 'subpoint', subp_n.num, subp_n.idx, subp_text)
        if numbering.inactive:
            continue

        element = Point(number=numbering.num, index=numbering.idx, text=text)
        parent.add(element)
        get_next(point, element)
        
def parse_subparagraphs(subparagraphs, parent: Element):
    for subparagraph in subparagraphs:
        numbering = get_numbering(subparagraph, 'loigeNr', subparagraphs)
        texts = get_or_default(subparagraph, 'sisuTekst', {})
        text = get_text_from_arr(get_list(texts, 'tavatekst'))
        # print_line(5, 'subparagraph', numbering.num, numbering.idx, text)
        if numbering.inactive:
            continue

        element = Subparagraph(number=numbering.num, index=numbering.idx, text=text)
        parent.add(element)
        get_next(subparagraph, element)

def parse_paragraphs(paragraphs, parent: Document):
    for paragraph in paragraphs:
        numbering = get_numbering(paragraph, 'paragrahvNr', paragraphs)
        title = get_or_default(paragraph, 'paragrahvPealkiri', 'none')
        title = get_text(title)
        # print_line(4, 'paragraph', numbering.num, numbering.idx, title)
        if numbering.inactive:
            continue

        element = Paragraph(number=numbering.num, index=numbering.idx, text=title)
        parent.add(element)
        get_next(paragraph, element)
    
def parse_sections(sections, parent: Document):
    for section in sections:
        numbering = get_numbering(section, 'jaguNr', sections)
        title = get_or_default(section, 'jaguPealkiri', 'none')
        title = get_text(title)
        # print_line(3, 'section', sect_n.num, sect_n.idx, sect_title)
        if numbering.inactive:
            continue
        
        element = Section(number=numbering.num, index=numbering.idx, text=title)
        parent.add(element)
        get_next(section, element)
        
def parse_chapters(chapters, parent: Document):
    for chapter in chapters:
        numbering = get_numbering(chapter, 'peatykkNr', chapters)
        title = get_or_default(chapter, 'peatykkPealkiri', 'none')
        title = get_text(title)
        # print_line(2, 'chapter', ch_n.nr, ch_n.idx, ch_title)
        if numbering.inactive:
            continue

        element = Chapter(number=numbering.num, index=numbering.idx, text=title)
        parent.add(element)
        get_next(chapter, element)

def parse_parts(parts, parent: Document):
    for part in parts:
        numbering = get_numbering(part, 'osaNr', parts)
        title = get_or_default(part, 'osaPealkiri', 'none')
        title = get_text(title)
        # print_line(1, 'part', part_n.nr, part_n.idx, partTitle)
        if numbering.inactive:
            continue

        element = Part(number=numbering.num, index=numbering.idx, text=title)
        parent.add(element)
        get_next(part, element)

KEYS = ['osa', 'peatykk', 'jagu', 'paragrahv', 'loige', 'alampunkt']
PARSERS = [parse_parts, parse_chapters, parse_sections, parse_paragraphs, parse_subparagraphs, parse_points]

def get_next(xml_dict, parent: Element):
    for idx, key in enumerate(KEYS):
        if key in xml_dict:
            PARSERS[idx](get_list(xml_dict, key), parent)

In [60]:
# makes data structures from a law document
for idx, row in df.iterrows():
    print('--------------------------------------')
    print(row['title'])
    doc = Document()
    get_next(xml_preprocess(row['xml'])['oigusakt']['sisu'], doc)
    break # only test 1 at the moment

[text[0] for text in doc.get_text(200)]

--------------------------------------
Abieluvararegistri seadus


[None,
 'ÜLDSÄTTED',
 '',
 'Abieluvararegister on riiklik register, kuhu kantakse abieluvaralepingus toodud ning seaduses sätestatud juhtudel varalised õigused. Abieluvararegistri eesmärk on võimaldada saada kolmandatel isikutel teavet abikaasadevahelise varasuhte ning sellest tulenevate õiguste ja kohustuste kohta. Kui abieluvararegistris puuduvad andmed abikaasade kohta, siis eeldatakse, et abikaasade varalistele suhetele kohaldatakse varaühisuse varasuhet.',
 'Abieluvararegistrit peetakse elektrooniliselt. Abieluvararegistri pidamise ja andmetöötluse korra kehtestabvaldkonna eest vastutav ministermäärusega. ',
 'Abieluvararegistrile ja selle pidamisele kohaldatakse avaliku teabe seaduses andmekogude kohta sätestatut käesolevas seaduses sätestatud erisustega.',
 ' ',
 'Abieluvararegistri vastutav töötleja on Notarite Koda ning volitatud töötlejad on abielu sõlmimise kinnitanud või vaimuliku kinnitatud abielu paberil abielukandelt andmehõive teinud perekonnaseisuametnikud, notarid nin