### Downloads all laws as XML, parses them, and prints the result to the console

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
# Index page listing every law next to its official abbreviation.
url = 'https://www.riigiteataja.ee/lyhendid.html'
# Seconds to wait for any single HTTP request; without a timeout `requests` can block forever.
REQUEST_TIMEOUT_SECONDS = 30

r = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
# Fail fast instead of parsing an error page as if it were the index.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
# href=True skips anchors that have no href attribute; indexing those with ['href'] raises KeyError.
links = soup.find_all('a', href=True)

law_url = 'https://www.riigiteataja.ee/akt/'
law_links = [link for link in links if law_url in link['href']]
# NOTE(review): assumes the page lists two consecutive anchors per law (title, then abbreviation) — confirm.
zipped_law_links = [[x['href'], x.text, y.text] for x, y in zip(law_links[::2], law_links[1::2])]

# Download the XML of every law and append it to its row as a fourth item.
# enumerate() replaces list.index(), which rescanned the list and compared rows holding whole documents.
for position, link in enumerate(zipped_law_links):
    url = link[0] + '.xml'
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Do not store an HTTP error page as if it were the law's XML.
    response.raise_for_status()
    response.encoding = 'utf-8'
    law_text = response.text
    link.append(law_text)
    print(position, '/', len(zipped_law_links), link[1], 'from:', url)

In [None]:
import pandas as pd

# Persist the downloaded rows so later cells can reload them from disk instead of scraping again.
df = pd.DataFrame(data=zipped_law_links)
df.to_csv(path_or_buf='law_links.csv')

In [5]:
import pandas as pd

# Reload the saved rows: header=0 replaces the header row of the file with the
# labels given in `names`, and the first column of the file becomes the index.
df = pd.read_csv(
    'law_links.csv',
    header=0,
    index_col=0,
    names=['link', 'title', 'short-title', 'xml'],
)
df

Unnamed: 0,link,title,short-title,xml
0,https://www.riigiteataja.ee/akt/113032019027,Abieluvararegistri seadus,AVRS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
1,https://www.riigiteataja.ee/akt/127052022029,Abipolitseiniku seadus,APolS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
2,https://www.riigiteataja.ee/akt/105052022005,Advokatuuriseadus,AdvS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
3,https://www.riigiteataja.ee/akt/116122022023,"Alkoholi-, tubaka-, kütuse- ja elektriaktsiisi...",ATKEAS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
4,https://www.riigiteataja.ee/akt/104012021006,Alkoholiseadus,AS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
...,...,...,...,...
366,https://www.riigiteataja.ee/akt/123122022024,Äriregistri seadus,ÄRS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
367,https://www.riigiteataja.ee/akt/123122022033,Äriseadustik,ÄS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
368,https://www.riigiteataja.ee/akt/122032022010,Ühistranspordiseadus,ÜTS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
369,https://www.riigiteataja.ee/akt/130122021020,Ühisveevärgi ja -kanalisatsiooni seadus,ÜVVKS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."


In [None]:
%pip install xmltodict

In [6]:
import xmltodict
import re

In [7]:
def xml_preprocess(xml):
    """Normalise inline markup in a law's XML text and parse it with xmltodict.

    Steps applied to the raw string before parsing:
      * ``<viide ...>`` / ``</viide>`` become ``<tavatekst>`` / ``</tavatekst>``
        so link text keeps its position among the surrounding text nodes;
      * ``<i>`` / ``</i>`` are removed;
      * ``<sup>`` becomes ``^`` and ``</sup>`` is removed;
      * ``<reavahetus/>`` is removed;
      * non-breaking spaces become ordinary spaces.

    Tag names are matched exactly. The previous pattern ``<(i[^>]*)>`` was a
    prefix match and therefore also stripped unrelated tags such as ``<id>``.

    Args:
        xml: XML document as a string.

    Returns:
        Whatever ``xmltodict.parse`` returns for the cleaned string.
    """
    def opening(name):
        # The lookahead requires the tag name to end right here (whitespace,
        # '/' or '>'), while still allowing attributes and self-closing tags.
        return fr'<{name}(?=[\s/>])[^>]*>'

    def closing(name):
        return fr'</{name}\s*>'

    # replace <viide> with <tavatekst> to preserve order as it's used for links in text
    xml = re.sub(opening('viide'), '<tavatekst>', xml)
    xml = re.sub(closing('viide'), '</tavatekst>', xml)

    # remove <i> and </i> tags
    xml = re.sub(opening('i'), '', xml)
    xml = re.sub(closing('i'), '', xml)

    # replace <sup> with ^ and remove </sup> tags
    xml = re.sub(opening('sup'), '^', xml)
    xml = re.sub(closing('sup'), '', xml)

    # remove <reavahetus/>
    xml = re.sub(opening('reavahetus'), '', xml)

    # replace non breaking space
    xml = xml.replace('\xa0', ' ')

    return xmltodict.parse(xml)

In [4]:
# data_dict['oigusakt']['metaandmed']['lyhend']
# data_dict['oigusakt']['metaandmed']['kehtivus']['kehtivuseAlgus']
# data_dict['oigusakt']['aktinimi']['nimi']['pealkiri']

In [8]:
from collections import namedtuple

def get_list(d, key):
    """Return the value stored under ``key`` in ``d`` as a list.

    Args:
        d: Parsed XML node. Anything that is not a dict (a plain string leaf,
            ``None``, a list) is treated as having no children. For a string,
            ``key in d`` would otherwise be a substring test followed by a
            TypeError on ``d[key]``.
        key: Child name to look up.

    Returns:
        ``[]`` when ``d`` is not a dict or lacks ``key``; the stored list when
        the value already is a list; otherwise a one-element list with the value.
    """
    if not isinstance(d, dict) or key not in d:
        return []
    value = d[key]
    return value if isinstance(value, list) else [value]

def get_or_default(d, key, default):
    """Return ``d[key]`` when ``d`` is a dict holding a non-None value there.

    ``isinstance`` is used instead of an exact type comparison so that dict
    subclasses such as ``collections.OrderedDict`` are looked up as well.

    Args:
        d: Candidate container; any non-dict value yields ``default``.
        key: Key to look up.
        default: Value returned when ``d`` is not a dict, the key is missing,
            or the stored value is ``None``.
    """
    if isinstance(d, dict):
        value = d.get(key)
        if value is not None:
            return value
    return default

def get_text(v):
    """Return ``v['#text']`` via ``get_or_default``, falling back to ``v`` itself."""
    fallback = v
    return get_or_default(v, '#text', fallback)
    
def get_text_from_arr(value):
    """Concatenate the text of every entry of ``value`` without a separator.

    Each entry is first replaced by its ``'kuvatavTekst'`` value when it has
    one, then reduced with ``get_text``; entries that end up as ``None`` are
    skipped. Anything that is not a non-empty list yields ``''``.
    """
    if not isinstance(value, list) or len(value) == 0:
        return ''
    pieces = []
    for entry in value:
        piece = get_text(get_or_default(entry, 'kuvatavTekst', entry))
        if piece is not None:
            pieces.append(piece)
    return ''.join(pieces)

# Result of get_numbering: `inactive` flag, displayed number, optional superscript index.
NUMBERING = namedtuple('Numbering', ['inactive', 'num', 'idx'])
def get_numbering(dict, key, ofList):
    """Extract the numbering information of one parsed XML element.

    Args:
        dict: The parsed element (the parameter name shadows the builtin type
            and is kept because callers rely on the signature).
        key: Name of the child that holds the number, e.g. ``'loigeNr'``.
        ofList: The sibling list ``dict`` belongs to; used for the ordinal fallback.

    Returns:
        ``NUMBERING(inactive, num, idx)`` where ``inactive`` is True when the
        number's ``'@kehtiv'`` attribute equals ``'0'``, ``num`` is the number's
        text and ``idx`` is its ``'@ylaIndeks'`` attribute or ``None``.
        When the number carries no text (``get_text`` hands back a mapping),
        ``num`` falls back to the 1-based position of the element in ``ofList``.
    """
    # The `dict` parameter hides the builtin type inside this function, so the
    # original `numberComplex is dict` compared against the element rather than
    # the type. The builtin is fetched explicitly for the type check.
    import builtins

    number = get_or_default(dict, key, None)
    active = get_or_default(number, '@kehtiv', '1')
    numberComplex = get_text(number)
    numberIndex = get_or_default(number, '@ylaIndeks', None)
    # The identity comparison is kept so that an element that is itself None
    # still receives its ordinal, exactly as before.
    if isinstance(numberComplex, builtins.dict) or numberComplex is dict:
        # Prefer identity so that two equal siblings do not both map to the first position.
        position = next((i for i, item in enumerate(ofList) if item is dict), None)
        if position is None:
            position = ofList.index(dict)
        numberComplex = position + 1
    return NUMBERING(active == '0', numberComplex, numberIndex)

def print_line(depth = 0, group = '', nr = '', nridx = ' ', text = ''):
    """Print one space-separated debug line, preceded by ``depth`` tab characters."""
    indent = '\t' * depth
    fields = (indent, group, nr, nridx, text)
    print(*fields)


In [112]:
from __future__ import annotations
from dataclasses import dataclass, field

# Named tuple with the fields `text` and `reference`.
# NOTE(review): not used by any code visible in this notebook — presumably meant for get_references; confirm.
TEXTREF = namedtuple('TextReference', ['text', 'reference'])

def get_string_from_list(generator, sep=' '):
    """Join the strings produced by ``generator`` with ``sep``.

    Returns ``""`` when ``generator`` is ``None`` or yields nothing.
    """
    if generator is None:
        return ""
    items = list(generator)
    return sep.join(items) if items else ""

@dataclass
class Element:
    """Generic node of the document tree; subclasses override the formatting hooks."""

    # Displayed number of the node (None when absent).
    number: int = None
    # Superscript index that accompanies the number (None when absent).
    index: int = None
    # Title or body text of the node.
    text: str = None
    # Child nodes in insertion order; every instance gets its own list.
    children: list[Element] = field(default_factory=list)

    def add(self, child: Element):
        """Append ``child`` to this node's children."""
        self.children.append(child)

    def numbering(self):
        """Hook for the formatted number; the base class returns None."""
        return None

    def format_text(self):
        """Hook for the node's own formatted text; the base class returns ""."""
        return ""

    def get_text(self, max_length: int = 0):
        """Collect the non-empty ``get_text(max_length)`` results of all children."""
        results = (child.get_text(max_length) for child in self.children)
        return [result for result in results if result is not None and len(result) > 0]

    def get_references(self):
        """Hook for reference extraction; the base class returns None."""
        return None


class Document(Element):
    """Top-level container; the driver loop creates one per law and fills it via get_next."""


class Part(Element):
    """Element that refuses Document and Part instances as direct children."""

    def add(self, child):
        """Append ``child`` after asserting that its exact type is allowed here."""
        forbidden = (Document, Part)
        assert type(child) not in forbidden, f"Invalid child element {child}"
        super().add(child)


class Chapter(Element):
    """Element that refuses Document, Part and Chapter instances as direct children."""

    def add(self, child):
        """Append ``child`` after asserting that its exact type is allowed here."""
        forbidden = (Document, Part, Chapter)
        assert type(child) not in forbidden, f"Invalid child element {child}"
        super().add(child)


class Section(Element):
    """Element that refuses Document, Part, Chapter and Section instances as direct children."""

    def add(self, child):
        """Append ``child`` after asserting that its exact type is allowed here."""
        forbidden = (Document, Part, Chapter, Section)
        assert type(child) not in forbidden, f"Invalid child element {child}"
        super().add(child)


class Paragraph(Element):
    """Element rendered with a ``§`` number whose content is emitted as length-limited chunks."""

    def add(self, child):
        """Append ``child``; asserts it is not a Document, Part, Chapter, Section or Paragraph."""
        assert type(child) not in [Document, Part, Chapter, Section, Paragraph], f"Invalid child element {child}"
        super().add(child)

    def numbering(self):
        """Return ``§<number>.`` or ``§<number>^<index>.``; ``""`` when there is no number."""
        if self.number is not None:
            if self.index is not None:
                return f"§{self.number}^{self.index}."
            return f"§{self.number}."
        return ""

    def format_text(self):
        """Return the heading (numbering plus title, with a trailing space); ``""`` without a number."""
        if self.number is not None:
            if self.text is not None:
                return f"{self.numbering()} {self.text} "
            return f"{self.numbering()} "
        return ""

    def get_text(self, max_length: int = 0):
        """Return the paragraph as a list of text chunks.

        The heading opens the first chunk. Each child's text is appended to
        the current chunk, separated by one space, until the result would
        exceed ``max_length`` characters; the current chunk is then closed and
        the child starts a new one. A single child longer than ``max_length``
        is not split.

        Args:
            max_length: Upper bound for a joined chunk. Defaults to 0, matching
                the signature of ``Element.get_text``.

        Returns:
            List of stripped, non-empty strings. A paragraph whose children
            produce no text yields ``[]``, so the heading is not emitted alone
            in that case.
        """
        heading = self.format_text().strip()
        chunks = []
        current = heading
        # True once at least one child's text has been consumed. The previous
        # length comparison against the heading dropped a final chunk that
        # happened to be shorter than the heading.
        has_body = False
        for child in self.children:
            raw = child.get_text()
            # Children are expected to return strings; anything else adds no text.
            child_text = raw.strip() if isinstance(raw, str) else ""
            if not child_text:
                continue
            # Build the joined text first and strip afterwards. Stripping only
            # the appended piece removed the separating space again.
            joined = f"{current} {child_text}".strip()
            if current and len(joined) > max_length:
                chunks.append(current)
                current = child_text
            else:
                current = joined
            has_body = True
        if has_body and current:
            chunks.append(current)
        return chunks


class Subparagraph(Element):
    """Element rendered with a parenthesised number, followed by the text of its children."""

    def add(self, child):
        """Append ``child`` after asserting that its exact type is allowed here."""
        forbidden = (Document, Part, Chapter, Section, Paragraph, Subparagraph)
        assert type(child) not in forbidden, f"Invalid child element {child}"
        super().add(child)

    def numbering(self):
        """Return ``(<number>)`` or ``(<number>^<index>)``; ``""`` when there is no number."""
        if self.number is None:
            return ""
        if self.index is None:
            return f"({self.number})"
        return f"({self.number}^{self.index})"

    def get_child_texts(self):
        """Yield ``get_text()`` of every child, in order."""
        yield from (child.get_text() for child in self.children)

    def format_text(self):
        """Return numbering, own text and joined child texts; ``""`` when the own text is blank."""
        if self.text is None or len(self.text.strip()) == 0:
            return ""
        children_text = get_string_from_list(self.get_child_texts())
        return f"{self.numbering()} {self.text} {children_text}"

    def get_text(self, max_length: int = 0):
        """Return ``format_text()``; ``max_length`` is accepted but not used."""
        return self.format_text()

    def get_references(self):
        """Print the own text split into fragments at each ``§``.

        The text before the first ``§`` stays attached to the first fragment;
        every later fragment starts with ``§``. Nothing is returned.
        """
        if self.text is None or '§' not in self.text:
            return
        head, first, *rest = self.text.split('§')
        fragments = [head + '§' + first] + ['§' + part for part in rest]
        print('s: ', fragments)


class Point(Element):
    """Element rendered as ``<number>)`` followed by its own text."""

    def add(self, child):
        """Append ``child`` after asserting that its exact type is allowed here."""
        forbidden = (Document, Part, Chapter, Section, Paragraph, Subparagraph, Point)
        assert type(child) not in forbidden, f"Invalid child element {child}"
        super().add(child)

    def numbering(self):
        """Return ``<number>)`` or ``<number>^<index>)``; ``""`` when there is no number."""
        if self.number is None:
            return ""
        if self.index is None:
            return f"{self.number})"
        return f"{self.number}^{self.index})"

    def format_text(self):
        """Return numbering plus own text; ``""`` when the own text is blank."""
        if self.text is None or len(self.text.strip()) == 0:
            return ""
        return f"{self.numbering()} {self.text}"

    def get_text(self, max_length: int = 0):
        """Return ``format_text()``; ``max_length`` is accepted but not used."""
        return self.format_text()

    def get_references(self):
        """Print the own text split into fragments at each ``§``.

        Unfinished: it only separates the text into fragments that may each
        hold a reference. The text before the first ``§`` stays attached to the
        first fragment. Nothing is returned.
        """
        if self.text is None or '§' not in self.text:
            return
        head, first, *rest = self.text.split('§')
        fragments = [head + '§' + first] + ['§' + part for part in rest]
        print('p: ', fragments)

In [77]:
def parse_points(points, parent: Element):
    """Add a ``Point`` to ``parent`` for every active entry of ``points`` and descend into it.

    The number is read from ``'alampunktNr'`` and the text from the
    ``'tavatekst'`` entries of ``'sisuTekst'``. Entries flagged inactive by
    ``get_numbering`` are skipped.
    """
    for entry in points:
        nr = get_numbering(entry, 'alampunktNr', points)
        body = get_or_default(entry, 'sisuTekst', {})
        content = get_text_from_arr(get_list(body, 'tavatekst'))
        if not nr.inactive:
            node = Point(number=nr.num, index=nr.idx, text=content)
            parent.add(node)
            get_next(entry, node)
        
def parse_subparagraphs(subparagraphs, parent: Element):
    """Add a ``Subparagraph`` to ``parent`` for every active entry and descend into it.

    The number is read from ``'loigeNr'`` and the text from the ``'tavatekst'``
    entries of ``'sisuTekst'``. Entries flagged inactive by ``get_numbering``
    are skipped.
    """
    for entry in subparagraphs:
        nr = get_numbering(entry, 'loigeNr', subparagraphs)
        body = get_or_default(entry, 'sisuTekst', {})
        content = get_text_from_arr(get_list(body, 'tavatekst'))
        if not nr.inactive:
            node = Subparagraph(number=nr.num, index=nr.idx, text=content)
            parent.add(node)
            get_next(entry, node)

def parse_paragraphs(paragraphs, parent: Document):
    """Add a ``Paragraph`` to ``parent`` for every active entry and descend into it.

    The number is read from ``'paragrahvNr'`` and the title from
    ``'paragrahvPealkiri'``. Entries flagged inactive by ``get_numbering`` are
    skipped.
    """
    for entry in paragraphs:
        nr = get_numbering(entry, 'paragrahvNr', paragraphs)
        heading = get_text(get_or_default(entry, 'paragrahvPealkiri', None))
        if not nr.inactive:
            node = Paragraph(number=nr.num, index=nr.idx, text=heading)
            parent.add(node)
            get_next(entry, node)
    
def parse_sections(sections, parent: Document):
    """Add a ``Section`` to ``parent`` for every active entry and descend into it.

    The number is read from ``'jaguNr'`` and the title from ``'jaguPealkiri'``.
    Entries flagged inactive by ``get_numbering`` are skipped.
    """
    for entry in sections:
        nr = get_numbering(entry, 'jaguNr', sections)
        heading = get_text(get_or_default(entry, 'jaguPealkiri', None))
        if not nr.inactive:
            node = Section(number=nr.num, index=nr.idx, text=heading)
            parent.add(node)
            get_next(entry, node)
        
def parse_chapters(chapters, parent: Document):
    """Add a ``Chapter`` to ``parent`` for every active entry and descend into it.

    The number is read from ``'peatykkNr'`` and the title from
    ``'peatykkPealkiri'``. Entries flagged inactive by ``get_numbering`` are
    skipped.
    """
    for entry in chapters:
        nr = get_numbering(entry, 'peatykkNr', chapters)
        heading = get_text(get_or_default(entry, 'peatykkPealkiri', None))
        if not nr.inactive:
            node = Chapter(number=nr.num, index=nr.idx, text=heading)
            parent.add(node)
            get_next(entry, node)

def parse_parts(parts, parent: Document):
    """Add a ``Part`` to ``parent`` for every active entry and descend into it.

    The number is read from ``'osaNr'`` and the title from ``'osaPealkiri'``.
    Entries flagged inactive by ``get_numbering`` are skipped.
    """
    for entry in parts:
        nr = get_numbering(entry, 'osaNr', parts)
        heading = get_text(get_or_default(entry, 'osaPealkiri', None))
        if not nr.inactive:
            node = Part(number=nr.num, index=nr.idx, text=heading)
            parent.add(node)
            get_next(entry, node)

# XML child names that get_next looks for, in this order.
KEYS = ['osa', 'peatykk', 'jagu', 'paragrahv', 'loige', 'alampunkt']
# Parser for each entry of KEYS, matched by position; the two lists must stay the same length and order.
PARSERS = [parse_parts, parse_chapters, parse_sections, parse_paragraphs, parse_subparagraphs, parse_points]

def get_next(xml_dict, parent: Element):
    """Dispatch every structural child of ``xml_dict`` to its parser.

    For each name in ``KEYS`` that is present in ``xml_dict``, the matching
    function from ``PARSERS`` is called with the children (normalised to a
    list by ``get_list``) and ``parent``.

    Args:
        xml_dict: Parsed XML node. Nodes that are not dicts have no structural
            children and are ignored. ``None`` would raise on ``key in
            xml_dict``, and for a string that expression is a substring test.
        parent: Element that receives the parsed children.
    """
    if not isinstance(xml_dict, dict):
        return
    for key, parser in zip(KEYS, PARSERS):
        if key in xml_dict:
            parser(get_list(xml_dict, key), parent)

In [113]:

from collections.abc import Iterable
def flatten(xs):
    """Yield the leaves of an arbitrarily nested iterable, depth first.

    Strings and bytes count as leaves and are not iterated.
    Adapted from https://stackoverflow.com/a/2158532
    """
    for item in xs:
        is_nested = isinstance(item, Iterable) and not isinstance(item, (str, bytes))
        if not is_nested:
            yield item
            continue
        for leaf in flatten(item):
            yield leaf
# makes data structures from a law document
# import pandas as pd
# df2 = pd.DataFrame(columns=['law', 'paragraph', 'subparagraphs', 'points', 'text'])

# Number of laws converted while testing. The previous `if idx > 5: break`
# handled seven rows and relied on the index labels being consecutive integers.
PREVIEW_LIMIT = 5
# Maximum chunk length passed down to the get_text() methods.
MAX_CHUNK_LENGTH = 2048

for idx, row in df.head(PREVIEW_LIMIT).iterrows():
    # To test a single law, filter on row['title'], e.g. 'Eesti Vabariigi põhiseadus'.
    print('--------------------------------------')
    print(row['title'])
    doc = Document()
    get_next(xml_preprocess(row['xml'])['oigusakt']['sisu'], doc)
    print(list(flatten(doc.get_text(MAX_CHUNK_LENGTH))))


--------------------------------------
Abieluvararegistri seadus
['§1. Abieluvararegister (1) Abieluvararegister on riiklik register, kuhu kantakse abieluvaralepingus toodud ning seaduses sätestatud juhtudel varalised õigused. Abieluvararegistri eesmärk on võimaldada saada kolmandatel isikutel teavet abikaasadevahelise varasuhte ning sellest tulenevate õiguste ja kohustuste kohta. Kui abieluvararegistris puuduvad andmed abikaasade kohta, siis eeldatakse, et abikaasade varalistele suhetele kohaldatakse varaühisuse varasuhet.(1^1) Abieluvararegistrit peetakse elektrooniliselt.(2) Abieluvararegistri pidamise ja andmetöötluse korra kehtestabvaldkonna eest vastutav ministermäärusega.(3) Abieluvararegistrile ja selle pidamisele kohaldatakse avaliku teabe seaduses andmekogude kohta sätestatut käesolevas seaduses sätestatud erisustega.', '§2. Abieluvararegistri vastutav ja volitatud töötleja ning andmeandjad (1^1) Abieluvararegistri vastutav töötleja on Notarite Koda ning volitatud töötlejad o