### Download all laws as XML, parse them, and print progress to the console

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
# Index page from which the links to the individual laws are collected.
url = 'https://www.riigiteataja.ee/lyhendid.html'

r = requests.get(url)
r.raise_for_status()  # fail early instead of parsing an error page into an empty list
soup = BeautifulSoup(r.text, 'html.parser')
# href=True skips anchors that have no href attribute; link['href'] would raise KeyError on them.
links = soup.find_all('a', href=True)

law_url = 'https://www.riigiteataja.ee/akt/'
law_links = [link for link in links if law_url in link['href']]
# Consecutive anchors are paired: the first supplies the href and its text, the second only its text.
zipped_law_links = [[x['href'], x.text, y.text] for x, y in zip(law_links[::2], law_links[1::2])]

total = len(zipped_law_links)
for position, link in enumerate(zipped_law_links):
    url = link[0] + '.xml'
    response = requests.get(url)
    response.encoding = 'utf-8'
    if not response.ok:
        # Still stored below (as before), but no longer silently.
        print('WARNING: HTTP', response.status_code, 'for', url)
    # Each row grows a fourth item: the downloaded XML text.
    link.append(response.text)
    print(position, '/', total, link[1], 'from:', url)

In [None]:
import pandas as pd

# One row per law; the four columns are positional (href, link text, second link text, XML text).
df = pd.DataFrame(data=zipped_law_links)
df.to_csv(path_or_buf='law_links.csv')

### load downloaded laws from disk

In [7]:
import pandas as pd

# The header row written by to_csv is replaced by explicit column names; column 0 is the index.
df = pd.read_csv(
    'law_links.csv',
    header=0,
    index_col=0,
    names=['link', 'title', 'short-title', 'xml'],
)
df

Unnamed: 0,link,title,short-title,xml
0,https://www.riigiteataja.ee/akt/113032019027,Abieluvararegistri seadus,AVRS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
1,https://www.riigiteataja.ee/akt/127052022029,Abipolitseiniku seadus,APolS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
2,https://www.riigiteataja.ee/akt/105052022005,Advokatuuriseadus,AdvS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
3,https://www.riigiteataja.ee/akt/116122022023,"Alkoholi-, tubaka-, kütuse- ja elektriaktsiisi...",ATKEAS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
4,https://www.riigiteataja.ee/akt/104012021006,Alkoholiseadus,AS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
...,...,...,...,...
366,https://www.riigiteataja.ee/akt/123122022024,Äriregistri seadus,ÄRS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
367,https://www.riigiteataja.ee/akt/123122022033,Äriseadustik,ÄS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
368,https://www.riigiteataja.ee/akt/122032022010,Ühistranspordiseadus,ÜTS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."
369,https://www.riigiteataja.ee/akt/130122021020,Ühisveevärgi ja -kanalisatsiooni seadus,ÜVVKS,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<oigus..."


### parse xml

In [40]:
import xmltodict
import re

def xml_preprocess(xml):
    """Flatten inline markup in a law's XML text and parse the result with xmltodict.

    Rewrites applied before parsing:
      * <viide> elements are renamed to <tavatekst>, so link text keeps its
        position among the surrounding text elements;
      * <i> tags are removed;
      * an opening <sup> becomes "_" and the closing tag is removed;
      * <reavahetus> tags are removed;
      * non-breaking spaces become ordinary spaces;
      * id="..." attributes are removed.

    Args:
        xml: the XML document as a string.

    Returns:
        The value returned by ``xmltodict.parse`` for the rewritten text.
    """
    def tag(name):
        # The lookahead requires the tag name to end right here (whitespace, "/"
        # or ">"), so stripping <i> or <sup> cannot also hit longer tag names
        # that merely start with the same letters.
        return fr'<{name}(?=[\s/>])[^>]*>'

    # replace <viide> with <tavatekst> to preserve order as it's used for links in text
    xml = re.sub(tag('viide'), '<tavatekst>', xml)
    xml = re.sub(tag('/viide'), '</tavatekst>', xml)

    # remove <i> and </i> tags
    xml = re.sub(tag('i'), '', xml)
    xml = re.sub(tag('/i'), '', xml)

    # replace <sup> with _ and remove </sup> tags
    xml = re.sub(tag('sup'), '_', xml)
    xml = re.sub(tag('/sup'), '', xml)

    # remove <reavahetus/>
    xml = re.sub(tag('reavahetus'), '', xml)

    # replace non breaking space
    xml = xml.replace(u'\xa0', u' ')

    # remove id="..." attributes; the lookbehind keeps longer attribute names
    # that merely end in "id" intact instead of leaving a broken fragment behind
    xml = re.sub(r'(?<![\w:.-])id="[^"]*"', '', xml)

    return xmltodict.parse(xml)

In [41]:
# xml parsing helper functions

from collections import namedtuple

def get_list(d, key):
    """Return the value stored under `key` as a list.

    A missing key gives an empty list, a list value is returned unchanged, and
    any other value is wrapped in a one-element list.
    """
    if key in d:
        found = d[key]
        return found if isinstance(found, list) else [found]
    return []

def get_or_default(d, key, default):
    """Return ``d[key]`` when `d` is a dict holding a non-None value for `key`.

    In every other case (`d` is not a dict, the key is absent, or the stored
    value is None) `default` is returned. Dict subclasses such as
    ``collections.OrderedDict`` count as dicts.
    """
    if isinstance(d, dict):
        value = d.get(key)
        if value is not None:
            return value
    return default

def get_text(v):
    """Return the '#text' entry of `v` when get_or_default finds one, otherwise `v` itself."""
    text = get_or_default(v, '#text', v)
    return text
    
def get_text_from_arr(value):
    """Concatenate the text of every entry of `value` into one string.

    For each entry the 'kuvatavTekst' value is used when get_or_default finds
    one, and get_text then reduces the result to its '#text' where present.
    Entries that still are not strings afterwards (None, or a dict without
    '#text') are skipped instead of making ``str.join`` raise TypeError.

    Returns:
        The joined text, or '' when `value` is not a list or has no text.
    """
    if not isinstance(value, list):
        return ''
    parts = (get_text(get_or_default(v, 'kuvatavTekst', v)) for v in value)
    return ''.join(part for part in parts if isinstance(part, str))

# inactive: True when the number carries @kehtiv == '0'; num: the number itself; idx: its @ylaIndeks value.
NUMBERING = namedtuple('Numbering', ['inactive', 'num', 'idx'])
def get_numbering(dict, key, ofList):
    """Read the numbering stored under `key` of one parsed XML node.

    Args:
        dict: the parsed node (the parameter name shadows the builtin and is
            kept so existing callers keep working).
        key: name of the entry holding the number, e.g. 'paragrahvNr'.
        ofList: the sibling list that contains `dict`; used for the positional
            fallback.

    Returns:
        NUMBERING(inactive, num, idx). `num` is the number's text; when the
        number is a mapping without text, `num` falls back to the 1-based
        position of `dict` in `ofList`.
    """
    # Local import: the builtin name `dict` is unavailable inside this function.
    from collections.abc import Mapping

    number = get_or_default(dict, key, None)
    active = get_or_default(number, '@kehtiv', '1')
    numberComplex = get_text(number)
    numberIndex = get_or_default(number, '@ylaIndeks', None)
    # The original test `numberComplex is dict` compared against the parameter
    # and therefore never matched; a type check is what triggers the fallback.
    if isinstance(numberComplex, Mapping):
        numberComplex = ofList.index(dict) + 1
    return NUMBERING(active == '0', numberComplex, numberIndex)

def print_line(depth = 0, group = '', nr = '', nridx = ' ', text = ''):
    """Print one outline line: `depth` tab characters followed by the other fields, space-separated."""
    indent = '\t' * depth
    fields = (indent, group, nr, nridx, text)
    print(*fields)


In [127]:
# use chatgpt3.5 to parse in-text refs

import openai
from dotenv import dotenv_values
import tiktoken

# Chat model passed to openai.ChatCompletion.create and used as the default model for token counting.
GPT_MODEL = "gpt-3.5-turbo"
# Embedding model passed to openai.Embedding.create when a search query is embedded.
EMBEDDING_MODEL = "text-embedding-ada-002"

def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens of `text` under the encoding tiktoken selects for `model`."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Read the .env file once; both OpenAI settings come from it.
_openai_env = dotenv_values(".env")
# NOTE(review): the organization id below is kept only as a fallback so existing
# setups keep working; prefer setting OPENAI_ORGANIZATION in .env.
openai.organization = _openai_env.get("OPENAI_ORGANIZATION") or "org-3O7bHGD9SwjHVDuUCNCGACC3"
if not _openai_env.get("OPENAI_API_KEY"):
    # Same exception type as the plain lookup, but with an actionable message.
    raise KeyError("OPENAI_API_KEY is missing from .env")
openai.api_key = _openai_env["OPENAI_API_KEY"]

def gpt35_get_refs(user):
    """Ask the chat model to list the law references contained in `user`.

    The system prompt requests one reference per line in the form
    ``seaduse_nimi; paragrahv; lõige; punkt`` with ``self`` standing for the
    law that contains the text.

    Args:
        user: the text to scan, sent as the user message.

    Returns:
        The unmodified response of ``openai.ChatCompletion.create``; the answer
        text is found under ``["choices"][0]["message"]["content"]``.
    """
    system = "convert law references from the input text to the following format: seaduse_nimi; paragrahv; lõige; punkt. \r\n only one per line. use strings for all values. use self for seaduse_nimi if the law references itself. paragraphs are marked with §. \r\n only respond with csv. e.g. notariseadus;41;;\r\n self;37;2;1\r\n"
    response = openai.ChatCompletion.create(
        model=GPT_MODEL,
        messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
        temperature=0  # same input should yield the same extraction
    )
    return response

In [43]:
# test = "Käesoleva seaduse § 24 lõigetes 6_8 ja 6_11 nimetatud intensiivse gaasitarbimisega ettevõtja esitab hiljemalt 30. päeval sama paragrahvi lõike 6_8 punktides 1–3 ja lõikes 6_11 nimetatud sündmusest arvates aktsiisideklaratsiooni ning maksab aktsiisi."
# gpt35_get_refs(test)

In [146]:
# data structures to hold parsed xml data

from __future__ import annotations
from dataclasses import dataclass, field
from collections import namedtuple

# Pairs a chunk of text with the element that produced it (Paragraph.get_text builds these).
TEXTELEM = namedtuple('TextElem', ['text', 'elem'])

def get_string_from_list(generator, sep=' '):
    """Join the strings produced by `generator` with `sep`; None or an empty iterable gives ""."""
    if generator is None:
        return ""
    items = [*generator]
    return sep.join(items) if items else ""

@dataclass
class Element:
    """Base node of the parsed law tree.

    Subclasses override numbering(), format_text() and get_text() to render
    themselves; this base class only aggregates its children.
    """
    # Numbering value produced by get_numbering (text from the XML, or a position).
    number: int | str | None = None
    # Value of the number's @ylaIndeks attribute, if any.
    index: str | None = None
    # Title or body text of the element.
    text: str | None = None
    # Child elements in document order.
    children: list[Element] = field(default_factory=list)
    # Response stored by fetch_references(); None until then.
    refs: object = None

    def add(self, child: Element):
        """Append `child` to this element's children."""
        self.children.append(child)

    def numbering(self):
        """Return this element's numbering label; the base class has none and returns None."""
        pass

    def format_text(self):
        """Return this element's own text; the base class contributes none."""
        return ""

    def get_text(self, max_length: int = 0, current_tokens = 0, model: str = GPT_MODEL):
        """Return the concatenated get_text() results of all children, passing the arguments through."""
        texts = []
        for child in self.children:
            texts.extend(child.get_text(max_length, current_tokens, model))
        return texts

    def fetch_references(self):
        """Store gpt35_get_refs(self.text) in `refs` when the text contains '§', then recurse into children.

        A failing request is reported and skipped so the rest of the tree is
        still processed; KeyboardInterrupt is no longer swallowed.
        """
        if self.text is not None and '§' in self.text:
            try:
                self.refs = gpt35_get_refs(self.text)
            except Exception as exc:
                print(f'fetch_references failed for {type(self).__name__} {self.number}: {exc!r}')
        for child in self.children:
            child.fetch_references()

    def get_references(self):
        """Return `refs` of this element followed by those of all descendants (None entries included)."""
        refs = [self.refs]
        for child in self.children:
            refs.extend(child.get_references())
        return refs



class Document(Element):
    """Root element of one law; created with the law title as its text."""


class Part(Element):
    """Element built by parse_parts from an 'osa' entry."""


class Chapter(Element):
    """Element built by parse_chapters from a 'peatykk' entry."""


class Section(Element):
    """Element built by parse_sections from a 'jagu' entry."""


class Paragraph(Element):
    """Paragraph (§) element; renders its content as TEXTELEM chunks limited by a token budget."""

    def numbering(self):
        """Return the "§N." or "§N_I." label, or "" when the paragraph has no number."""
        if self.number is None:
            return ""
        if self.index is None:
            return f"§{self.number}."
        return f"§{self.number}_{self.index}."

    def format_text(self):
        """Return the label plus title used as chunk header; "" for unnumbered or childless paragraphs."""
        if self.number is None or len(self.children) == 0:
            return ""
        if self.text is None:
            return f"{self.numbering()} "
        return f"{self.numbering()} {self.text} "

    def get_text(self, max_length: int = 0, current_tokens = 0, model: str = GPT_MODEL):
        """Merge the header and the children's texts into chunks of at most `max_length` tokens.

        A new chunk is started whenever appending the next piece would push the
        running token count past `max_length`. Returns a list of
        TEXTELEM(text, self).
        """
        chunks = []
        buffer = self.format_text()
        used = current_tokens + num_tokens(buffer, model)
        for child in self.children:
            # NOTE(review): the child receives `max_length - used` as its
            # current_tokens argument, which looks like a remaining budget
            # rather than a used count; kept as is, confirm the intent.
            for piece in child.get_text(max_length, max_length - used, model):
                if piece is None or not piece.strip():
                    continue
                piece_tokens = num_tokens(piece, model)
                if used + piece_tokens <= max_length:
                    buffer += piece
                    used += piece_tokens
                    continue
                # Budget exceeded: close the current chunk and start over with this piece.
                if buffer.strip():
                    chunks.append(TEXTELEM(buffer, self))
                buffer, used = piece, piece_tokens
        if buffer and buffer.strip():
            chunks.append(TEXTELEM(buffer, self))
        return chunks


class Subparagraph(Element):
    """Subparagraph element; renders itself and its children as plain text chunks."""

    def numbering(self):
        """Return the "(N)" or "(N_I)" label, or "" when there is no number."""
        if self.number is None:
            return ""
        if self.index is None:
            return f"({self.number})"
        return f"({self.number}_{self.index})"

    def get_child_texts(self):
        """Lazily yield child.get_text() (called with its default arguments) for each child."""
        yield from (child.get_text() for child in self.children)

    def format_text(self):
        """Return the label followed by the text, or "" when the text is missing or blank."""
        if self.text is None or not self.text.strip():
            return ""
        return f"{self.numbering()} {self.text}"

    def get_text(self, max_length: int = 0, current_tokens = 0, model: str = GPT_MODEL):
        """Merge own text and the children's texts into strings of at most `max_length` tokens.

        A new string is started whenever appending the next piece would push
        the running token count past `max_length`. Returns a list of str.
        """
        chunks = []
        buffer = self.format_text()
        used = current_tokens + num_tokens(buffer, model)
        for child in self.children:
            # NOTE(review): `max_length - used` is handed down as the child's
            # current_tokens; kept as is, confirm the intent.
            for piece in child.get_text(max_length, max_length - used, model):
                if piece is None or not piece.strip():
                    continue
                piece_tokens = num_tokens(piece, model)
                if used + piece_tokens <= max_length:
                    buffer += piece
                    used += piece_tokens
                    continue
                # Budget exceeded: close the current chunk and start over with this piece.
                if buffer.strip():
                    chunks.append(buffer)
                buffer, used = piece, piece_tokens
        if buffer and buffer.strip():
            chunks.append(buffer)
        return chunks


class Point(Element):
    """Point element; renders itself and its children as plain text chunks."""

    def numbering(self):
        """Return the "N)" or "N_I)" label, or "" when there is no number."""
        if self.number is None:
            return ""
        if self.index is None:
            return f"{self.number})"
        return f"{self.number}_{self.index})"

    def format_text(self):
        """Return the label followed by the text, or "" when the text is missing or blank."""
        if self.text is None or not self.text.strip():
            return ""
        return f"{self.numbering()} {self.text}"

    def get_text(self, max_length: int = 0, current_tokens = 0, model: str = GPT_MODEL):
        """Merge own text and the children's texts into strings of at most `max_length` tokens.

        A new string is started whenever appending the next piece would push
        the running token count past `max_length`. Returns a list of str.
        """
        chunks = []
        buffer = self.format_text()
        used = current_tokens + num_tokens(buffer, model)
        for child in self.children:
            # NOTE(review): `max_length - used` is handed down as the child's
            # current_tokens; kept as is, confirm the intent.
            for piece in child.get_text(max_length, max_length - used, model):
                if piece is None or not piece.strip():
                    continue
                piece_tokens = num_tokens(piece, model)
                if used + piece_tokens <= max_length:
                    buffer += piece
                    used += piece_tokens
                    continue
                # Budget exceeded: close the current chunk and start over with this piece.
                if buffer.strip():
                    chunks.append(buffer)
                buffer, used = piece, piece_tokens
        if buffer and buffer.strip():
            chunks.append(buffer)
        return chunks

In [118]:
# xml parsing functions

def parse_points(points, parent: Element):
    """Add a Point to `parent` for every active entry of `points` and parse its nested content.

    Args:
        points: list of parsed 'alampunkt' nodes.
        parent: element that receives the new Point children.
    """
    for point in points:
        numbering = get_numbering(point, 'alampunktNr', points)
        # Skip inactive entries before spending any work on their text.
        if numbering.inactive:
            continue

        texts = get_or_default(point, 'sisuTekst', {})
        text = get_text_from_arr(get_list(texts, 'tavatekst'))
        element = Point(number=numbering.num, index=numbering.idx, text=text)
        parent.add(element)
        get_next(point, element)
        
def parse_subparagraphs(subparagraphs, parent: Element):
    """Add a Subparagraph to `parent` for every active entry and parse its nested content.

    Args:
        subparagraphs: list of parsed 'loige' nodes.
        parent: element that receives the new Subparagraph children.
    """
    for subparagraph in subparagraphs:
        numbering = get_numbering(subparagraph, 'loigeNr', subparagraphs)
        # Skip inactive entries before spending any work on their text.
        if numbering.inactive:
            continue

        texts = get_or_default(subparagraph, 'sisuTekst', {})
        text = get_text_from_arr(get_list(texts, 'tavatekst'))
        element = Subparagraph(number=numbering.num, index=numbering.idx, text=text)
        parent.add(element)
        get_next(subparagraph, element)

def parse_paragraphs(paragraphs, parent: Element):
    """Add a Paragraph to `parent` for every active entry and parse its nested content.

    Args:
        paragraphs: list of parsed 'paragrahv' nodes.
        parent: element that receives the new Paragraph children (get_next
            passes whichever element is currently being filled).
    """
    for paragraph in paragraphs:
        numbering = get_numbering(paragraph, 'paragrahvNr', paragraphs)
        # Skip inactive entries before reading their title.
        if numbering.inactive:
            continue

        title = get_text(get_or_default(paragraph, 'paragrahvPealkiri', None))
        element = Paragraph(number=numbering.num, index=numbering.idx, text=title)
        parent.add(element)
        get_next(paragraph, element)
    
def parse_sections(sections, parent: Element):
    """Add a Section to `parent` for every active entry and parse its nested content.

    Args:
        sections: list of parsed 'jagu' nodes.
        parent: element that receives the new Section children (get_next
            passes whichever element is currently being filled).
    """
    for section in sections:
        numbering = get_numbering(section, 'jaguNr', sections)
        # Skip inactive entries before reading their title.
        if numbering.inactive:
            continue

        title = get_text(get_or_default(section, 'jaguPealkiri', None))
        element = Section(number=numbering.num, index=numbering.idx, text=title)
        parent.add(element)
        get_next(section, element)
        
def parse_chapters(chapters, parent: Element):
    """Add a Chapter to `parent` for every active entry and parse its nested content.

    Args:
        chapters: list of parsed 'peatykk' nodes.
        parent: element that receives the new Chapter children (get_next
            passes whichever element is currently being filled).
    """
    for chapter in chapters:
        numbering = get_numbering(chapter, 'peatykkNr', chapters)
        # Skip inactive entries before reading their title.
        if numbering.inactive:
            continue

        title = get_text(get_or_default(chapter, 'peatykkPealkiri', None))
        element = Chapter(number=numbering.num, index=numbering.idx, text=title)
        parent.add(element)
        get_next(chapter, element)

def parse_parts(parts, parent: Element):
    """Add a Part to `parent` for every active entry and parse its nested content.

    Args:
        parts: list of parsed 'osa' nodes.
        parent: element that receives the new Part children (get_next passes
            whichever element is currently being filled).
    """
    for part in parts:
        numbering = get_numbering(part, 'osaNr', parts)
        # Skip inactive entries before reading their title.
        if numbering.inactive:
            continue

        title = get_text(get_or_default(part, 'osaPealkiri', None))
        element = Part(number=numbering.num, index=numbering.idx, text=title)
        parent.add(element)
        get_next(part, element)

# Node keys that get_next looks for, and the parser for each; the two lists are matched by position.
KEYS = ['osa', 'peatykk', 'jagu', 'paragrahv', 'loige', 'alampunkt']
PARSERS = [parse_parts, parse_chapters, parse_sections, parse_paragraphs, parse_subparagraphs, parse_points]

def get_next(xml_dict, parent: Element):
    """Dispatch every structural entry found in `xml_dict` to its parser.

    For each key of KEYS present in `xml_dict`, the parser at the same position
    in PARSERS is called with the entry (as a list) and `parent`.

    Args:
        xml_dict: a parsed XML node. Nodes that are not dicts (None for an empty
            element, a plain string for a text-only element) have no structural
            children and are ignored; without this guard `key in xml_dict`
            would raise on None and do a substring test on strings.
        parent: element that receives the parsed children.
    """
    if not isinstance(xml_dict, dict):
        return
    for key, parser in zip(KEYS, PARSERS):
        if key in xml_dict:
            parser(get_list(xml_dict, key), parent)

### create law data classes

In [152]:
from collections import deque
from collections.abc import Iterable

def flatten(xs, max_tokens):
    """Yield the text of every leaf in an arbitrarily nested iterable of chunks.

    Leaves are TEXTELEM tuples (their first field is yielded) or plain strings
    (yielded as they are); every other iterable is descended into.

    Args:
        xs: nested iterable of TEXTELEM items and/or strings.
        max_tokens: upper bound asserted for the token count of each text.

    Raises:
        AssertionError: if a text has more than `max_tokens` tokens.
    """
    # adapted from https://stackoverflow.com/a/2158532
    for x in xs:
        # Strings are iterable too; treating them as leaves avoids endless recursion.
        if isinstance(x, Iterable) and not isinstance(x, (TEXTELEM, str)):
            # max_tokens has to be passed down; the call without it raised TypeError.
            yield from flatten(x, max_tokens)
        else:
            text = x if isinstance(x, str) else x[0]
            assert num_tokens(text) <= max_tokens
            yield text  # only the text, not the element itself

# Token budget per text chunk; set once here and reused by later cells.
max_tokens = 1001

# Parsed law trees keyed by law title.
documents = {}
for _, row in df.iterrows():
    doc = Document(text=row['title'])
    documents[row['title']] = doc
    get_next(xml_preprocess(row['xml'])['oigusakt']['sisu'], doc)
    # The chunking call is kept from the original, where its result was never used.
    doc.get_text(max_tokens)
len(documents)
# there's at least one law that isn't properly parsed and some have bad xml formatting, but it should get every paragraph otherwise

371

In [206]:
import pickle
# Serialize the parsed `documents` mapping to docs.pickle.
with open('docs.pickle', 'wb') as file:
    pickle.dump(documents, file)

In [179]:
# One output row per text chunk: law title, element number and index, chunk text, collected references.
data = []
for law, doc in documents.items():
    doc_text: list[TEXTELEM] = doc.get_text(max_tokens)
    for te in doc_text:
        # Answer text of every stored response below this element, one per line.
        ref_lines = [
            f'{ref["choices"][0]["message"]["content"]}\n'
            for ref in te.elem.get_references()
            if ref and len(ref) > 0
        ]
        refs = ''.join(ref_lines) if ref_lines else None
        data.append([law, te.elem.number, te.elem.index, te.text, refs])
        if te.elem.refs is not None:
            print(te.elem.refs)
final_df = pd.DataFrame(data, columns=['law', 'para', 'subpara', 'text', 'refs'])
final_df

{
  "id": "chatcmpl-7R6IqCdieyTSEdqsb4hSoDZMBnfyt",
  "object": "chat.completion",
  "created": 1686693440,
  "model": "gpt-3.5-turbo-0301",
  "usage": {
    "prompt_tokens": 119,
    "completion_tokens": 6,
    "total_tokens": 125
  },
  "choices": [
    {
      "message": {
        "role": "assistant",
        "content": "self;43;8;"
      },
      "finish_reason": "stop",
      "index": 0
    }
  ]
}


Unnamed: 0,law,para,subpara,text,refs
0,Abieluvararegistri seadus,1,,§1. Abieluvararegister (1) Abieluvararegister ...,
1,Abieluvararegistri seadus,2,,§2. Abieluvararegistri vastutav ja volitatud t...,
2,Abieluvararegistri seadus,2,,(1_4) Registrite ja Infosüsteemide Keskus tegu...,
3,Abieluvararegistri seadus,6,,§6. Abieluvararegistri andmetega tutvumine (1)...,
4,Abieluvararegistri seadus,7,,§7. Kande õiguslik tähendus (4) Kandega ei ole...,
...,...,...,...,...,...
29798,Üürivaidluse lahendamise seadus,22,,§22. Otsuse jõustumine (1) Komisjoni otsus jõu...,
29799,Üürivaidluse lahendamise seadus,23,,§23. Otsuse jõustumise tagajärjed (1) Komisjon...,
29800,Üürivaidluse lahendamise seadus,24,,§24. Otsuse täitmine Komisjoni jõustunud otsu...,
29801,Üürivaidluse lahendamise seadus,26,,§26. Elamuvaidluskomisjoni lõpetamine ja ümber...,


### ChatGPT in-text reference parsing test for VVS (Vabariigi Valitsuse seadus)

In [None]:
# Display the parsed Document of one law; the commented line looks a law up by position instead of by title.
# documents[list(documents.keys())[150]]
documents['Vabariigi Valitsuse seadus']

In [177]:
# Fill `refs` across this law's tree: Element.fetch_references calls gpt35_get_refs
# for every element whose text contains '§', so this cell issues API requests.
# documents[list(documents.keys())[150]].fetch_references()
documents['Vabariigi Valitsuse seadus'].fetch_references()

In [None]:
# Display the same Document again; this cell is placed after the fetch_references cell.
# documents[list(documents.keys())[150]]
documents['Vabariigi Valitsuse seadus']

### Get embeddings for VVS (Vabariigi Valitsuse seadus)

In [None]:
# final_df[~final_df['refs'].isna()]
# def get_embedding(text: str, model="text-embedding-ada-002") -> list[float]:
#     return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]

# embeddings = []
# for idx, row in final_df[final_df['law'] == 'Vabariigi Valitsuse seadus'].iterrows():
#     # print(row['text'])
#     embeddings.append(get_embedding(row['text']))
# print(embeddings[0])

In [202]:
# final_df[final_df['law'] == 'Vabariigi Valitsuse seadus']['embedding'] = embeddings
# final_df['embedding'] = None
# i = 0
# for idx, row in final_df[final_df['law'] == 'Vabariigi Valitsuse seadus'].iterrows():
#     final_df.at[idx, 'embedding']= embeddings[i]
#     i += 1

# final_df[final_df['law'] == 'Vabariigi Valitsuse seadus']

Unnamed: 0,law,para,subpara,text,refs,embedding
25361,Vabariigi Valitsuse seadus,1,,§1. Vabariigi Valitsuse pädevus (1) Vabariigi ...,,"[0.015912197530269623, -0.04427521303296089, -..."
25362,Vabariigi Valitsuse seadus,2,,§2. Vabariigi Valitsuse asukoht Vabariigi Val...,,"[-0.003098118118941784, -0.029830319806933403,..."
25363,Vabariigi Valitsuse seadus,3,,§3. Vabariigi Valitsuse liikmed (1) Vabariigi ...,,"[0.0045783682726323605, -0.04495125263929367, ..."
25364,Vabariigi Valitsuse seadus,3,1,§3_1. Vabariigi Valitsuse liikme juurdepääs ri...,,"[0.0057618338614702225, -0.03222525119781494, ..."
25365,Vabariigi Valitsuse seadus,4,,§4. Vabariigi Valitsuse liikmete tööülesannete...,,"[-0.008184646256268024, -0.03359680250287056, ..."
...,...,...,...,...,...,...
25499,Vabariigi Valitsuse seadus,107,2,§107_2. Käesoleva seaduse § 43 lõike 8 rakenda...,self;43;8;\nself;43;8;\n,"[-0.026014698669314384, -0.016599023714661598,..."
25500,Vabariigi Valitsuse seadus,107,3,§107_3. Ministrite ametinimetuste asendamine (...,,"[0.0006198450573720038, -0.020709000527858734,..."
25501,Vabariigi Valitsuse seadus,107,4,§107_4. Põllumajandusministeeriumi nime asenda...,,"[-0.00911065936088562, -0.03169141337275505, -..."
25502,Vabariigi Valitsuse seadus,107,5,§107_5. Tähtajatu töölepingu sõlminud valitsus...,self;43;1_1; \nself;43;1_1;\n,"[-0.01896170899271965, -0.006719377823174, -0...."


In [254]:
# Write only the rows of 'Vabariigi Valitsuse seadus' to disk.
vvs_mask = final_df['law'] == 'Vabariigi Valitsuse seadus'
final_df[vvs_mask].to_csv('embeds 1001 vvs.csv')

In [255]:
import numpy as np
from ast import literal_eval
final_df = pd.read_csv('embeds 1001 vvs.csv', index_col=0).replace('nan', np.nan)

# Embeddings come back from the CSV as text; literal_eval turns each non-missing one into a Python object.
has_embedding = final_df['embedding'].notna()
final_df.loc[has_embedding, 'embedding'] = final_df.loc[has_embedding, 'embedding'].apply(literal_eval)
# final_df[final_df['law'] == 'Vabariigi Valitsuse seadus']
final_df

Unnamed: 0,law,para,subpara,text,refs,embedding
25361,Vabariigi Valitsuse seadus,1,,§1. Vabariigi Valitsuse pädevus (1) Vabariigi ...,,"[0.015912197530269623, -0.04427521303296089, -..."
25362,Vabariigi Valitsuse seadus,2,,§2. Vabariigi Valitsuse asukoht Vabariigi Val...,,"[-0.003098118118941784, -0.029830319806933403,..."
25363,Vabariigi Valitsuse seadus,3,,§3. Vabariigi Valitsuse liikmed (1) Vabariigi ...,,"[0.0045783682726323605, -0.04495125263929367, ..."
25364,Vabariigi Valitsuse seadus,3,1.0,§3_1. Vabariigi Valitsuse liikme juurdepääs ri...,,"[0.0057618338614702225, -0.03222525119781494, ..."
25365,Vabariigi Valitsuse seadus,4,,§4. Vabariigi Valitsuse liikmete tööülesannete...,,"[-0.008184646256268024, -0.03359680250287056, ..."
...,...,...,...,...,...,...
25499,Vabariigi Valitsuse seadus,107,2.0,§107_2. Käesoleva seaduse § 43 lõike 8 rakenda...,self;43;8;\nself;43;8;\n,"[-0.026014698669314384, -0.016599023714661598,..."
25500,Vabariigi Valitsuse seadus,107,3.0,§107_3. Ministrite ametinimetuste asendamine (...,,"[0.0006198450573720038, -0.020709000527858734,..."
25501,Vabariigi Valitsuse seadus,107,4.0,§107_4. Põllumajandusministeeriumi nime asenda...,,"[-0.00911065936088562, -0.03169141337275505, -..."
25502,Vabariigi Valitsuse seadus,107,5.0,§107_5. Tähtajatu töölepingu sõlminud valitsus...,self;43;1_1; \nself;43;1_1;\n,"[-0.01896170899271965, -0.006719377823174, -0...."


In [273]:
from scipy import spatial  # for calculating vector similarities for search
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[tuple, tuple, tuple]:
    """Rank the rows of `df` by how related their embedding is to `query`.

    Args:
        query: text to embed with EMBEDDING_MODEL and compare against.
        df: frame with 'text', 'embedding' and 'refs' columns.
        relatedness_fn: callable(query_embedding, row_embedding) -> score,
            higher meaning more related.
        top_n: maximum number of rows to return.

    Returns:
        Three tuples of equal length (texts, relatedness scores, refs), ordered
        from most to least related. All three are empty when `df` has no rows
        or `top_n` is 0.
    """
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    ranked = sorted(
        (
            (text, relatedness_fn(query_embedding, embedding), refs)
            for text, embedding, refs in zip(df["text"], df["embedding"], df["refs"])
        ),
        key=lambda item: item[1],
        reverse=True,
    )[:top_n]
    if not ranked:
        # zip(*[]) yields nothing, so the three-way unpacking below would fail.
        return (), (), ()
    strings, relatednesses, refs = zip(*ranked)
    return strings, relatednesses, refs
# examples
df_test = final_df[final_df['law'] == 'Vabariigi Valitsuse seadus']
question = "Millisel juhul pole ministri mitme ministeeriumi juhtimine muudatus valitsuse koosseisus?"
print(question)
print('---')
strings, relatednesses, refs = strings_ranked_by_relatedness(question, df_test, top_n=15)
# `relatedness` keeps its name because the f-string's `=` specifier prints it as part of the output.
for i, (string, relatedness, ref_text) in enumerate(zip(strings, relatednesses, refs), start=1):
    print(f"{i}: {relatedness=:.3f}")
    print(f"text: {string}")
    # Rows without references hold a non-string missing value and print no refs line.
    if isinstance(ref_text, str):
        print(f'refs: {ref_text}'.replace('\n', ' '))

Millisel juhul pole ministri mitme ministeeriumi juhtimine muudatus valitsuse koosseisus?
---
1: relatedness=0.876
text: §7. Muudatused Vabariigi Valitsuse koosseisus (1) Muudatused ametisse astunud Vabariigi Valitsuse koosseisus on:1) ministri ametist vabastamine;2) uue ministri ametisse nimetamine;3) ministri nimetamine juhtima mitut ministeeriumi, välja arvatud käesoleva seaduse §-s 15 nimetatud juhul;4) mitut ministeeriumi juhtima nimetatud ministri vabastamine ühe ministeeriumi juhtimisest, välja arvatud käesoleva seaduse §-s 15 nimetatud juhul.(2) Minister, kes on Vabariigi Valitsuse antud koosseisus ametisse astudes andnud ametivande, ei tee seda uuesti tema nimetamisel juhtima teist ministeeriumi.(3) Muudatuse ametisse nimetatud Vabariigi Valitsuse koosseisus teeb Vabariigi President peaministri ettepaneku saamisest kolme päeva jooksul. Kui Riigikogu on avaldanud ministrile umbusaldust, vabastab Vabariigi President ministri ametist viivitamata pärast Riigikogu esimehelt asjakoh