In [1]:
import regex as re
from functional import seq,pseq
from functional.pipeline import Sequence
from fn import _
from collections import namedtuple
from regex.regex import Match, Pattern
from typing import List, Dict, Tuple, Optional
from termcolor import colored
import os


In [2]:
act_dir = os.environ["HOME"]+"/tmp/nlp/ustawy"
acts = pseq(os.listdir(act_dir)).map(lambda fn : open("{}/{}".format(act_dir,fn)).read())

In [5]:
def match_with_index(pattern ,text:str)-> List[Tuple[int,int,str]]:
    return [(m.start(), m.end(),m.captures()[0]) for m in re.finditer(pattern, text)]


def print_highlighted(
    text:str,
    matches_groups:List[List[Tuple[int,int]]],
    colors:List[Tuple[str,Optional[str]]]
):
    if len(matches_groups) != len(colors):
        raise "There should be the same number of matches groups and colors"
    
    MatchedEntry=namedtuple('MatchedEntry','beg end color on_color')
    
    # Seq[MatchedEntry]
    mg = seq(matches_groups)\
        .zip(colors)\
        .flat_map(lambda gc: seq(gc[0])\
            .map(lambda x : MatchedEntry(beg=x[0],end=x[1],color=gc[1][0], on_color = gc[1][1]))
                 )\
        .order_by(_.beg)
        
    # Will duplicate some matches but that's not the point
    beg = 0
    for matched in mg:
        print(text[beg:matched.beg],end="") 
        print(colored(text[matched.beg:matched.end],color= matched.color, on_color= matched.on_color),end="")
        beg = matched.end
    print(text[beg:])

In [25]:
dzu_pat = re.compile("Dz\.?\s?U.?")
year_pat = re.compile("(?P<year>((19\d)|(200)|(201))\d{1})")
nr_pat = re.compile("Nr\s+(?P<nr>\d+)")
pos_pat = re.compile("poz\.\s+(?P<poz>\d+((-|,)\d+)*)")

#Entry extractors
footer_ext_pat = re.compile("\[\d+\]\)?\s+Zmiany(\w|\s|\n)*?w\s+Dz\.*\s*U\.?(?P<foot_entry>(.|\s|\n)*?)(?=(\[|\Z))")
art_ext_pat=re.compile("(U|u)staw\w*\s+(z\s+dnia\s+)+\d+\s+\w+\s+(?P<year>\d{4})(\s|\n)*r\.?\s*(–|\-)*\s*(?P<title>(\w|\s|\")*)\((?P<art_entry>(.|\n)*?)\)")

# 1. External references

In [33]:
#Show marking of interesting things
print_highlighted(
    acts.head(),
    [
        match_with_index(dzu_pat,acts.head()),
        match_with_index(year_pat,acts.head()),
        match_with_index(nr_pat,acts.head()),
        match_with_index(pos_pat,acts.head()),
    ],
    [
        ("blue","on_grey"),
        ("green","on_grey"),
        ("red","on_grey"),
        ("yellow","on_grey"),
    
    ]
)





[40m[34mDz.U.[0m z [40m[32m2000[0m r. [40m[31mNr 50[0m, [40m[33mpoz. 581[0m
                                                                              
                                                                              
                                                                              
                                                                              
                                    USTAWA
                            z dnia 26 maja [40m[32m2000[0m r.
                                       
  o zmianie ustawy o niektórych formach popierania budownictwa mieszkaniowego
          oraz o zmianie ustawy o pracowniczych ogrodach działkowych
                                       
                                       
                                    Art. 1.
W ustawie z dnia 26 października [40m[32m1995[0m r. o niektórych formach popierania
budownictwa mieszkaniowego ([40m[34mDz.U.[0m [40m[31mNr 133[0m, [40m[33mpoz. 6

In [26]:
ExtRef= namedtuple("ExtRef", "year nr poz title")

def extract_group_name(match:Match) -> str:
    return list(match.groupdict().keys())[0]

#Assumption: entry without year  takes year and title from outside
def art_entry_extractor(title:str,default_year:str,entry:str)-> List[ExtRef]:
    matches = seq([year_pat,nr_pat,pos_pat])\
    .flat_map(lambda pat: list(pat.finditer(entry)))\
    .order_by(lambda x: x.start())
    
    year_found = False
    year = default_year
    nr = None
    res = []    
    for match in matches:
        name = extract_group_name(match)
        if name == "year":
            year_found = True
            year = match.groupdict()[name]
            if type(year) != str:
                raise "Oh no! Year is supposed to be str."
        elif name =="nr":
            nr = match.groupdict()[name]
            if type(nr) != str:
                raise "Oh no! Nr is supposed to be str."
        elif name =="poz":
            poz = match.groupdict()[name]
            if type(poz) != str:
                raise "Oh no! Poz is supposed to be str."
            res.append(ExtRef(year=year,nr=nr,poz=poz, title=None if year_found else title))
        else:
            raise "Disaster here; Match name not found ion art_entry_extractor"
    return res


def art_match_extractor(match:Match)-> List[ExtRef]:
    d = match.groupdict()
    title = re.sub("[\s\n]+"," ",d["title"]).strip()
    return art_entry_extractor(title,d["year"],d["art_entry"])

    
def foot_match_extractor(match:Match)-> List[ExtRef]:
    return art_entry_extractor(None,None, match.groupdict()["foot_entry"])

In [52]:
def ext_ref_extractor(text:str) -> Sequence: #Sequence[ExtRef]:
    art_matches = seq(list(art_ext_pat.finditer(text,timeout=5))).flat_map(art_match_extractor)
    footer_matches = seq(list(footer_ext_pat.finditer(text,timeout=5))).flat_map(foot_match_extractor)
    return  art_matches + footer_matches



CountedExtRef= namedtuple("CountedExtRef", "count year nr poz title")
def ext_ref_counter(refs:Sequence)-> CountedExtRef:#Sequence[ExtRef] -> Sequence[CountedExternalRef]
    def counted_from_ext_ref(count:int,ref:ExtRef) -> CountedExtRef:
        return CountedExtRef(count=count,year=ref.year,nr=ref.nr,poz=ref.poz,title=ref.title)
    
    def count_and_select_best(t:Tuple[str,Sequence])-> CountedExtRef:
        if len(t[1]) == 0:
            raise "Aggregated tuple shouldn't be empty"
        with_title = seq(t[1]).find(lambda ref:ref.title != None)
        return counted_from_ext_ref(len(t[1]), t[1][0] if with_title is None else with_title)
        
    return refs\
        .filter(lambda ref : ref.year != None and ref.poz != None)\
        .group_by(lambda ref : ref.year+ "-"+ref.poz).map(count_and_select_best)

def ext_ref_counter_aggregate(refs:Sequence)-> CountedExtRef:#Sequence[CountedExtRef] -> Sequence[CountedExternalRef]
    
    def count_and_select_best(t:Tuple[str,Sequence])-> CountedExtRef:
        if len(t[1]) == 0:
            raise "Aggregated tuple shouldn't be empty"
            
        t1seq = seq(t[1])
        count_sum = t1seq.map(lambda ref:ref.count).sum()
        with_title = t1seq.find(lambda ref:ref.title != None)
        res = t1seq.head() if with_title is None else with_title
        return CountedExtRef(count = count_sum,year = res.year, nr = res.nr, poz= res.poz,title= res.title)
        
    return refs\
        .group_by(lambda ref : ref.year+ "-"+ref.poz).map(count_and_select_best)


def display_seq(sequence:Sequence,rows:int)-> None:
    sequence._repr_html_= lambda :sequence.tabulate(rows,tablefmt='html')
    display(sequence)
    sequence._repr_html_= lambda :sequence.tabulate(10,tablefmt='html')

#Possible improvement - extract title from existing articles, not just from text before referencing
res = ext_ref_counter_aggregate(acts.map(ext_ref_extractor).flat_map(ext_ref_counter))\
    .order_by(lambda x : x.poz)\
    .order_by(lambda x : int(x.year))\
    .order_by(lambda x: 1/x.count)
display_seq(
    res,
    50
)



count,year,nr,poz,title
737,1998,106.0,668,
482,1996,106.0,496,o Rządowym Centrum Studiów Strategicznych
372,1997,121.0,770,""" Kodeks celny"
367,2000,12.0,136,o zmianie niektórych ustaw związanych z funkcjonowaniem administracji publicznej
292,1997,88.0,554,Przepisy wprowadzające Kodeks karny
274,1997,28.0,153,o powszechnym ubezpieczeniu zdrowotnym
234,1990,34.0,198,o podziale zadań i kompetencji określonych w ustawach szczególnych pomiędzy organy gmin a organy administracji rządowej oraz o zmianie niektórych ustaw
232,2000,120.0,1268,
226,1998,162.0,1118,o systemie ubezpieczeń społecznych
204,1997,141.0,943,o działach administracji rządowej


# 2. Internal references

#### TODO

# 3 Ustawa count

In [45]:
ustawa_pat = re.compile("(ustawa|ustawy|ustawie|ustawę|ustawą|ustawo|ustaw|ustawom|ustawami|ustawach)",flags= re.I)

In [50]:
acts.map(lambda act: len(list(ustawa_pat.finditer(act)))).sum()

25940