## Jakub Darul
> NLP 2019
>
> Lab 1: Regex


#### <span style="color:orange">Better to look at this notebook locally </span>- github doesn't display colored prints (termcolor)  and tables are displayed poorly

In [19]:
import regex as re
from functional import seq
from functional.pipeline import Sequence
from fn import _
from collections import namedtuple
from regex.regex import Match, Pattern
from typing import List, Dict, Tuple, Optional
from termcolor import colored
import os
from math import inf

In [21]:
act_dir = os.environ["HOME"]+"/tmp/nlp/ustawy"
act_names = seq(os.listdir(act_dir))

acts = act_names.map(lambda fn : open("{}/{}".format(act_dir,fn)).read())

In [22]:
def match_with_index(pattern ,text:str)-> List[Tuple[int,int,str]]:
    return [(m.start(), m.end(),m.captures()[0]) for m in re.finditer(pattern, text)]


def print_highlighted(
    text:str,
    matches_groups:List[List[Tuple[int,int]]],
    colors:List[Tuple[str,Optional[str]]]
):
    if len(matches_groups) != len(colors):
        raise "There should be the same number of matches groups and colors"
    
    MatchedEntry=namedtuple('MatchedEntry','beg end color on_color')
    
    # Seq[MatchedEntry]
    mg = seq(matches_groups)\
        .zip(colors)\
        .flat_map(lambda gc: seq(gc[0])\
            .map(lambda x : MatchedEntry(beg=x[0],end=x[1],color=gc[1][0], on_color = gc[1][1]))
                 )\
        .order_by(_.beg)
        
    # Will duplicate some matches but that's not the point
    beg = 0
    for matched in mg:
        print(text[beg:matched.beg],end="") 
        print(colored(text[matched.beg:matched.end],color= matched.color, on_color= matched.on_color),end="")
        beg = matched.end
    print(text[beg:])

In [23]:
dzu_pat = re.compile("Dz\.?\s?U.?",re.I)
year_pat = re.compile("(?P<year>((19\d)|(200)|(201))\d{1})")
nr_pat = re.compile("Nr\s+(?P<nr>\d+)",re.I)
pos_pat = re.compile("poz\.\s+(?P<poz>\d+((-|,)\d+)*)",re.I)

#Entry extractors
footer_ext_pat = re.compile(r"\[\d+\]\)?\s+Zmiany(\w|\s|\n)*?w\s+Dz\.*\s*U\.?(?P<foot_entry>(.|\s|\n)*?)(?=(\[|\Z))")
art_ext_pat=re.compile(r"(U|u)staw\w*\s+(z\s+dnia\s+)+\d+\s+\w+\s+(?P<year>\d{4})(\s|\n)*r\.?\s*(–|\-)*\s*(?P<title>(\w|\s|\")*)\((?P<art_entry>(.|\n)*?)\)")

# 1. External references

In [24]:
#Show marking of interesting things. Things are matched here just for reference, extracting will happen differently.
print_highlighted(
    acts.head(),
    [
        match_with_index(dzu_pat,acts.head()),
        match_with_index(year_pat,acts.head()),
        match_with_index(nr_pat,acts.head()),
        match_with_index(pos_pat,acts.head()),
    ],
    [
        ("blue","on_grey"),
        ("green","on_grey"),
        ("red","on_grey"),
        ("yellow","on_grey"),
    
    ]
)





[40m[34mDz.U.[0m z [40m[32m2001[0m r. [40m[31mNr 81[0m, [40m[33mpoz. 874[0m
                                        
                                        
                                     USTAWA
                           z dnia 21 czerwca  [40m[32m2001[0m r.
                                        
                  o zmianie ustawy o Państwowej Straży Pożarnej
                                        
                                        
                                     Art. 1.
W ustawie z dnia 24 sierpnia [40m[32m1991[0m r. o Państwowej Straży Pożarnej ([40m[34mDz.U.[0m [40m[31mNr 88[0m,
[40m[33mpoz. 400[0m z [40m[32m1992[0m r. [40m[31mNr 21[0m, [40m[33mpoz. 86[0m i [40m[31mNr 54[0m, [40m[33mpoz. 254[0m, z [40m[32m1994[0m r. [40m[31mNr 53[0m, [40m[33mpoz. 214[0m,
z [40m[32m1995[0m r. [40m[31mNr 4[0m, [40m[33mpoz. 17[0m i [40m[31mNr 34[0m, [40m[33mpoz. 163[0m, z [40m[32m1996[0m r. [40m[31mNr 106

In [25]:
ExtRef= namedtuple("ExtRef", "year nr poz title")

def extract_group_name(match:Match) -> str:
    return list(match.groupdict().keys())[0]

#Assumption: entry without year  takes year and title from outside (before parenthesis)
def art_entry_extractor(title:str,default_year:str,entry:str)-> List[ExtRef]:
    matches = seq([year_pat,nr_pat,pos_pat])\
    .flat_map(lambda pat: list(pat.finditer(entry)))\
    .order_by(lambda x: x.start())
    
    year_found = False
    year = default_year
    nr = None
    res = []    
    for match in matches:
        name = extract_group_name(match)
        if name == "year":
            year_found = True
            year = match.groupdict()[name]
        elif name =="nr":
            nr = match.groupdict()[name]
        elif name =="poz":
            poz = match.groupdict()[name]
            res.append(ExtRef(year=year,nr=nr,poz=poz, title=None if year_found else title))
        else:
            raise "Disaster here; Match name not found ion art_entry_extractor"
    return res


def art_match_extractor(match:Match)-> List[ExtRef]:
    d = match.groupdict()
    title = re.sub("[\s\n]+"," ",d["title"]).strip()
    return art_entry_extractor(title,d["year"],d["art_entry"])

    
def foot_match_extractor(match:Match)-> List[ExtRef]:
    return art_entry_extractor(None,None, match.groupdict()["foot_entry"])

In [26]:
def ext_ref_extractor(text:str) -> Sequence: #Sequence[ExtRef]:
    art_matches = seq(list(art_ext_pat.finditer(text,timeout=5))).flat_map(art_match_extractor)
    footer_matches = seq(list(footer_ext_pat.finditer(text,timeout=5))).flat_map(foot_match_extractor)
    return  art_matches + footer_matches



CountedExtRef= namedtuple("CountedExtRef", "count year nr poz title")
def ext_ref_counter(refs:Sequence)-> Sequence:#Sequence[ExtRef] -> Sequence[CountedExternalRef]
    def counted_from_ext_ref(count:int,ref:ExtRef) -> CountedExtRef:
        return CountedExtRef(count=count,year=ref.year,nr=ref.nr,poz=ref.poz,title=ref.title)
    
    def count_and_select_best(t:Tuple[str,Sequence])-> CountedExtRef:
        if len(t[1]) == 0:
            raise "Aggregated tuple shouldn't be empty"
        with_title = seq(t[1]).find(lambda ref:ref.title != None)
        return counted_from_ext_ref(len(t[1]), t[1][0] if with_title is None else with_title)
        
    def flatten_range(ref:ExtRef)-> List[ExtRef]:
        def builder(poz:str):
            return ExtRef(year=ref.year, nr = ref.nr, title= ref.title, poz= poz)
        if "–" in ref.poz or "-" in ref.poz:
            pozs = re.compile("-|–").split(ref.poz)
            if len(pozs) ==2:
                if seq(pozs).for_all(lambda x:x.isdigit()):
                    p1 = int(pozs[0]); p2 = int(pozs[1])
                    return seq(range(min(p1,p2),max(p1,p2)+1)).map(str).map(builder).to_list()
                else:
                    return [builder(pozs[0]),builder(pozs[1])]
            else:
                return [ref]
        else: 
            return seq(ref.poz.split(",")).map(lambda x: x.strip()).map(builder).to_list()
        
    return refs\
        .filter(lambda ref : ref.year != None and ref.poz != None)\
        .flat_map(flatten_range)\
        .group_by(lambda ref : ref.year+ "-"+ref.poz).map(count_and_select_best)

def ext_ref_counter_aggregate(refs:Sequence)-> CountedExtRef:#Sequence[CountedExtRef] -> Sequence[CountedExternalRef]
    
    def count_and_select_best(t:Tuple[str,Sequence])-> CountedExtRef:
        refs_2_count = seq(t[1])
        count_sum = refs_2_count.map(lambda ref:ref.count).sum()
        with_title = refs_2_count.find(lambda ref:ref.title != None)
        res = refs_2_count.head() if with_title is None else with_title
        return CountedExtRef(count = count_sum,year = res.year, nr = res.nr, poz= res.poz,title= res.title)
        
    return refs\
        .group_by(lambda ref : ref.year+ "-"+ref.poz).map(count_and_select_best)

def display_seq(sequence:Sequence,rows:int)-> None:
    sequence._repr_html_= lambda :sequence.tabulate(rows,tablefmt='html')
    display(sequence)
    sequence._repr_html_= lambda :sequence.tabulate(10,tablefmt='html')

def ext_refs(act:str) -> Sequence: #Sequence[CountedExtRef]
    return ext_ref_counter(ext_ref_extractor(act))\
    .order_by(lambda x : x.poz)\
    .order_by(lambda x : int(x.year))\
    .order_by(lambda x: 1/x.count)
    
#Possible improvement - extract title from existing articles, not just from text before referencing
def global_refs(acts:Sequence)-> Sequence : #Sequence[str] -> Sequence[CountedExtRef]
    return ext_ref_counter_aggregate(acts.map(ext_ref_extractor).flat_map(ext_ref_counter))\
        .order_by(lambda x : x.poz)\
        .order_by(lambda x : int(x.year))\
        .order_by(lambda x: 1/x.count)
        
        
display_seq(
    global_refs(acts),
    30
)



count,year,nr,poz,title
737,1998,106,668,
482,1996,106,496,o Służbie Więziennej
372,1997,121,770,Kodeks celny
367,2000,12,136,o zmianie niektórych ustaw związanych z funkcjonowaniem administracji publicznej
292,1997,88,554,Przepisy wprowadzające Kodeks karny
274,1997,28,153,o powszechnym ubezpieczeniu zdrowotnym
234,1990,34,198,o podziale zadań i kompetencji określonych w ustawach szczególnych pomiędzy organy gmin a organy administracji rządowej oraz o zmianie niektórych ustaw
232,2000,120,1268,
226,1998,162,1118,o systemie ubezpieczeń społecznych
204,1997,141,943,o działach administracji rządowej


# 2. Internal references

In [27]:
single_regex = r"(?P<single_ust>\d+\p{L}{0,2}\b)"
i_regex = r"(?P<i_ust_1>\d+\p{L}{0,3})(\s*(,|i|oraz)\s+)(?P<i_ust_2>\d+\p{L}{0,3})"
dash_regex= r"(?P<dash_ust_1>\d+\p{L}{0,3})(\s*(\-|do|–)\s*(?P<dash_ust_2>\d+\p{L}{0,3}))"

ustaw_regex = "ust\.*[\s\n]("+ i_regex +"|"+ dash_regex+ "|"+ single_regex+")"
art_ust_regex = r"(art\.*[\s\n]*(?P<art>\d+\p{L}{0,3})[\s\n]+(w[\s\n]+)?)?"+ustaw_regex
art_ust_ust_regex = r"({}|{})".format(art_ust_regex,ustaw_regex)
art_ust_pat= re.compile(art_ust_regex,re.I)

In [28]:
act = open("{}/{}".format(act_dir,"2004_1001.txt")).read()
art_ust_matches = match_with_index(art_ust_pat,act)

print_highlighted(
    act,
    [
        art_ust_matches,
    ],
    [
        ("blue","on_grey"),
    ]
)




Tekst ustawy przyjęty przez Senat bez poprawek

 
 
 
USTAWA
z dnia 26 września 2014 r.
 
o zmianie ustawy o podatku dochodowym od osób
fizycznych 
oraz niektórych innych ustaw[1])
 
 
Art. 1. 
W ustawie z dnia 26 lipca 1991 r. o podatku
dochodowym od osób fizycznych (Dz. U. z 2012 r. poz. 361, z późn. zm.[2])) wprowadza się następujące zmiany:
1)   w [40m[34mart. 35 ust. 10[0m otrzymuje brzmienie:
„10. Płatnicy
stypendiów, o których mowa w [40m[34mart. 21 ust. 1[0m pkt 40b, są obowiązani w terminie
do końca lutego roku następującego po roku podatkowym, z zastrzeżeniem [40m[34mart.
45ba ust. 4[0m, sporządzić informację o wysokości wypłaconego stypendium, według
ustalonego wzoru, i przesłać ją podatnikowi oraz urzędowi skarbowemu, którym
kieruje naczelnik urzędu skarbowego właściwy według miejsca zamieszkania
podatnika, z zastrzeżeniem art. 37.”;
2)   w [40m[34mart. 37 ust. 3[0m otrzymuje brzmienie:
„3. Roczne obliczenie
podatku, o którym mowa w [40m[34must. 1[0m, płat

In [29]:
InterRef = namedtuple("InterRef", "art ust")
def art_ust_extractor(match:Match) -> List[InterRef]:
    gd = match.groupdict()
    #small sanity checks
    def extract_usts():
        not_nones = seq(gd.items()).filter(lambda x: x[1] is not None)
        keys = not_nones.map(lambda x: x[0])

        if keys.count(lambda x:x.startswith("dash")) != 2\
        and keys.count(lambda x:x.startswith("i_")) != 2\
        and keys.count(lambda x:x == "single_ust") != 1:
            raise Exception("Hey, match dict not ok :{} {}".format(gd,keys))
        
        if gd["dash_ust_1"] is not None:
            du1 = gd["dash_ust_1"]
            du2 = gd["dash_ust_2"]
            if du1.isdigit() and du2.isdigit():
                du1= int(du1); du2= int(du2)
                #Possible improvement(not sure how though): now if we have `ust. 1-3` we just include 1,2,3, but not possibly 1, 2, 2a, 2b, 3
                return seq(range(min(du1,du2),max(du1,du2)+1)).map(str).to_list()
            else:
                return [du1,du2]
        elif gd["i_ust_1"] is not None:
            return [gd["i_ust_1"],gd["i_ust_2"]]
        elif gd["single_ust"] is not None:
            return [gd["single_ust"]]
        else:
            raise "No ust was matched"
    art = gd["art"]
    return seq(extract_usts()).map(lambda ust : InterRef(art = art, ust = ust)).to_list()
  
CountedInterRef = namedtuple("CountedInterRef", "count art ust")
def inter_ref_counter(ref:Sequence)-> Sequence: # Seq[InterRef, CountedInterRef]
    return ref.group_by(lambda x: x.ust if x.art is None else x.art + x.ust)\
    .map(lambda refs:CountedInterRef(count = len(refs[1]),art = refs[1][0].art, ust = refs[1][0].ust))
    
def internal_refs(act:str)-> Sequence: #Sequence [CountedInterRef] 
    return inter_ref_counter(
        seq(list(art_ust_pat.finditer(act))).flat_map(art_ust_extractor)
    ).order_by(lambda x: 1/x.count)

### Counted Internal references 
> aggregated by (art, ust) ( or only ust if art is not referenced): 

In [33]:
def count_and_seq(refs:Sequence) -> Sequence:
    count = refs.map(lambda x: x.count).sum()
    return (count,refs)


all_internal = acts.map(internal_refs).map(count_and_seq).order_by(lambda x: 1/x[0] if x[0] != 0 else inf)

internal_to_display = all_internal.take(5).zip(act_names)
for bill, name  in  internal_to_display.to_list():
    (count,refs) = bill
    print("{}, count: {}".format(name,count))
    display_seq(
        refs,
        12
    )

2001_874.txt, count: 1938


count,art,ust
275,,1
213,,2
191,,3
122,,4
70,,5
50,,6
19,,7
19,,1a
19,,2a
16,,3a


1996_583.txt, count: 897


count,art,ust
142,,1
69,,2
46,,3
30,,5
24,13.0,3
22,,4
19,,8
16,48.0,1
16,,10
14,,11


2003_1853.txt, count: 894


count,art,ust
13,,1
8,13.0,1
6,,2
5,27.0,2
5,,5
5,14.0,3
5,,6
4,12.0,1
4,9.0,2
4,10.0,4


1997_753.txt, count: 840


count,art,ust
174,,1
120,,2
90,,3
58,,4
39,,5
21,,6
14,6.0,1
13,139.0,1
9,,8
8,6.0,2


2000_440.txt, count: 754


count,art,ust
180,,1
83,,2
70,,3
36,,4
25,46.0,4
15,,5
13,51.0,1
11,181.0,1
9,40.0,1
9,,6


# 3 Ustawa count

In [34]:
ustawa_pat = re.compile(r"\b(ustawa|ustawy|ustawie|ustawę|ustawą|ustawo|ustaw|ustawom|ustawami|ustawach)\b",flags= re.I)

In [35]:
acts.map(lambda act: len(list(ustawa_pat.finditer(act)))).sum()

25092