# Regular expressions (aka regexps)
## Solution of https://github.com/apohllo/nlp/blob/master/1-regexp.md

### Task 1

In [763]:
from os import listdir
from os.path import isfile, join
from typing import List, Set, Dict, Tuple, Optional
import regex

In [764]:
# Module-level registry of referenced bills, filled in by count_bill() and
# handle_first_reference(). Keys have the form "<year>rpoz<position>" and the
# values are BillRecord instances.
reference_dict = {}

In [765]:
class BillRecord:
    """One externally referenced bill together with its reference count.

    The constructor stores ``year``, ``position``, ``title`` and ``number``
    unchanged and starts ``occurrences`` at 1, i.e. creating a record counts
    as the first sighting of the bill.
    """

    def __init__(self, year, position, title="", number=None):
        self.year, self.position = year, position
        self.title, self.number = title, number
        self.occurrences = 1

    def increment_occurrences(self):
        """Register one more reference to this bill."""
        self.occurrences = self.occurrences + 1

    def __str__(self):
        """Return the multi-line summary used when printing the ranking."""
        template = "year:{},\n position:{}, \n title:{}, \n occurences:{}, \n"
        return template.format(
            self.year, self.position, self.title, self.occurrences
        )

In [766]:
def load_bills(directory_path: str):
    """Feed every regular file in *directory_path* to ``count_bill``.

    Sub-directories are skipped. Files are visited in sorted name order so
    that repeated runs process the corpus in the same sequence.

    Args:
        directory_path: Directory containing the bill text files.
    """
    for file_name in sorted(listdir(directory_path)):
        file_path = join(directory_path, file_name)
        if not isfile(file_path):
            continue
        # The bills contain Polish diacritics; decode explicitly instead of
        # relying on the platform's locale encoding.
        with open(file_path, encoding="utf-8") as input_file:
            input_bill = input_file.read()
        count_bill(input_bill)

In [767]:
def find_references(bill_text: str):
    """Return every external bill reference found in *bill_text*.

    A reference starts with a word beginning with "Ustaw"/"ustaw", followed by
    " z ", free text, and ends with a parenthesised "(Dz. U. ...)" citation.

    Args:
        bill_text: Full text of one bill.

    Returns:
        List of matched substrings (empty when nothing matches).
    """
    # Raw string: the pattern is full of regex escapes (\w, \., \s ...) that
    # are invalid *Python* string escapes. The compiled pattern is unchanged,
    # because \t, \n, \" and \' mean the same thing to the regex engine.
    pattern = (
        r'[Uu]staw[\w\.]*[\s]*z '
        r'[ \t\w\n \. \, \; \[\]\-\+\*\/\"\'\?\>\;\!\”\:]*'
        r'\(Dz\.[ ]*U\.[ ]*[ \t\w\n \. \, \; \[\]\-\+\*\/]*\)'
    )
    return regex.findall(pattern, bill_text)


In [768]:
def extract_references(reference_text: str):
    """Return the first "(Dz. U. ...)" citation inside *reference_text*.

    Args:
        reference_text: One reference as produced by ``find_references``.

    Returns:
        The first parenthesised citation, or ``None`` when there is none.
    """
    # Raw string avoids invalid Python escape sequences (\(, \., \w ...);
    # the regex engine sees the same pattern as before.
    pattern = r'\(Dz\.[ ]*U\.[ ]*[ \t\w\n \. \, \; \[\]\-\+\*\/]*\)'
    extracted = regex.findall(pattern, reference_text)
    return extracted[0] if extracted else None

In [769]:
def divide_references(references:  str):
    """Split a "(Dz. U. ...)" citation into its individual pieces.

    Args:
        references: Citation text, normally the value returned by
            ``extract_references``.

    Returns:
        List of matched pieces. When the first piece contains no "poz."
        position (as judged by ``get_position``) it is dropped.
    """
    # Raw strings avoid invalid Python escape sequences; concatenated, the
    # four parts form exactly the original alternation.
    pattern = (
        # "(" ... up to the nearest " z"
        r'\([ \t\w\n \. \, \; \[\]\-\+\*\/\"\'\?\>\;\!\”]*? z'
        # four digits ... up to the nearest " z "
        r'|[0-9]{4}[ \t\w\n \. \, \; \[\]\-\+\*\/\"\'\?\>\;\!\”]*? z '
        # four digits ... up to the nearest ")"
        r'|[0-9]{4}[ \t\w\n \. \, \; \[\]\-\+\*\/\"\'\?\>\;\!\”]*?\)'
        # a whole parenthesised group
        r'|\([ \t\w\n \. \, \; \[\]\-\+\*\/\"\'\?\>\;\!\”]*\)'
    )
    divided = regex.findall(pattern, references)
    if divided and not get_position(divided[0]):
        return divided[1:]
    return divided

In [770]:
def get_year_from_reference(reference: str):
    """Return all "<four digits> r." fragments found in *reference*.

    Args:
        reference: One piece of a citation.

    Returns:
        List of matched substrings, each still including the trailing "r.".
    """
    # Raw string: \s and \. are regex escapes, not valid Python string escapes.
    return regex.findall(r'[0-9]{4}[\s]*r\.', reference)

In [771]:
def get_number_and_position_from_reference(reference: str):
    """Return the "Nr ... poz. ..." / "poz. ..." fragments of *reference*.

    Each element of the returned list is one matched substring; the list is
    empty when *reference* contains no such fragment.
    """
    nr_poz_pattern = r"Nr[ \s]*[0-9]*[\,\.\/  \s]*poz\.[\s]*[0-9\s\'i\.\{oraz}]*|poz\.[\s]*[0-9\s\'i\.\{oraz}]*"
    fragments = regex.findall(nr_poz_pattern, reference)
    return fragments

In [772]:
def get_number(ref: str):
    """Return every "Nr <digits>" fragment found in *ref* as a list."""
    nr_pattern = r"Nr[\s]*[0-9]+"
    found = regex.findall(nr_pattern, ref)
    return found

In [773]:
def get_position(ref: str):
    """Return every "poz. <digits>" fragment found in *ref* as a list."""
    poz_pattern = r"poz\.[\s]*[0-9]+"
    found = regex.findall(poz_pattern, ref)
    return found

In [774]:
def get_text_before_parenthesis(reference_text:str):
    """Return the runs of text that directly precede a "(Dz. U. ...)" group.

    The pattern uses a lookahead, so the citation itself is not part of any
    returned match. Because the leading character run may be empty, matches
    can be empty strings.
    """
    leading_text_pattern = r"[\w\n\s\,\:\;\.\-]*(?=\(Dz\.[ ]*U\.[ ]*[ \t\w\n \. \, \; \[\]\-\+\*\/]*\))"
    leading_parts = regex.findall(leading_text_pattern, reference_text)
    return leading_parts

In [775]:
def get_bill_year_from_text(text:str):
    """Return every run of four consecutive digits found in *text*."""
    four_digits = r"[0-9]{4}"
    candidates = regex.findall(four_digits, text)
    return candidates

In [776]:
def get_bill_title(text:str):
    """Return the text runs that directly follow each "r." in *text*.

    The "r." itself is matched with a lookbehind and is therefore not part of
    the returned strings; a run may be empty.
    """
    after_year_pattern = r"(?<=r\.)[\s\w\,\.\?\/\'\"\-]*"
    title_candidates = regex.findall(after_year_pattern, text)
    return title_candidates

In [777]:
def _register_reference(year, position, number):
    """Create or update the ``reference_dict`` entry for one cited bill."""
    key = f"{year}rpoz{position}"
    record = reference_dict.get(key)
    if record is None:
        reference_dict[key] = BillRecord(year, position, number=number)
        return
    record.increment_occurrences()
    # A missing number is stored as None (BillRecord's default), so test for
    # "no number yet" by truthiness; the former `== ''` check never fired.
    if not record.number and number:
        record.number = number


def count_bill(bill:str):
    """Count all external bill references found in one bill.

    Every reference returned by ``find_references`` is split into pieces with
    ``divide_references``. The first "Nr/poz." entry of the first piece is
    registered by ``handle_first_reference`` (which also resolves the year
    and title); every remaining entry is registered here in the module-level
    ``reference_dict``.

    Args:
        bill: Full text of one bill.
    """
    for reference in find_references(bill):
        extracted_reference = extract_references(reference)
        divided_reference = divide_references(extracted_reference)
        if not divided_reference:
            continue

        first_reference = divided_reference[0]
        first_year = handle_first_reference(first_reference, reference)
        # Entry [0] was already registered by handle_first_reference; the
        # rest of the first piece shares its year.
        for nr_pos in get_number_and_position_from_reference(first_reference)[1:]:
            _register_reference(
                first_year, get_bill_position(nr_pos), get_bill_number(nr_pos)
            )

        # Later pieces carry their own year.
        for divided in divided_reference[1:]:
            year = handle_year(get_year_from_reference(divided))
            for nr_pos in get_number_and_position_from_reference(divided):
                _register_reference(
                    year, get_bill_position(nr_pos), get_bill_number(nr_pos)
                )

In [778]:
def handle_first_reference(first_reference: str, whole_reference:str):
    """Register the first cited bill of a reference, including its title.

    The year is taken from *first_reference*; when it has none, the four-digit
    runs in the text preceding the "(Dz. U. ...)" group are used instead. The
    title is taken from that same preceding text.

    Args:
        first_reference: First piece returned by ``divide_references``.
        whole_reference: The complete reference the piece was cut from.

    Returns:
        The resolved year (possibly ``None``), or ``None`` when
        *first_reference* contains no "poz." entry at all.
    """
    text = get_text_before_parenthesis(whole_reference)[0]
    year = get_year_from_reference(first_reference)
    if not year:
        year = get_bill_year_from_text(text)
    year = handle_year(year)

    number_positions = get_number_and_position_from_reference(first_reference)
    if not number_positions:
        return None
    first_number = get_bill_number(number_positions[0])
    first_position = get_bill_position(number_positions[0])
    title = clean_title(get_bill_title(text))

    key = f"{year}rpoz{first_position}"
    existing = reference_dict.get(key)
    if existing is None:
        reference_dict[key] = BillRecord(
            year, first_position, title=title, number=first_number
        )
        return year

    existing.increment_occurrences()
    # Missing titles/numbers may be stored as "" or None, so test truthiness;
    # the former `== ''` comparisons never matched a None number, and the
    # title back-fill could replace "" with None.
    if not existing.title and title:
        existing.title = title
    if not existing.number and first_number:
        existing.number = first_number
    return year


In [779]:
def handle_year(year: List[str]) -> Optional[str]:
    """Reduce a list of year matches to a single four-digit year string.

    Args:
        year: List of matched fragments (e.g. "1997 r."); the last one wins.

    Returns:
        The first run of four digits in the last fragment, or ``None`` when
        the list is empty or that fragment contains no such run.
    """
    if not year:
        return None
    # search() instead of findall()[0]: no IndexError when nothing matches.
    match = regex.search(r"[0-9]{4}", year[-1])
    return match.group(0) if match else None

In [780]:
def get_bill_number(number_position: str):
    """Return the digits of the first "Nr <digits>" fragment, or ``None``."""
    nr_fragments = get_number(number_position)
    if nr_fragments != []:
        digit_runs = regex.findall(r"[0-9]+", nr_fragments[0])
        return digit_runs[0]
    return None

In [781]:
def get_bill_position(number_position: str):
    """Return the digits of the first "poz. <digits>" fragment, or ``None``."""
    poz_fragments = get_position(number_position)
    if poz_fragments != []:
        return regex.findall(r"[0-9]+", poz_fragments[0])[0]
    return None

In [782]:
def clean_title(title: List[str]):
    """Return the first title candidate with newlines turned into spaces.

    Returns ``None`` when the candidate list is empty.
    """
    if title != []:
        return title[0].replace('\n', ' ')
    return None

In [783]:
# Task 1 driver: scan every file in ./data and fill reference_dict.
load_bills('./data')

In [784]:
# All BillRecord objects collected in reference_dict (a live dict view).
bills =  reference_dict.values()

In [785]:
import json

In [786]:
# Rank the records by reference count, most referenced first.
sorted_bills = sorted(bills, key=lambda x: x.occurrences, reverse=True)

In [787]:
# Print the 15 most referenced bills using BillRecord.__str__.
for bill in sorted_bills[:15]:
    print(bill)

year:1998,
 position:668, 
 title: o zmianie niektórych ustaw określających kompetencje organów administracji publicznej - w związku z reformą ustrojową państwa , 
 occurences:609, 

year:1996,
 position:496, 
 title:      o Rządowym Centrum Studiów Strategicznych , 
 occurences:447, 

year:2000,
 position:136, 
 title: o zmianie niektórych ustaw związanych z funkcjonowaniem administracji publicznej , 
 occurences:342, 

year:1997,
 position:770, 
 title: - Przepisy wprowadzające ustawę o            Krajowym Rejestrze Sądowym , 
 occurences:326, 

year:1997,
 position:554, 
 title: Przepisy wprowadzające Kodeks karny , 
 occurences:268, 

year:1997,
 position:153, 
 title: o powszechnym ubezpieczeniu zdrowotnym , 
 occurences:239, 

year:1990,
 position:198, 
 title: o podziale zadań i kompetencji określonych  w ustawach szczególnych pomiędzy organy gmin a organy administracji rządowej oraz o zmianie niektórych ustaw , 
 occurences:212, 

year:1998,
 position:1118, 
 title: o emerytura

### Task 2

In [788]:
class InternalRecord:
    """One internal "art. X, ust. Y" reference with its occurrence count.

    ``article`` and ``bill`` are stored as given; ``occurrences`` starts at 1
    because a record is created on the first sighting.
    """

    def __init__(self, article, bill):
        self.article, self.bill = article, bill
        self.occurrences = 1

    def increment_occurrences(self):
        """Register one more occurrence of this reference."""
        self.occurrences = self.occurrences + 1

    def __repr__(self):
        """Return the one-line summary (newline-terminated) used in listings."""
        template = "art. {}, ust. {} ocurrences: {}\n"
        return template.format(self.article, self.bill, self.occurrences)

In [789]:
# Per-file results of Task 2, filled by load_bills_internal_ref():
# file name -> (total number of internal references,
#               InternalRecord list sorted by occurrences, descending).
all_internal_references ={}

In [806]:
def load_bills_internal_ref(directory_path: str):
    """Collect internal-reference statistics for every file in a directory.

    For each regular file, ``count_internal`` is run on its text and the
    result is stored in the module-level ``all_internal_references`` under
    the file name as ``(total, records sorted by occurrences, descending)``.
    Files are visited in sorted name order so repeated runs are reproducible.

    Args:
        directory_path: Directory containing the bill text files.
    """
    for file_name in sorted(listdir(directory_path)):
        file_path = join(directory_path, file_name)
        if not isfile(file_path):
            continue
        # The bills contain Polish diacritics; decode explicitly instead of
        # relying on the platform's locale encoding.
        with open(file_path, encoding="utf-8") as input_file:
            input_bill = input_file.read()
        internal_refs, number = count_internal(input_bill)
        sorted_refs = sorted(
            internal_refs.values(), key=lambda x: x.occurrences, reverse=True
        )
        all_internal_references[file_name] = (number, sorted_refs)

In [807]:
def slice_beetwee_art(bill: str):
    """Cut *bill* into chunks that each start with "Art."/"art.".

    A chunk runs either up to (not including) the next "Art."/"art." or up
    to the end of the text, and may only contain the characters allowed by
    the pattern's character class.
    """
    article_pattern = r"[Aa]rt\.[\w\s\,\.\(\)\"\:\;\-]*?(?=[Aa]rt\.)|[Aa]rt\.[\w\s\,\.\(\)\"\:\;\-]*$"
    chunks = regex.findall(article_pattern, bill)
    return chunks

In [808]:
def get_art_number(article:str):
    """Return the digits of the first "Art. <digits>" marker, or ``None``."""
    markers = regex.findall(r"[Aa]rt\.[\s]*[0-9]+", article)
    if markers != []:
        return regex.findall(r"[0-9]+", markers[0])[0]
    return None

In [809]:
def get_bills(article: str):
    """Return every "ust. ..." fragment found in *article*.

    Each fragment starts with "Ust."/"ust." and continues over the
    characters allowed by the pattern's character class.
    """
    ust_pattern = r"[Uu]st\.[\s]*[\{0-9}\s\,\.\;i\{oraz}]+"
    fragments = regex.findall(ust_pattern, article)
    return fragments

In [810]:
def get_bill_numbers(bill_s:str):
    """Return all digit runs contained in *bill_s*, in order of appearance."""
    digits_pattern = r"[0-9]+"
    numbers = regex.findall(digits_pattern, bill_s)
    return numbers

In [811]:
def count_internal(bill:str):
    """Count internal "art. X, ust. Y" references inside one bill.

    The bill is cut into article chunks; within each chunk every "ust."
    fragment that contains a number counts as one reference to the first
    number in that fragment.

    Returns:
        Tuple ``(records, total)`` where ``records`` maps
        ``"art<X>ust<Y>"`` to an ``InternalRecord`` and ``total`` is the
        number of counted references.
    """
    records = {}
    total = 0
    for chunk in slice_beetwee_art(bill):
        art_no = get_art_number(chunk)
        for ust_fragment in get_bills(chunk):
            numbers = get_bill_numbers(ust_fragment)
            if numbers == []:
                continue
            ust_no = numbers[0]
            total += 1
            key = f"art{art_no}ust{ust_no}"
            known = records.get(key)
            if known is None:
                records[key] = InternalRecord(art_no, ust_no)
            else:
                known.increment_occurrences()
    return (records, total)

In [812]:
# Task 2 driver: fill all_internal_references from every file in ./data.
load_bills_internal_ref('./data')

In [813]:
# (file name, (total references, sorted records)) pairs.
items = all_internal_references.items()

In [814]:
# Order files by their total number of internal references, largest first.
sorted_items = sorted(items, key=lambda entry: entry[1][0], reverse=True)

In [815]:
# Show the 15 files with the most internal references.
print(sorted_items[:15])

[('2000_696.txt', (1711, [art. 10, ust. 1 ocurrences: 35
, art. 6, ust. 1 ocurrences: 27
, art. 5, ust. 1 ocurrences: 26
, art. 11, ust. 1 ocurrences: 26
, art. 8, ust. 1 ocurrences: 22
, art. 2, ust. 1 ocurrences: 22
, art. 19, ust. 1 ocurrences: 21
, art. 10, ust. 2 ocurrences: 20
, art. 23, ust. 1 ocurrences: 20
, art. 7, ust. 1 ocurrences: 18
, art. 4, ust. 1 ocurrences: 17
, art. 10, ust. 3 ocurrences: 17
, art. 6, ust. 2 ocurrences: 16
, art. 3, ust. 1 ocurrences: 16
, art. 11, ust. 3 ocurrences: 16
, art. 13, ust. 2 ocurrences: 15
, art. 22, ust. 1 ocurrences: 15
, art. 20, ust. 1 ocurrences: 15
, art. 9, ust. 2 ocurrences: 13
, art. 13, ust. 1 ocurrences: 13
, art. 14, ust. 1 ocurrences: 13
, art. 13, ust. 3 ocurrences: 13
, art. 36, ust. 1 ocurrences: 13
, art. 8, ust. 2 ocurrences: 12
, art. 9, ust. 1 ocurrences: 12
, art. 21, ust. 1 ocurrences: 12
, art. 34, ust. 1 ocurrences: 12
, art. 1, ust. 1 ocurrences: 11
, art. 2, ust. 3 ocurrences: 11
, art. 17, ust. 2 ocurrences: 11

### Task 3

In [816]:
# Whole-word pattern: the letters "ustaw" in any letter case, followed by
# optional runs of letters from a fixed sequence of character classes.
# NOTE(review): presumably meant to cover the inflected forms of "ustawa";
# since every class may repeat, it can also accept strings that are not real
# word forms — confirm this is acceptable for the count.
ustawa_pattern = r"\b[Uu][Ss][Tt][Aa][Ww][Aa]*[Yy]*[IiEe]*[ęĘ]*[ąĄ]*[oO]*[oOMm]*[aAmMiI]*[aAcChH]*\b"

In [817]:
def count_all_bill_word(directory_path: str):
    """Print the total number of ``ustawa_pattern`` matches in a directory.

    Every regular file in *directory_path* is read and passed to
    ``count_in_one_bill``; the per-file counts are summed and printed.

    Args:
        directory_path: Directory containing the bill text files.
    """
    bill_counter = 0
    for file_name in listdir(directory_path):
        file_path = join(directory_path, file_name)
        if not isfile(file_path):
            continue
        # The pattern contains Polish letters (ę, ą), so the text must be
        # decoded with a known encoding rather than the locale default.
        with open(file_path, encoding="utf-8") as input_file:
            bill_counter += count_in_one_bill(input_file.read())
    print(bill_counter)

In [818]:
def count_in_one_bill(bill:str):
    """Return how many times ``ustawa_pattern`` matches in *bill*."""
    matches = regex.findall(ustawa_pattern, bill)
    return len(matches)

In [819]:
# Task 3 driver: print the total match count over every file in ./data.
count_all_bill_word("./data")

25094
