# Przetwarzanie języka naturalnego – lab4
## Mateusz Kocot

In [102]:
import os
from collections import defaultdict, Counter
from functools import reduce
import math

import numpy as np

import regex as re
from spacy.lang.pl import Polish as PolishSpacy

In [49]:
# Directory (relative path) whose files are read as the bill texts below.
DATA_DIR = '../ustawy'

# Load data

In [50]:
# Read the bills into memory: file name -> lower-cased file contents.
dataset = {}

# TODO: REMOVE the [:10] slice; it restricts loading to the first 10 directory entries.
for file_name in os.listdir(DATA_DIR)[:10]:
    bill_path = f'{DATA_DIR}/{file_name}'
    with open(bill_path, 'r', encoding='UTF-8') as bill_file:
        dataset[file_name] = bill_file.read().lower()

len(dataset)

10

# Task 1

In [51]:
# Instantiate spaCy's Polish language class (imported as PolishSpacy) and keep
# a handle to its tokenizer, which is used below to split the bills into tokens.
nlp_spacy = PolishSpacy()
tokenizer = nlp_spacy.tokenizer

In [71]:
# Tokenize every bill; each token object is converted to its string form so
# that later steps work on plain lists of strings keyed by file name.
tokenized_dataset = {
    name: [str(token) for token in tokenizer(bill)]
    for name, bill in dataset.items()
}

# Task 2

In [90]:
def count_ngrams(text, n=2):
    """Count the n-grams occurring in a sequence of tokens.

    Args:
        text: Sequence of tokens supporting ``len()``, indexing and slicing
            (e.g. a list of strings).
        n: Length of the n-grams to count; must be at least 1.

    Returns:
        A ``Counter`` mapping each n-gram to its number of occurrences.
        For ``n > 1`` the keys are tuples of ``n`` consecutive tokens; for
        ``n == 1`` the keys are the bare tokens themselves (not 1-tuples).
        The result is empty when ``text`` holds fewer than ``n`` tokens.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # Without this guard a non-positive n surfaces as an opaque IndexError.
        raise ValueError(f'n must be at least 1, got {n}')
    if n == 1:
        return Counter(text)
    # One window per valid start position; the range is empty for short inputs.
    return Counter(tuple(text[i:i + n]) for i in range(len(text) - n + 1))

count_ngrams(['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'quick', 'brown', 'fox', '.'])

Counter({('the', 'quick'): 2,
         ('quick', 'brown'): 2,
         ('brown', 'fox'): 2,
         ('fox', 'jumps'): 1,
         ('jumps', 'over'): 1,
         ('over', 'the'): 1,
         ('fox', '.'): 1})

In [91]:
# Aggregate the bigram counts of all bills into a single Counter.
# Updating one accumulator in place avoids building a brand-new Counter for
# every bill, which is what summing Counters with `+` does.
bigram_counts = Counter()
for tokens in tokenized_dataset.values():
    bigram_counts.update(count_ngrams(tokens))

bigram_counts.most_common(15)

[(('-', '-'), 814),
 (('art', '.'), 805),
 (('.', '\n'), 789),
 (('ust', '.'), 655),
 (('|', '\n'), 519),
 (('|', '   '), 402),
 (('   ', '|'), 400),
 (('\n', '|'), 391),
 (('.', '1'), 294),
 (('\n                                        \n                                    ',
   'art'),
  291),
 (('w', 'ust'), 269),
 (('.',
   '\n                                        \n                                    '),
  266),
 ((',', 'o'), 246),
 (('w', 'art'), 244),
 (('mowa', 'w'), 235)]

# Task 3

In [92]:
# Keep only the bigrams in which every token consists solely of letters.
fixed_bigram_counts = Counter()

for bigram, count in bigram_counts.items():
    # The pattern matches as soon as a token contains any non-letter character.
    if any(re.match(r'.*[^\p{L}]', token) for token in bigram):
        continue
    fixed_bigram_counts[bigram] = count

fixed_bigram_counts.most_common(10)

[(('w', 'ust'), 269),
 (('w', 'art'), 244),
 (('mowa', 'w'), 235),
 (('z', 'rejestracji'), 119),
 (('stosuje', 'się'), 106),
 (('o', 'którym'), 103),
 (('o', 'których'), 102),
 (('prawa', 'z'), 102),
 (('prawa', 'ochronnego'), 102),
 (('którym', 'mowa'), 101)]

# Task 4

In [100]:
# Aggregate the unigram (single-token) counts of all bills into one Counter.
# Updating one accumulator in place avoids building a brand-new Counter for
# every bill, which is what summing Counters with `+` does.
token_counts = Counter()
for tokens in tokenized_dataset.values():
    token_counts.update(count_ngrams(tokens, n=1))

# Drop every token that contains at least one non-letter character.
# Iterate over a snapshot of the keys because entries are deleted in the loop.
for token in list(token_counts):
    if re.match(r'.*[^\p{L}]', token):
        del token_counts[token]

token_counts.most_common(10)

[('w', 2072),
 ('z', 933),
 ('art', 806),
 ('i', 746),
 ('ust', 655),
 ('do', 646),
 ('o', 599),
 ('na', 580),
 ('się', 561),
 ('lub', 456)]

In [110]:
# Total number of token occurrences left in token_counts (the sum of all
# per-token counts).
dataset_size = sum(token_counts.values())
dataset_size

36768

In [108]:
def count_pmi(dataset_size, ngram_count, *token_counts):
    """Compute the pointwise mutual information of an n-gram from raw counts.

    The value returned is the natural logarithm of
    ``ngram_count * dataset_size ** (k - 1) / prod(token_counts)``,
    where ``k`` is the number of token counts passed in.

    Args:
        dataset_size: Total number of tokens used to normalise the counts.
        ngram_count: Number of occurrences of the whole n-gram.
        *token_counts: Number of occurrences of each individual token.

    Returns:
        The PMI value as produced by ``np.log``.
    """
    ratio = ngram_count / math.prod(token_counts)
    # Scale by dataset_size once for every token beyond the first.
    remaining = len(token_counts) - 1
    while remaining > 0:
        ratio *= dataset_size
        remaining -= 1
    return np.log(ratio)

count_pmi(50_000_952, 1159, 1938, 1311)

10.034908170336502