In [1]:
import numpy as np
import glob
import requests

In [2]:
# Directory holding the corpus files; every entry in it is listed, in sorted order.
PATH = '../lab1/ustawy/'
file_names = glob.glob(PATH + "*")
file_names.sort()
file_names[:5]

['../lab1/ustawy/1993_599.txt',
 '../lab1/ustawy/1993_602.txt',
 '../lab1/ustawy/1993_645.txt',
 '../lab1/ustawy/1993_646.txt',
 '../lab1/ustawy/1994_150.txt']

In [3]:
# Load the full text of every corpus file, in the same order as file_names.
law_acts = []
for path in file_names:
    with open(path, encoding="utf-8") as handle:
        contents = handle.read()
    law_acts.append(contents)

# Zadanie 2
Znalezienie tokenów z korpusu w postaci krotek (lemat zapisany małymi literami, kategoria morfosyntaktyczna)

In [4]:
%%time
# Send each act to the tagging service running locally and keep, for every
# interpretation line of the response, the pair (lower-cased lemma, main category).
# NOTE(review): only the first five acts are processed (law_acts[:5]) - presumably
# to keep the run short; widen the slice to analyse the whole corpus.
TAGGER_URL = 'http://localhost:9200'
tokens_per_act = []

for law_act in law_acts[:5]:
    response = requests.post(TAGGER_URL, data=law_act.encode('utf-8'))
    # Fail loudly rather than parse an HTTP error page as if it were tagger output.
    response.raise_for_status()

    tokens = []
    for line in response.content.decode('utf-8').split('\n'):
        line_words = line.split('\t')
        # Lines whose first tab-separated field is empty are the ones read here:
        # lemma at index 1, tag at index 2. Require all three fields so that a
        # short line cannot raise IndexError on line_words[2].
        if line_words[0] == '' and len(line_words) >= 3:
            # Only the part of the tag before the first ':' is kept.
            tokens.append((line_words[1].lower().strip(), line_words[2].split(':')[0]))
    tokens_per_act.append(tokens)

CPU times: user 91.6 ms, sys: 11.1 ms, total: 103 ms
Wall time: 27.1 s


In [5]:
# Peek at the first ten (lemma, category) tokens of the first act.
first_act_tokens = tokens_per_act[0]
first_act_tokens[:10]

[('dziennik', 'brev'),
 ('.', 'interp'),
 ('ustawa', 'brev'),
 ('.', 'interp'),
 ('z', 'prep'),
 ('1993', 'adj'),
 ('rok', 'brev'),
 ('.', 'interp'),
 ('numer', 'brev'),
 ('129', 'num')]

# Zadanie 3
Obliczenie licznika bigramów powstałych z opisanych wyżej tokenów

In [6]:
%%time
# Count every pair of adjacent tokens; pairs never span two different acts
# because each act's token list is walked separately.
bigrams_counter = {}
for act_tokens in tokens_per_act:
    # zip stops at the shorter sequence, so the last token is never used as a left element.
    for pair in zip(act_tokens, act_tokens[1:]):
        bigrams_counter[pair] = bigrams_counter.get(pair, 0) + 1

CPU times: user 23 ms, sys: 3.81 ms, total: 26.8 ms
Wall time: 26.9 ms


# Zadanie 4
Odfiltrowanie bigramów, w których któryś z tokenów zawiera znaki inne niż litery

In [7]:
%%time
# Keep only bigrams whose two lemmas consist solely of letters
# (str.isalpha is also False for an empty lemma).
new_bigrams_counter = {
    pair: occurrences
    for pair, occurrences in bigrams_counter.items()
    if pair[0][0].isalpha() and pair[1][0].isalpha()
}
bigrams_counter = new_bigrams_counter

CPU times: user 0 ns, sys: 6.14 ms, total: 6.14 ms
Wall time: 6.04 ms


# Zadanie 6
Obliczenie LLR dla znalezionych bigramów

In [8]:
# Total number of bigram occurrences that survived the filtering step.
total_number_of_bigrams = sum(bigrams_counter.values())
total_number_of_bigrams

8550

In [9]:
# Marginal counts gathered in a single pass over the bigram counter:
#   bigrams_starting_with_a_counter[t] - occurrences of bigrams whose first token is t
#   bigrams_ending_with_b_counter[t]   - occurrences of bigrams whose second token is t
bigrams_starting_with_a_counter = {}
bigrams_ending_with_b_counter = {}
for (first_token, second_token), occurrences in bigrams_counter.items():
    bigrams_starting_with_a_counter[first_token] = (
        bigrams_starting_with_a_counter.get(first_token, 0) + occurrences
    )
    bigrams_ending_with_b_counter[second_token] = (
        bigrams_ending_with_b_counter.get(second_token, 0) + occurrences
    )

In [10]:
def H(k):
    """Return sum(p * log(p)) over the cells of a table of counts.

    The counts are normalised by their total to obtain p, so the result is the
    negated Shannon entropy (natural logarithm) of that distribution.

    Parameters
    ----------
    k : array_like
        Non-negative counts of any shape; lists and tuples are accepted as
        well as NumPy arrays.

    Returns
    -------
    numpy.float64
        The sum of p * log(p) over all cells, with zero cells contributing 0.
        The result is nan when the counts sum to zero.
    """
    # Coerce to an array so that (k == 0) below is evaluated element-wise even
    # for plain Python sequences; with a list it would be the scalar False and
    # zero cells would produce 0 * log(0) == nan.
    k = np.asarray(k, dtype=float)
    N = np.sum(k)
    p = k / N
    # Adding 1 exactly where the count is 0 turns log(0) into log(1) == 0.
    return np.sum(p * np.log(p + (k == 0)))

In [11]:
%%time
# Log-likelihood ratio of every bigram (a, b), computed from the 2x2 contingency
# table "first token is / is not a" versus "second token is / is not b".
bigrams_llr = {}
for (a, b), a_and_b in bigrams_counter.items():
    starts_with_a = bigrams_starting_with_a_counter[a]
    ends_with_b = bigrams_ending_with_b_counter[b]

    a_without_b = starts_with_a - a_and_b
    b_but_not_a = ends_with_b - a_and_b
    # Inclusion-exclusion: a_and_b was subtracted twice, so add it back once.
    neither_a_nor_b = total_number_of_bigrams - starts_with_a - ends_with_b + a_and_b

    k = np.array([[a_and_b, b_but_not_a], [a_without_b, neither_a_nor_b]])
    row_sums = k.sum(axis=1)
    col_sums = k.sum(axis=0)
    bigrams_llr[(a, b)] = 2 * k.sum() * (H(k) - H(row_sums) - H(col_sums))

CPU times: user 704 ms, sys: 2 µs, total: 704 ms
Wall time: 708 ms


In [12]:
# Preview ten (bigram, LLR) pairs in insertion order.
llr_items = list(bigrams_llr.items())
llr_items[:10]

[((('ustawa', 'subst'), ('z', 'prep')), 104.33712266279504),
 ((('z', 'prep'), ('dzień', 'subst')), 95.19012075063563),
 ((('o', 'prep'), ('zmiana', 'subst')), 26.0815334089783),
 ((('zmiana', 'subst'), ('ustawa', 'subst')), 14.376997774075955),
 ((('ustawa', 'subst'), ('o', 'prep')), 26.271080105618438),
 ((('o', 'prep'), ('podatek', 'subst')), 51.49542125103479),
 ((('podatek', 'subst'), ('od', 'prep')), 88.87138877410224),
 ((('od', 'prep'), ('towar', 'subst')), 97.4817680254182),
 ((('towar', 'subst'), ('i', 'conj')), 177.92900056044124),
 ((('i', 'conj'), ('usługa', 'subst')), 187.30547222383495)]

# Zadanie 7
Wyznaczenie partycji bigramów zawierających tokeny z tymi samymi kategoriami

In [13]:
# Group bigrams by the pair of morphosyntactic categories of their two tokens.
#   partitions[(cat_a, cat_b)]      - list of (lemma_a, lemma_b) pairs in that group
#   partitions_size[(cat_a, cat_b)] - total number of occurrences in that group
partitions = {}
partitions_size = {}

for bigram, count in bigrams_counter.items():
    partition = (bigram[0][1], bigram[1][1])
    # Append in place: rebuilding the list with `+` on every iteration copies it
    # each time, which makes the loop quadratic in the size of a partition.
    partitions.setdefault(partition, []).append((bigram[0][0], bigram[1][0]))
    partitions_size[partition] = partitions_size.get(partition, 0) + count

# Zadanie 8
Wyznaczenie 10 najliczniejszych partycji

In [14]:
# Rank partitions by total occurrences (descending), breaking ties by the
# category pair itself, and keep the first ten.
ranked_partitions = sorted(partitions_size.items(), key=lambda entry: (-entry[1], entry[0]))
largest_partitions = ranked_partitions[:10]
largest_partitions

[(('prep', 'subst'), 1119),
 (('subst', 'adj'), 817),
 (('subst', 'subst'), 666),
 (('subst', 'prep'), 548),
 (('adj', 'subst'), 524),
 (('prep', 'brev'), 373),
 (('conj', 'subst'), 304),
 (('subst', 'conj'), 291),
 (('prep', 'adj'), 255),
 (('ppas', 'prep'), 225)]

# Zadanie 9
Wyznaczenie 5 bigramów z najwyższym LLR reprezentujących każdą z partycji

In [15]:
# For each of the largest partitions, collect up to five bigrams with the
# highest LLR as ((lemma_a, lemma_b), llr) entries.
sorted_bigrams = sorted(bigrams_llr.items(), key=lambda item: (-item[1], item[0]))
categories_representatives = {}
for partition, _size in largest_partitions:
    top_for_partition = []
    for ((first_lemma, first_cat), (second_lemma, second_cat)), score in sorted_bigrams:
        if first_cat != partition[0] or second_cat != partition[1]:
            continue
        top_for_partition.append(((first_lemma, second_lemma), score))
        if len(top_for_partition) == 5:
            break
    # A partition with no matching bigram gets no entry at all.
    if top_for_partition:
        categories_representatives[partition] = top_for_partition

In [16]:
# Print each partition followed by its representatives, one per line,
# with a blank line separating partitions.
for category_pair, top_bigrams in categories_representatives.items():
    print('Partition:', category_pair)
    print('Representatives:')
    for entry in top_bigrams:
        print(entry)
    print()

Partition: ('prep', 'subst')
Representatives:
(('z', 'wyjątek'), 148.38829747654103)
(('na', 'podstawa'), 147.05413756154223)
(('po', 'wyraz'), 143.62085761954634)
(('dla', 'dziecko'), 118.3874096133523)
(('na', 'cel'), 116.41876512284286)

Partition: ('subst', 'adj')
Representatives:
(('rok', 'podatkowy'), 328.8796640252817)
(('podatek', 'dochodowy'), 238.95807076934037)
(('urząd', 'skarbowy'), 218.63804117900764)
(('rachunek', 'bankowy'), 168.14153388393328)
(('działalność', 'gospodarczy'), 127.18188579529782)

Partition: ('subst', 'subst')
Representatives:
(('minister', 'finanse'), 200.23878120202173)
(('zwrot', 'różnica'), 172.69666317230906)
(('różnica', 'podatek'), 121.41265054404309)
(('droga', 'rozporządzenie'), 119.47678407979583)
(('sprzedaż', 'towar'), 81.86417754884829)

Partition: ('subst', 'prep')
Representatives:
(('mowa', 'w'), 399.21826588815907)
(('ustawa', 'z'), 104.33712266279504)
(('podatek', 'od'), 88.87138877410224)
(('przepis', 'o'), 71.28160909613385)
(('dzieck

# Wnioski

1. What types of bigrams have been found?

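Mostly pairs built around nouns. The ten largest partitions are led by preposition + noun (`prep`, `subst`: 1119 occurrences), noun + adjective (817), noun + noun (666), noun + preposition (548) and adjective + noun (524). Within them, the top-LLR bigrams range from fixed terminology (`podatek dochodowy`, `urząd skarbowy`) to fragments of longer legal formulas (`mowa w`, `ustawa z`). Note that these results come from the first five acts only.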

2. Which of the category-pairs indicate valuable multiword expressions? Do they have anything in common?

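Judging by the representatives listed above, `subst` + `adj` and `subst` + `subst` are the most valuable: they yield terms such as `rok podatkowy`, `podatek dochodowy`, `urząd skarbowy`, `rachunek bankowy` and `minister finanse`. What they have in common is that both are nominal phrases headed by a noun, i.e. names of concepts or institutions. Pairs containing a preposition mostly yield pieces of longer constructions (`mowa w`, `ustawa z`), although a few `prep` + `subst` bigrams such as `na podstawa` behave like fixed complex prepositions.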

3. Which signal: LLR score or syntactic category is more useful for determining genuine multiword expressions?

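Neither signal is sufficient alone, but on this sample the syntactic category is the more selective one. The highest LLR in the output shown belongs to `mowa w` (about 399), which is not a self-contained expression, while genuine terms such as `urząd skarbowy` (about 219) score lower. Using the category pair to choose candidates and LLR to rank bigrams inside a category works best.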

4. Can you describe a different use-case where the morphosyntactic category is useful for resolving a real-world problem?

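One example is full-text search and keyword extraction for an inflected language such as Polish: reducing words to lemmas lets a query match every inflected form, and keeping only nouns and adjectives (dropping prepositions, conjunctions and punctuation) gives a much cleaner set of index terms or candidate key phrases.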
