# Przetwarzanie języka naturalnego – lab4
## Mateusz Kocot

In [1]:
import os
from collections import defaultdict, Counter
from functools import reduce
import math
from joblib import Parallel, delayed
import time
import requests
import pickle

import numpy as np
import pandas as pd

import regex as re
from spacy.lang.pl import Polish as PolishSpacy

In [2]:
N_JOBS = 8

DATA_DIR = '../ustawy'

# Load data

In [3]:
dataset = {}

for file_name in os.listdir(DATA_DIR):
    with open(f'{DATA_DIR}/{file_name}', 'r', encoding='UTF-8') as file:
        bill = file.read().lower()
    dataset[file_name] = bill
    
len(dataset)

1179

# Task 1

In [4]:
nlp_spacy = PolishSpacy()
tokenizer = nlp_spacy.tokenizer

In [5]:
def tokenize(name, text):
    return name, list(map(str, tokenizer(text)))

def tokenize_collection(collection):
    return sum(([tokenize(name, text)] for name, text in collection), [])
    
start = time.time()
bill_items = list(dataset.items())
tokenized_dataset = Parallel(n_jobs=N_JOBS)(delayed(tokenize_collection)(bill_items[i:i+100]) for i in range(0, len(bill_items), 100))
tokenized_dataset = sum(tokenized_dataset, [])
tokenized_dataset = dict(tokenized_dataset)
print(f'Elapsed {time.time() - start:0.2f} s')

Elapsed 21.33 s


# Task 2

In [6]:
def count_ngrams(text, n=2):
    ngram_dict = defaultdict(int)
    for i in range(len(text) - n + 1):
        ngram = tuple(text[i:i+n]) if n > 1 else text[i]
        ngram_dict[ngram] += 1
    return Counter(ngram_dict)

count_ngrams(['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'quick', 'brown', 'fox', '.'])

Counter({('the', 'quick'): 2,
         ('quick', 'brown'): 2,
         ('brown', 'fox'): 2,
         ('fox', 'jumps'): 1,
         ('jumps', 'over'): 1,
         ('over', 'the'): 1,
         ('fox', '.'): 1})

In [7]:
def count_ngrams_collection(collection, n=2):
    return sum([count_ngrams(bill, n=n) for bill in collection], Counter())

def compute_ngram_counts(dataset, n=2):
    start = time.time()
    bills = list(dataset.values())
    ngram_counts = Parallel(n_jobs=N_JOBS)(delayed(count_ngrams_collection)(bills[i:i+100], n=n) for i in range(0, len(bills), 100))
    ngram_counts = sum(ngram_counts, Counter())
    print(f'Elapsed {time.time() - start:0.2f} s')
    return ngram_counts

bigram_counts = compute_ngram_counts(tokenized_dataset)

bigram_counts.most_common()[:10]

Elapsed 14.79 s


[(('art', '.'), 83778),
 (('ust', '.'), 53552),
 (('.', '\n'), 49741),
 (('poz', '.'), 45198),
 ((',', 'poz'), 39655),
 (('-', '-'), 36542),
 (('r', '.'), 33010),
 (('w', 'art'), 30170),
 (('.', '1'), 29715),
 ((',', 'o'), 28739),
 (('mowa', 'w'), 27649),
 (('w', 'ust'), 22238),
 ((',', 'w'), 21526),
 (('2', '.'), 21274),
 (('1', '.'), 21111)]

# Task 3

In [8]:
def fix_ngram_counts(ngram_counts, lemmatized=False):
    fixed_bigram_counts = Counter()
    for ngram, count in ngram_counts.items():
        # if all tokens contain only letters
        if all([not re.match(r'.*[^\p{L}]', token if not lemmatized else token[0]) for token in ngram]):
            fixed_bigram_counts[ngram] = count
    return fixed_bigram_counts

fixed_bigram_counts = fix_ngram_counts(bigram_counts)
            
fixed_bigram_counts.most_common()[:10]

[(('w', 'art'), 30170),
 (('mowa', 'w'), 27649),
 (('w', 'ust'), 22238),
 (('których', 'mowa'), 12973),
 (('o', 'których'), 12604),
 (('otrzymuje', 'brzmienie'), 9168),
 (('z', 'dnia'), 8989),
 (('którym', 'mowa'), 8689),
 (('o', 'którym'), 8525),
 (('do', 'spraw'), 8215)]

# Task 4

In [9]:
def compute_token_counts(dataset, lemmatized=False):
    token_counts = sum([count_ngrams(bill, n=1) for name, bill in dataset.items()], Counter())
    for token in list(token_counts.keys()):
        # if token contains any non-letter character
        if re.match(r'.*[^\p{L}]', token if not lemmatized else token[0]): 
            del token_counts[token]
    return token_counts

token_counts = compute_token_counts(tokenized_dataset)

token_counts.most_common()[:10]

[('w', 201200),
 ('i', 90006),
 ('art', 83804),
 ('z', 82438),
 ('o', 64776),
 ('do', 60732),
 ('ust', 53636),
 ('na', 50643),
 ('się', 45886),
 ('lub', 45800)]

In [10]:
dataset_size = sum(token_counts.values())
dataset_size

3566806

In [11]:
def count_pmi(dataset_size, ngram_count, *token_counts):
    frac = ngram_count / math.prod(token_counts)
    for _ in range(len(token_counts) - 1):
        frac *= dataset_size # multiply by the dataset size since we are dealing with probabilities
    return np.log(frac)

count_pmi(50_000_952, 1159, 1938, 1311)

10.034908170336502

In [12]:
def compute_ngrams_pmi(fixed_ngram_counts, token_counts):
    ngrams_pmi = {}
    for ngram, ngram_count in fixed_ngram_counts.items():
        single_token_counts = [token_counts[token] for token in ngram]
        pmi = count_pmi(dataset_size, ngram_count, *single_token_counts)
        ngrams_pmi[ngram] = pmi
        
    return ngrams_pmi

bigrams_pmi = compute_ngrams_pmi(fixed_bigram_counts, token_counts)
bigrams_pmi = Counter(bigrams_pmi)

# Task 5

In [13]:
bigrams_pmi.most_common()[:10]

[(('kołowe', 'jednoosiowe'), 15.087181075421553),
 (('zbrojeń', 'żelbeto'), 15.087181075421553),
 (('prefabrykatów', 'wnętrzowe'), 15.087181075421553),
 (('gołe', 'aluminiowe'), 15.087181075421553),
 (('polistyrenu', 'spienionego'), 15.087181075421553),
 (('objaśnieniem', 'figur'), 15.087181075421553),
 (('wkładzie', 'wnoszonym'), 15.087181075421553),
 (('doktorem', 'habilitowanym'), 15.087181075421553),
 (('losy', 'loteryjne'), 15.087181075421553),
 (('ugaszone', 'zapałki'), 15.087181075421553)]

# Task 6

In [14]:
filtered_bigrams_pmi = Counter(dict(filter(lambda x: fixed_bigram_counts[x[0]] >= 5, bigrams_pmi.items())))

filtered_bigrams_pmi.most_common()[:10]

[(('świeckie', 'przygotowujące'), 13.477743162987451),
 (('klęskami', 'żywiołowymi'), 13.477743162987451),
 (('ręcznego', 'miotacza'), 13.477743162987451),
 (('stajnią', 'wyścigową'), 13.477743162987451),
 (('otworami', 'wiertniczymi'), 13.477743162987451),
 (('obcowania', 'płciowego'), 13.477743162987451),
 (('młyny', 'kulowe'), 13.477743162987451),
 (('młynki', 'młotkowe'), 13.477743162987451),
 (('zaszkodzić', 'wynikom'), 13.477743162987451),
 (('grzegorz', 'schetyna'), 13.477743162987451)]

# Task 7

I used https://hub.docker.com/r/djstrong/krnnt/ instead of https://hub.docker.com/r/djstrong/krnnt2. The `output_format` option does not work in the latter one.

In [50]:
print(requests.post('http://localhost:9003/?output_format=jsonl', data='ala miała kota, ale już nie ma.'.encode('utf-8')).json())

[[['ala', 'Ala', 'subst:sg:nom:f'], ['miała', 'mieć', 'praet:sg:f:imperf'], ['kota', 'kot', 'subst:sg:acc:m2'], [',', ',', 'interp'], ['ale', 'ale', 'conj'], ['już', 'już', 'qub'], ['nie', 'nie', 'qub'], ['ma', 'mieć', 'fin:sg:ter:imperf'], ['.', '.', 'interp']]]


##### At first I lemmatize the bills and save the results using pickle. Elapsed time: 22 min

In [17]:
def lemmatize(text):
    lemmatized_text = sum(requests.post('http://localhost:9003/?output_format=jsonl', data=text.encode('utf-8')).json(), [])
    lemmatized_tokens = list(map(lambda x: (x[1], x[2].split(':', 1)[0]), lemmatized_text))
    return lemmatized_tokens

# start = time.time()
# for i, (name, bill) in enumerate(dataset.items()):
#     if i % 10 == 0:
#         print(f'Processing bill no {i}')
#     lemmatized_text = lemmatize(bill)
#     with open(f'lemmatized_corpus/{name[:-4]}.pkl', 'wb') as file:
#         pickle.dump(lemmatized_text, file)
# print(f'Elapsed {time.time() - start:0.2f} s')

##### Now I can load lemmatized bills.

In [18]:
lemmatized_dataset = {}
for name in dataset.keys():
    with open(f'lemmatized_corpus/{name[:-4]}.pkl', 'rb') as file:
        lemmatized_dataset[name] = pickle.load(file)

In [19]:
list(lemmatized_dataset.values())[0][:10]

[('Dziennik Ustaw', 'brev'),
 ('.', 'interp'),
 ('z', 'prep'),
 ('1993', 'adj'),
 ('rok', 'brev'),
 ('.', 'interp'),
 ('numer', 'brev'),
 ('129', 'num'),
 (',', 'interp'),
 ('pozycja', 'brev')]

# Task 8 and 9

In [20]:
lem_bigram_counts = compute_ngram_counts(lemmatized_dataset)
lem_bigram_counts.most_common()[:10]

Elapsed 11.54 s


[((('artykuł', 'brev'), ('.', 'interp')), 83776),
 ((('ustęp', 'brev'), ('.', 'interp')), 53390),
 ((('pozycja', 'brev'), ('.', 'interp')), 45221),
 (((',', 'interp'), ('pozycja', 'brev')), 43184),
 ((('.', 'interp'), ('1', 'adj')), 39945),
 ((('-', 'interp'), ('-', 'interp')), 36580),
 ((('rok', 'brev'), ('.', 'interp')), 33026),
 ((('w', 'prep'), ('artykuł', 'brev')), 32039),
 (((',', 'interp'), ('o', 'prep')), 29908),
 ((('o', 'prep'), ('który', 'adj')), 28655),
 ((('który', 'adj'), ('mowa', 'subst')), 28538),
 ((('mowa', 'subst'), ('w', 'prep')), 28467),
 ((('.', 'interp'), ('2', 'adj')), 26032),
 ((('w', 'prep'), ('ustęp', 'brev')), 23554),
 ((('.', 'interp'), ('artykuł', 'brev')), 22921)]

In [21]:
# Remove bigrams with tokens conatining non-alphanumeric characters
fixed_lem_bigram_counts = fix_ngram_counts(lem_bigram_counts, lemmatized=True)
fixed_lem_bigram_counts.most_common()[:10]

[((('w', 'prep'), ('artykuł', 'brev')), 32039),
 ((('o', 'prep'), ('który', 'adj')), 28655),
 ((('który', 'adj'), ('mowa', 'subst')), 28538),
 ((('mowa', 'subst'), ('w', 'prep')), 28467),
 ((('w', 'prep'), ('ustęp', 'brev')), 23554),
 ((('z', 'prep'), ('dzień', 'subst')), 11360),
 ((('otrzymywać', 'fin'), ('brzmienie', 'subst')), 10529),
 ((('określić', 'ppas'), ('w', 'prep')), 10019),
 ((('do', 'prep'), ('sprawa', 'subst')), 8697),
 ((('ustawa', 'subst'), ('z', 'prep')), 8625)]

# Task 10

In [22]:
lem_token_counts = compute_token_counts(lemmatized_dataset, lemmatized=True)
lem_token_counts.most_common()[:10]

[(('w', 'prep'), 202659),
 (('i', 'conj'), 90025),
 (('z', 'prep'), 87989),
 (('artykuł', 'brev'), 83792),
 (('o', 'prep'), 64690),
 (('do', 'prep'), 60757),
 (('ustęp', 'brev'), 53449),
 (('na', 'prep'), 50647),
 (('który', 'adj'), 49383),
 (('się', 'qub'), 45888)]

In [23]:
lem_bigrams_pmi = compute_ngrams_pmi(fixed_lem_bigram_counts, lem_token_counts)
lem_bigrams_pmi = Counter(lem_bigrams_pmi)

In [24]:
print('Before filtering:')
lem_bigrams_pmi.most_common()[:10]

Before filtering:


[((('tornister', 'subst'), ('nieskórzany', 'adj')), 15.087181075421553),
 ((('cji', 'xxx'), ('wadociągowych', 'adj')), 15.087181075421553),
 ((('zbrojenia', 'subst'), ('żelbeto', 'adja')), 15.087181075421553),
 ((('reduktor', 'subst'), ('membranowy', 'adj')), 15.087181075421553),
 ((('prefabrykat', 'subst'), ('wnętrzowy', 'adj')), 15.087181075421553),
 ((('polistyren', 'subst'), ('spienić', 'ppas')), 15.087181075421553),
 ((('UW', 'subst'), ('zględnieniu', 'subst')), 15.087181075421553),
 ((('któ', 'adj'), ('rych', 'adj')), 15.087181075421553),
 ((('zaniedbać', 'ppas'), ('wychowawczo', 'adv')), 15.087181075421553),
 ((('english', 'subst'), ('language', 'xxx')), 15.087181075421553)]

In [25]:
filtered_lem_bigrams_pmi = Counter(dict(filter(lambda x: fixed_lem_bigram_counts[x[0]] >= 5, lem_bigrams_pmi.items())))

print('After filtering:')
filtered_lem_bigrams_pmi.most_common()[:10]

After filtering:


[((('młynek', 'subst'), ('młotkowy', 'adj')), 13.477743162987451),
 ((('Grzegorz', 'subst'), ('schetyna', 'subst')), 13.477743162987451),
 ((('łańcuchowy', 'adj'), ('rozszczepienie', 'subst')), 13.477743162987451),
 ((('pasta', 'subst'), ('emulsyjny', 'adj')), 13.295421606193496),
 ((('chrom', 'subst'), ('sześciowartościowy', 'adj')), 13.295421606193496),
 ((('Adam', 'subst'), ('Mickiewicz', 'subst')), 13.295421606193496),
 ((('młyn', 'subst'), ('kulowy', 'adj')), 13.14127092636624),
 ((('środa', 'subst'), ('wlkp', 'brev')), 13.14127092636624),
 ((('Piotrków', 'subst'), ('trybunalski', 'adj')), 13.14127092636624),
 ((('przeponowy', 'adj'), ('rurowy', 'adj')), 13.007739533741717)]

# Task 11 and 12

I could not come up with any reasonable method for counting the PMI values of trigrams which would use the bigram values. I mean, we could multiply the PMIs of bigram_1 (token_1, token_2) and bigram_2 (token_2, token_3), then divide it by the bigram_1 and bigram_2 counts, multiply by the number of token_2 occurrences, and finally multiply by the number of trigram occurrences. The only thing we would gain is not having to know the number of occurrences of token_1 and token_3, however.

## Tokenized dataset

In [26]:
trigram_counts = compute_ngram_counts(tokenized_dataset, n=3)
fixed_trigram_counts = fix_ngram_counts(trigram_counts)

print('Filtered most common trigrams:')
fixed_trigram_counts.most_common()[:10]

Elapsed 25.98 s
Filtered most common trigrams


[(('których', 'mowa', 'w'), 12506),
 (('mowa', 'w', 'ust'), 12388),
 (('o', 'których', 'mowa'), 11714),
 (('mowa', 'w', 'art'), 10995),
 (('którym', 'mowa', 'w'), 8429),
 (('o', 'którym', 'mowa'), 8040),
 (('której', 'mowa', 'w'), 5020),
 (('o', 'której', 'mowa'), 4731),
 (('właściwy', 'do', 'spraw'), 4434),
 (('minister', 'właściwy', 'do'), 4104)]

In [27]:
trigrams_pmi = compute_ngrams_pmi(fixed_trigram_counts, token_counts)
trigrams_pmi = Counter(trigrams_pmi)

filtered_trigrams_pmi = Counter(dict(filter(lambda x: fixed_trigram_counts[x[0]] >= 5, trigrams_pmi.items())))

print('Top 10 PMI after filtering:')
filtered_trigrams_pmi.most_common()[:10]

Top 10 PMI after filtering:


[(('profilem', 'zaufanym', 'epuap'), 24.964420728246495),
 (('finałowego', 'turnieju', 'mistrzostw'), 24.700691896817204),
 (('przedwczesnego', 'wyrębu', 'drzewostanu'), 24.58039454219411),
 (('potwierdzonym', 'profilem', 'zaufanym'), 24.49791972955196),
 (('piłce', 'nożnej', 'uefa'), 24.457334449436882),
 (('cienką', 'sierścią', 'zwierzęcą'), 24.215420795440682),
 (('szybkiemu', 'postępowi', 'technicznemu'), 24.170475083736566),
 (('turnieju', 'mistrzostw', 'europy'), 24.170063645755036),
 (('grożącą', 'jemu', 'samemu'), 24.003079177159286),
 (('wypalonym', 'paliwem', 'jądrowym'), 23.951785882771734)]

## Lemmatized dataset

In [28]:
lem_trigram_counts = compute_ngram_counts(lemmatized_dataset, n=3)
fixed_lem_trigram_counts = fix_ngram_counts(lem_trigram_counts, lemmatized=True)

print('Filtered most common trigrams:')
fixed_lem_trigram_counts.most_common()[:10]

Elapsed 23.13 s
Filtered most common trigrams


[((('o', 'prep'), ('który', 'adj'), ('mowa', 'subst')), 28534),
 ((('który', 'adj'), ('mowa', 'subst'), ('w', 'prep')), 28436),
 ((('mowa', 'subst'), ('w', 'prep'), ('ustęp', 'brev')), 13471),
 ((('mowa', 'subst'), ('w', 'prep'), ('artykuł', 'brev')), 12308),
 ((('ustawa', 'subst'), ('z', 'prep'), ('dzień', 'subst')), 8589),
 ((('właściwy', 'adj'), ('do', 'prep'), ('sprawa', 'subst')), 7945),
 ((('minister', 'subst'), ('właściwy', 'adj'), ('do', 'prep')), 7888),
 ((('w', 'prep'), ('droga', 'subst'), ('rozporządzenie', 'subst')), 4737),
 ((('zastępować', 'fin'), ('się', 'qub'), ('wyraz', 'subst')), 3653),
 ((('w', 'prep'), ('ustawa', 'subst'), ('z', 'prep')), 3646)]

In [29]:
lem_trigrams_pmi = compute_ngrams_pmi(fixed_lem_trigram_counts, lem_token_counts)
lem_trigrams_pmi = Counter(lem_trigrams_pmi)

filtered_lem_trigrams_pmi = Counter(dict(filter(lambda x: fixed_lem_trigram_counts[x[0]] >= 5, lem_trigrams_pmi.items())))

print('Top 10 PMI after filtering:')
filtered_lem_trigrams_pmi.most_common()[:10]

Top 10 PMI after filtering:


[((('porcelanowy', 'adj'), ('młyn', 'subst'), ('kulowy', 'adj')),
  25.925866908793743),
 ((('wymiennik', 'subst'), ('przeponowy', 'adj'), ('rurowy', 'adj')),
  25.322331886923486),
 ((('reakcja', 'subst'), ('łańcuchowy', 'adj'), ('rozszczepienie', 'subst')),
  24.927338078682617),
 ((('finałowy', 'adj'), ('turniej', 'subst'), ('mistrzostwa', 'subst')),
  24.633098605684676),
 ((('piłka', 'subst'), ('nożny', 'adj'), ('UEFA', 'subst')),
  24.10316263571627),
 ((('turniej', 'subst'), ('mistrzostwa', 'subst'), ('europ', 'subst')),
  23.844641245320407),
 ((('przedwczesny', 'adj'), ('wyrąb', 'subst'), ('drzewostan', 'subst')),
  23.692172651676135),
 ((('mecz', 'subst'), ('piłka', 'subst'), ('nożny', 'adj')),
  23.592337011950278),
 ((('profil', 'subst'), ('zaufany', 'adj'), ('epuap', 'subst')),
  23.531875349475847),
 ((('milion', 'brev'), ('dolar', 'subst'), ('USA', 'subst')),
  23.419758051355142)]

# Task 13

In [30]:
def tok_to_lem(tok_tuple):
    string = ' '.join(tok_tuple)
    return tuple(lemmatize(string))

def lem_to_tok(lem_tuple):
    return tuple((l[0].lower() for l in lem_tuple))

## Bigrams

### Top 10 tokenized vs lemmatized

In [101]:
tok, tok_pmi = tuple(zip(*filtered_bigrams_pmi.most_common()[:10]))
lem = [tok_to_lem(t) for t in tok]
lem_pmi = [lem_bigrams_pmi[l] for l in lem]

data = {
    'tokens': [str(t) for t in tok],
    'PMI (tokenized)': tok_pmi,
    'lemmatized tokens': [str(l) for l in lem],
    'PMI (lemmatized)': lem_pmi
}

pd.DataFrame(data)

Unnamed: 0,tokens,PMI (tokenized),lemmatized tokens,PMI (lemmatized)
0,"('świeckie', 'przygotowujące')",13.477743,"(('świecki', 'adj'), ('przygotowywać', 'pact'))",12.219282
1,"('klęskami', 'żywiołowymi')",13.477743,"(('klęska', 'subst'), ('żywiołowy', 'adj'))",10.306923
2,"('ręcznego', 'miotacza')",13.477743,"(('ręczny', 'adj'), ('miotacz', 'subst'))",11.398302
3,"('stajnią', 'wyścigową')",13.477743,"(('stajnia', 'subst'), ('wyścigowy', 'adj'))",10.920516
4,"('otworami', 'wiertniczymi')",13.477743,"(('otwór', 'subst'), ('wiertniczy', 'adj'))",11.098197
5,"('obcowania', 'płciowego')",13.477743,"(('obcowanie', 'subst'), ('płciowy', 'adj'))",12.448124
6,"('młyny', 'kulowe')",13.477743,"(('młyn', 'subst'), ('kulowy', 'adj'))",13.141271
7,"('młynki', 'młotkowe')",13.477743,"(('młynek', 'subst'), ('młotkowy', 'adj'))",13.477743
8,"('zaszkodzić', 'wynikom')",13.477743,"(('zaszkodzić', 'inf'), ('wynik', 'subst'))",7.275613
9,"('grzegorz', 'schetyna')",13.477743,"(('Grzegorz', 'subst'), ('schetyna', 'subst'))",13.477743


### Top 10 lemmatized vs tokenized

This way it doesn't make a lot of sense, because we lose the forms of the words, but... why not :)

In [81]:
lem, lem_pmi = tuple(zip(*filtered_lem_bigrams_pmi.most_common()[:10]))
tok = [lem_to_tok(l) for l in lem]
tok_pmi = [bigrams_pmi[t] for t in tok]

data = {
    'lemmatized tokens': [str(l) for l in lem],
    'PMI (lemmatized)': lem_pmi,
    'tokens': [str(t) for t in tok],
    'PMI (tokenized)': tok_pmi
}

pd.DataFrame(data)

Unnamed: 0,lemmatized tokens,PMI (lemmatized),tokens,PMI (tokenized)
0,"(('młynek', 'subst'), ('młotkowy', 'adj'))",13.477743,"('młynek', 'młotkowy')",0.0
1,"(('Grzegorz', 'subst'), ('schetyna', 'subst'))",13.477743,"('grzegorz', 'schetyna')",13.477743
2,"(('łańcuchowy', 'adj'), ('rozszczepienie', 'su...",13.477743,"('łańcuchowy', 'rozszczepienie')",0.0
3,"(('pasta', 'subst'), ('emulsyjny', 'adj'))",13.295422,"('pasta', 'emulsyjny')",0.0
4,"(('chrom', 'subst'), ('sześciowartościowy', 'a...",13.295422,"('chrom', 'sześciowartościowy')",0.0
5,"(('Adam', 'subst'), ('Mickiewicz', 'subst'))",13.295422,"('adam', 'mickiewicz')",0.0
6,"(('młyn', 'subst'), ('kulowy', 'adj'))",13.141271,"('młyn', 'kulowy')",0.0
7,"(('środa', 'subst'), ('wlkp', 'brev'))",13.141271,"('środa', 'wlkp')",13.477743
8,"(('Piotrków', 'subst'), ('trybunalski', 'adj'))",13.141271,"('piotrków', 'trybunalski')",13.141271
9,"(('przeponowy', 'adj'), ('rurowy', 'adj'))",13.00774,"('przeponowy', 'rurowy')",0.0


### Example

In [98]:
for a, b in bigrams_pmi:
    if (a.endswith('papier')):
        print(a, b)

papier wartościowy
papier oszczędnościowy
papier gazetowy
papier toaletowy
papier do


#### Lemmatized

In [86]:
lem_bigrams_pmi[tok_to_lem(('papier', 'wartościowy'))]

7.903017378323081

In [94]:
lem_bigrams_pmi[tok_to_lem(('papier', 'oszczędnościowy'))]

4.852521019359115

In [95]:
lem_bigrams_pmi[tok_to_lem(('papier', 'gazetowy'))]

7.943563472717431

In [96]:
lem_bigrams_pmi[tok_to_lem(('papier', 'toaletowy'))]

6.238815380479005

#### Tokenized

In [89]:
bigrams_pmi[('papier', 'wartościowy')]

10.609844260943346

In [90]:
bigrams_pmi[('papier', 'oszczędnościowy')]

10.492061225286962

In [91]:
bigrams_pmi[('papier', 'gazetowy')]

11.590673513955071

In [92]:
bigrams_pmi[('papier', 'toaletowy')]

11.590673513955071

# Trigrams

### Top 10 tokenized vs lemmatized

In [33]:
tok, tok_pmi = tuple(zip(*filtered_trigrams_pmi.most_common()[:10]))
lem = [tok_to_lem(t) for t in tok]
lem_pmi = [lem_trigrams_pmi[l] for l in lem]

data = {
    'tokens': [str(t) for t in tok],
    'PMI (tokenized)': tok_pmi,
    'lemmatized tokens': [str(l) for l in lem],
    'PMI (lemmatized)': lem_pmi
}

pd.DataFrame(data)

Unnamed: 0,tokens,PMI (tokenized),lemmatized tokens,PMI (lemmatized)
0,"('profilem', 'zaufanym', 'epuap')",24.964421,"(('profil', 'subst'), ('zaufany', 'adj'), ('ep...",23.531875
1,"('finałowego', 'turnieju', 'mistrzostw')",24.700692,"(('finałowy', 'adj'), ('turniej', 'subst'), ('...",24.633099
2,"('przedwczesnego', 'wyrębu', 'drzewostanu')",24.580395,"(('przedwczesny', 'adj'), ('wyręb', 'subst'), ...",23.921014
3,"('potwierdzonym', 'profilem', 'zaufanym')",24.49792,"(('potwierdzić', 'ppas'), ('profil', 'subst'),...",20.937769
4,"('piłce', 'nożnej', 'uefa')",24.457334,"(('piłka', 'subst'), ('nożny', 'adj'), ('UEFA'...",24.103163
5,"('cienką', 'sierścią', 'zwierzęcą')",24.215421,"(('cienki', 'adj'), ('sierść', 'subst'), ('zwi...",19.407921
6,"('szybkiemu', 'postępowi', 'technicznemu')",24.170475,"(('szybki', 'adj'), ('postęp', 'subst'), ('tec...",16.570299
7,"('turnieju', 'mistrzostw', 'europy')",24.170064,"(('turniej', 'subst'), ('mistrzostwa', 'subst'...",23.844641
8,"('grożącą', 'jemu', 'samemu')",24.003079,"(('grozić', 'pact'), ('on', 'ppron3'), ('sam',...",10.169878
9,"('wypalonym', 'paliwem', 'jądrowym')",23.951786,"(('wypalić', 'ppas'), ('paliwo', 'subst'), ('j...",17.617158


### Top 10 lemmatized vs tokenized

In [34]:
lem, lem_pmi = tuple(zip(*filtered_lem_trigrams_pmi.most_common()[:10]))
tok = [lem_to_tok(l) for l in lem]
tok_pmi = [trigrams_pmi[t] for t in tok]

data = {
    'lemmatized tokens': [str(l) for l in lem],
    'PMI (lemmatized)': lem_pmi,
    'tokens': [str(t) for t in tok],
    'PMI (tokenized)': tok_pmi
}

pd.DataFrame(data)

Unnamed: 0,lemmatized tokens,PMI (lemmatized),tokens,PMI (tokenized)
0,"(('porcelanowy', 'adj'), ('młyn', 'subst'), ('...",25.925867,"('porcelanowy', 'młyn', 'kulowy')",0
1,"(('wymiennik', 'subst'), ('przeponowy', 'adj')...",25.322332,"('wymiennik', 'przeponowy', 'rurowy')",0
2,"(('reakcja', 'subst'), ('łańcuchowy', 'adj'), ...",24.927338,"('reakcja', 'łańcuchowy', 'rozszczepienie')",0
3,"(('finałowy', 'adj'), ('turniej', 'subst'), ('...",24.633099,"('finałowy', 'turniej', 'mistrzostwa')",0
4,"(('piłka', 'subst'), ('nożny', 'adj'), ('UEFA'...",24.103163,"('piłka', 'nożny', 'uefa')",0
5,"(('turniej', 'subst'), ('mistrzostwa', 'subst'...",23.844641,"('turniej', 'mistrzostwa', 'europ')",0
6,"(('przedwczesny', 'adj'), ('wyrąb', 'subst'), ...",23.692173,"('przedwczesny', 'wyrąb', 'drzewostan')",0
7,"(('mecz', 'subst'), ('piłka', 'subst'), ('nożn...",23.592337,"('mecz', 'piłka', 'nożny')",0
8,"(('profil', 'subst'), ('zaufany', 'adj'), ('ep...",23.531875,"('profil', 'zaufany', 'epuap')",0
9,"(('milion', 'brev'), ('dolar', 'subst'), ('USA...",23.419758,"('milion', 'dolar', 'usa')",0


# Task 14

#### Why do we have to filter the bigrams, rather than the token sequence?
This way we avoid creating bigrams composed of tokens from separate sentences, paragraphs, etc. Let's consider this example: "Ula ma psa. Pies ma Ulę". If we filter the token sequence before creating bigrams, we will end up with "psa pies".

#### Which method works better for the bigrams and which for the trigrams?
- Tokenization seems to be working a bit better for bigrams. Most often it performs similarly to lemmatization, but there are words which can make a lot of collocations, and they are more likely to be discovered using tokenization. For example, let's take the word "papier". It is used together with "wartościowy", "oszczędnościowy", "gazetowy" and "toaletowy". As found in the example a few cells above, these collocations are easier to discover using the tokenized corpus.
- On the other hand, lemmatization appears to be better for trigrams. The tokenization approach assigned high PMIs to a few phrases that aren't collocations, e.g. 'cienką sierścią zwierzęcą', 'grożącą jemu samemu'. This effect occurred much less often for bigrams. The lemmatized trigrams with high PMI are more reasonable.

#### What types of expressions are discovered by the methods?
- names, e.g. "Adam Mickiewicz"
- proper nouns, e.g. "Środa Wielkopolska"
- proper nouns with common descriptions, e.g "profil zaufany EPUAP", "mecz piłki nożnej"
- words that together form another meaning (noun + adj, noun + noun), e.g. "ręczny miotacz", "turniej mistrzostwa europy", "otwór wiertniczy"
- words that are just often used together (common adjective, etc.), e.g. "przedwczesny wyrąb drzewostanu", "potwierdzony profil zaufany", "szybki postęp techniczny"
- random phrases, e.g. "grożącą jemu samemu"

#### Can you devise a different type of filtering that would yield better results?
Apart from removing words containing non-letter characters, we could also remove some common words (mostly conjunctions), like 'w', 'do', 'a', 'ja', 'ty', 'i', 'dlatego', etc. It would remove phrases like "papier do" from the dataset, which would strengthen collocations like "papier wartościowy".