In [1]:
import glob
import json
import os
import xml.etree.ElementTree as et
from collections import Counter

import numpy as np
import requests
from sklearn.naive_bayes import MultinomialNB, CategoricalNB, BernoulliNB, GaussianNB
from tqdm.auto import tqdm, trange

## generate XMLs with KRNNT tagger (localhost:9003)

In [2]:
def tag_texts(dirname):
    """Tag every text file matching ``dirname/*/*.txt`` with the KRNNT service.

    Each file is read as UTF-8, POSTed to ``http://localhost:9003`` with XCES
    output requested, and the response text is written next to the input file
    with the extension changed to ``.xml``.

    Args:
        dirname: Directory whose immediate subdirectories contain the
            ``.txt`` files to tag.

    Raises:
        requests.HTTPError: If the tagger responds with an error status, so
            that an error page is never stored as if it were tagged output.
    """
    filenames = glob.glob(dirname+'/*/*.txt')
    for filename in tqdm(filenames):
        with open(filename, 'r', encoding='utf-8') as f:
            text = f.read()

        r = requests.post('http://localhost:9003?output_format=XCES', data=text.encode('utf-8'))
        r.raise_for_status()

        # Swap only the extension; str.replace would also rewrite a '.txt'
        # that happens to occur earlier in the path.
        xml_filename = os.path.splitext(filename)[0] + '.xml'
        with open(xml_filename, 'w', encoding='utf-8') as f:
            f.write(r.text)
            
#tag_texts('wiki_data')

## load the flexeme-to-word-class mapping


In [3]:
# Build the lookup from the second column of fleksemy.csv to its fourth column
# (used later to translate a tagger ctag into a word class).
# NOTE(review): no encoding is passed to open(), so the platform default is
# used - confirm it matches how fleksemy.csv was saved.
wordclass_dict = {}
with open('fleksemy.csv', 'r') as f:
    for line in f:
        # A blank (e.g. trailing) line has no columns to index; skip it
        # instead of failing with IndexError.
        if not line.strip():
            continue
        fields = line.rstrip('\n').split(',')
        wordclass_dict[fields[1]] = fields[3]

## load XMLs into one dataset

In [8]:
def load_dataset(dirname):
    """Collect disambiguated base forms from tagged XML files, per word class.

    Every ``*.xml`` file directly inside ``dirname`` is parsed. For each
    ``lex`` element marked ``disamb="1"``, the part of its ``base`` text
    before the first ``:`` is kept, and the part of its ``ctag`` text before
    the first ``:`` is translated to a word class through ``wordclass_dict``.

    Args:
        dirname: Directory containing the tagged ``.xml`` files.

    Returns:
        A dict with the keys ``'rzeczownik'``, ``'czasownik'`` and
        ``'przymiotnik'``. Each value is a list holding one
        ``(category, bases)`` tuple per file, where ``category`` is the part
        of the file name before the first underscore and ``bases`` is the
        list of base forms of that word class, in document order.

    Raises:
        KeyError: If a ctag is missing from ``wordclass_dict``.
    """
    datasets = {'rzeczownik':[], 'czasownik':[], 'przymiotnik':[]}
    filenames = glob.glob(dirname+"/*.xml")
    for filename in tqdm(filenames):
        # Bucket base forms by word class in a single pass over the tokens.
        bases_by_class = {key: [] for key in datasets}
        root = et.parse(filename).getroot()
        for x in root.findall('chunk/sentence/tok/lex') + root.findall('chunkList/chunk/chunk/tok/lex'):
            if x.get('disamb') == '1':
                base = x.find('base').text.split(':')[0]
                tag = x.find('ctag').text.split(':')[0]
                wordclass = wordclass_dict[tag]
                # Word classes other than the three tracked ones are dropped.
                if wordclass in bases_by_class:
                    bases_by_class[wordclass].append(base)

        # Take the category from the file name itself rather than splitting on
        # a backslash, which only worked for Windows-style paths containing
        # exactly one backslash.
        category = os.path.basename(filename).split('_')[0]
        for key in datasets:
            datasets[key].append((category, bases_by_class[key]))
    return datasets
        

In [9]:
# Parse the tagged XML files of each split into per-word-class datasets
# (see load_dataset for the structure of the returned dict).
train_ds = load_dataset('wiki_data/cmc_train')
test_ds = load_dataset('wiki_data/cmc_test')

HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2953.0), HTML(value='')))




## trivial bag-of-words and naive Bayes classification

In [10]:
# Number of most frequent training words kept as features for each word class.
dict_size = 5000


def _binary_bow(documents, vocab_index):
    """Encode (category, words) pairs as a 0/1 word-presence matrix.

    Row i has 1. in column vocab_index[word] for every vocabulary word that
    occurs in the i-th document, and 0. elsewhere.
    """
    matrix = np.zeros((len(documents), len(vocab_index)))
    for row, (_, words) in enumerate(tqdm(documents)):
        # Deduplicate once per document so each lookup is a hash probe rather
        # than a scan of the word list for every vocabulary entry.
        columns = [vocab_index[word] for word in set(words) if word in vocab_index]
        if columns:
            matrix[row, columns] = 1.
    return matrix


# bow[word_class] = (train_X, train_y, test_X, test_y)
bow = {}
for key in train_ds:
    print(key)

    print('building dictionary')
    dictionary = Counter()
    for _, words in tqdm(train_ds[key]):
        dictionary.update(words)

    # dict reducing - only N most frequent words
    dictionary = sorted(dictionary.items(), key = lambda x: x[1], reverse=True)[:dict_size]
    print(dictionary[:10])
    dictionary = dict(dictionary)
    print('dictionary length: ', len(dictionary))

    print('defining datasets as BoW')
    vocab_index = {word: column for column, word in enumerate(dictionary)}

    train_X = _binary_bow(train_ds[key], vocab_index)
    categories, train_y = np.unique(np.array([category for category, _ in train_ds[key]]), return_inverse=True)

    test_X = _binary_bow(test_ds[key], vocab_index)
    # Encode test labels with the ids assigned on the training split. A
    # separate np.unique over the test categories would number them
    # independently and mislabel rows whenever the two category sets differ.
    # Categories absent from training get -1, which no prediction can match.
    label_of = {category: label for label, category in enumerate(categories.tolist())}
    test_y = np.array([label_of.get(category, -1) for category, _ in test_ds[key]], dtype=np.intp)

    print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
    bow[key] = (train_X, train_y, test_X, test_y)

rzeczownik
building dictionary


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))


[('rok', 18302), ('miejsce', 4473), ('to', 3780), ('czas', 3560), ('Polska', 2944), ('bibliografia', 2873), ('samolot', 2812), ('świat', 2720), ('co', 2663), ('wersja', 2471)]
dictionary length:  5000
defining datasets as BoW


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2953.0), HTML(value='')))


(6886, 5000) (6886,) (2953, 5000) (2953,)
czasownik
building dictionary


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))


[('być', 30993), ('zostać', 9332), ('mieć', 4995), ('móc', 3430), ('występować', 1958), ('znajdować', 1853), ('prowadzić', 1557), ('posiadać', 1423), ('stosować', 1360), ('należeć', 1326)]
dictionary length:  5000
defining datasets as BoW


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2953.0), HTML(value='')))


(6886, 5000) (6886,) (2953, 5000) (2953,)
przymiotnik
building dictionary


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))


[('który', 11410), ('ten', 10605), ('pierwszy', 5041), ('polski', 4730), ('swój', 3886), ('jeden', 3559), ('inny', 3504), ('duży', 3004), ('zewnętrzny', 2762), ('nowy', 2725)]
dictionary length:  5000
defining datasets as BoW


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2953.0), HTML(value='')))


(6886, 5000) (6886,) (2953, 5000) (2953,)


In [11]:
# Fit one multinomial naive Bayes model per word class and print its accuracy
# on the training split followed by its accuracy on the test split.
for key in bow:
    train_X, train_y, test_X, test_y = bow[key]
    cnb = MultinomialNB().fit(train_X, train_y)
    train_score, test_score = (
        cnb.score(features, labels)
        for features, labels in ((train_X, train_y), (test_X, test_y))
    )
    print(key, train_score, test_score)

rzeczownik 0.9236131280859715 0.8645445309854385
czasownik 0.7492012779552716 0.5980358956992888
przymiotnik 0.8621841417368574 0.7660006772773451
