In [32]:
import glob
import json
import os
import xml.etree.ElementTree as et
from collections import Counter

import numpy as np
import requests
from sklearn.naive_bayes import MultinomialNB, CategoricalNB, BernoulliNB, GaussianNB
from tqdm.auto import tqdm, trange

## generate XMLs with KRNNT tagger (localhost:9003)

In [8]:
def tag_texts(dirname, url='http://localhost:9003?output_format=XCES'):
    """Tag every ``*.txt`` file in ``dirname`` and store the response as ``.xml``.

    Each text file is read as UTF-8 and POSTed to the tagger endpoint; the
    response body is written to a sibling file that has the same stem and an
    ``.xml`` extension.

    Args:
        dirname: Directory containing the ``.txt`` files to tag.
        url: Tagger endpoint. Defaults to the address this notebook used
            originally.

    Raises:
        requests.HTTPError: If the tagger replies with an error status, so
            that an error page is never saved as if it were tagged XML.
    """
    for filename in tqdm(glob.glob(os.path.join(dirname, '*.txt'))):
        with open(filename, 'r', encoding='utf-8') as f:
            payload = f.read().encode('utf-8')
        r = requests.post(url, data=payload)
        r.raise_for_status()

        # Swap only the real extension. A plain str.replace would also touch
        # '.txt' occurring elsewhere in the path, and would leave the name
        # unchanged (overwriting the input) for e.g. an upper-case '.TXT'.
        xml_filename = os.path.splitext(filename)[0] + '.xml'
        with open(xml_filename, 'w', encoding='utf-8') as f:
            f.write(r.text)
            
#tag_texts('wiki_test_34_categories_data')
#tag_texts('wiki_train_34_categories_data')

## load the mapping from flexeme tags to word classes


In [9]:
# Build the lookup used by load_dataset: CSV column 1 (the tag) -> CSV column 3
# (the word class).
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the actual encoding of fleksemy.csv and make it explicit.
wordclass_dict = {}
with open('fleksemy.csv', 'r') as f:
    # Iterate lazily instead of materialising every line with readlines().
    for line in f:
        if not line.strip():
            # A blank line (e.g. a trailing newline at EOF) has no columns
            # and would otherwise raise IndexError below.
            continue
        fields = line.split(',')
        wordclass_dict[fields[1]] = fields[3].replace('\n', '')

## load xmls into one dataset

In [10]:
def load_dataset(dirname):
    """Collect disambiguated lemmas from the XML files in ``dirname``, per word class.

    For every ``*.xml`` file, each ``lex`` element marked ``disamb="1"``
    contributes its ``base`` text; the part of its ``ctag`` before the first
    ``:`` is translated to a word class through the module-level
    ``wordclass_dict``. The document category is the part of the file name
    before the first underscore.

    Args:
        dirname: Directory containing the tagged ``.xml`` files.

    Returns:
        A dict with the keys ``'rzeczownik'``, ``'czasownik'`` and
        ``'przymiotnik'``. Each value is a list holding one
        ``(category, lemmas)`` tuple per file; all three lists follow the same
        file order.

    Raises:
        KeyError: If a tag prefix is missing from ``wordclass_dict``.
    """
    datasets = {'rzeczownik': [], 'czasownik': [], 'przymiotnik': []}
    # Sorted so that document order is reproducible across runs and platforms.
    filenames = sorted(glob.glob(os.path.join(dirname, '*.xml')))
    for filename in tqdm(filenames):
        root = et.parse(filename).getroot()
        lex_nodes = (root.findall('chunk/sentence/tok/lex')
                     + root.findall('chunkList/chunk/chunk/tok/lex'))

        tags_list = []
        for lex in lex_nodes:
            if lex.get('disamb') != '1':
                continue
            base = lex.find('base').text
            tag = lex.find('ctag').text.split(':')[0]
            tags_list.append((base, wordclass_dict[tag]))

        # basename() works with either path separator and with nested
        # directories, unlike splitting the path on a literal backslash.
        category = os.path.basename(filename).split('_')[0]
        for key, documents in datasets.items():
            documents.append(
                (category, [base for base, wordclass in tags_list if wordclass == key])
            )
    return datasets
        

In [11]:
# Parse the tagged XML files of both splits into per-word-class lists of
# (category, lemmas) tuples.
train_ds = load_dataset('wiki_train_34_categories_data')
test_ds = load_dataset('wiki_test_34_categories_data')

HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2953.0), HTML(value='')))




## trivial bag of words and NaiveBayes classification

In [48]:
dict_size = 5000


def _binary_bow(documents, vocabulary):
    """Return a (documents x vocabulary) 0./1. array of word presence."""
    rows = []
    for _, words in tqdm(documents):
        # Set membership is O(1); testing against the raw list would rescan
        # the whole document for every vocabulary word.
        present = set(words)
        rows.append([1. if word in present else 0. for word in vocabulary])
    return np.array(rows)


# For each word class, build binary bag-of-words features over the dict_size
# most frequent training words, and integer-encode the document categories.
bow = {}
for key in train_ds:
    print(key)

    print('building dictionary')
    word_counts = Counter()
    for _, words in tqdm(train_ds[key]):
        word_counts.update(words)

    # dict reducing - only N most frequent words
    dictionary = word_counts.most_common(dict_size)
    print(dictionary[:10])
    dictionary = dict(dictionary)
    print('dictionary length: ', len(dictionary))

    print('defining datasets as BoW')
    train_categories = [category for category, _ in train_ds[key]]
    test_categories = [category for category, _ in test_ds[key]]
    # Encode both splits against ONE sorted class list. Running np.unique
    # separately per split gives matching ids only when both splits happen to
    # contain exactly the same set of categories.
    classes = np.unique(train_categories + test_categories)

    train_X = _binary_bow(train_ds[key], dictionary)
    train_y = np.searchsorted(classes, train_categories)

    test_X = _binary_bow(test_ds[key], dictionary)
    test_y = np.searchsorted(classes, test_categories)

    print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
    bow[key] = (train_X, train_y, test_X, test_y)

rzeczownik
building dictionary


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))


[('rok', 18301), ('miejsce', 4473), ('to', 3928), ('czas', 3555), ('Polska', 3000), ('bibliografia', 2873), ('świat', 2821), ('The', 2816), ('samolot', 2810), ('co', 2726)]
dictionary length:  5000
defining datasets as BoW


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2953.0), HTML(value='')))


(6886, 5000) (6886,) (2953, 5000) (2953,)
czasownik
building dictionary


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))


[('być', 30989), ('zostać', 9332), ('mieć', 6076), ('móc', 3425), ('występować', 1981), ('znajdować', 1710), ('prowadzić', 1510), ('posiadać', 1363), ('należeć', 1322), ('stosować', 1295)]
dictionary length:  5000
defining datasets as BoW


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2953.0), HTML(value='')))


(6886, 5000) (6886,) (2953, 5000) (2953,)
przymiotnik
building dictionary


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))


[('który', 11408), ('ten', 10431), ('1', 6356), ('pierwszy', 5036), ('polski', 4856), ('swój', 3878), ('inny', 3543), ('jeden', 3542), ('duży', 2820), ('zewnętrzny', 2762)]
dictionary length:  5000
defining datasets as BoW


HBox(children=(FloatProgress(value=0.0, max=6886.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2953.0), HTML(value='')))


(6886, 5000) (6886,) (2953, 5000) (2953,)


In [52]:
# Fit one multinomial Naive Bayes model per word class on its bag-of-words
# features and print the accuracy on the training and on the test split.
for key in bow:
    train_X, train_y, test_X, test_y = bow[key]
    cnb = MultinomialNB().fit(train_X, train_y)  # fit() returns the estimator
    train_score = cnb.score(train_X, train_y)
    test_score = cnb.score(test_X, test_y)
    print(key, train_score, test_score)

rzeczownik 0.9215800174266628 0.8614967829326109
czasownik 0.7468777229160616 0.5980358956992888
przymiotnik 0.8595701423177462 0.7500846596681341
