In [1]:
import nltk
import pandas as pd
import numpy as np
from parser import parse_xml_file
from stop_words import get_stop_words
from string_helper import replace_string_with_pattern, clear_empty_string, decode_html_chars
import majka
from ast import literal_eval

In [2]:
# Configuration: display options, tokenizer data, stop words, lemmatizer and
# the shared state used by the processing cells.
# DEBUG widens the pandas display limits here and is also consulted when the
# input file path is chosen.
DEBUG = True

if (DEBUG):
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.width', 1000)

# Download the NLTK 'punkt' tokenizer data only when the lookup fails.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Stop words from the stop_words package for language code 'cz'
stop_words = get_stop_words('cz')

# Custom stop words appended to the package list (short prepositions and
# conjunctions; the original note says they are missing from the package).
stop_words.extend(['a','v','vo','na','k','u','i','zo','z','so','s',])

# Czech lemmatizer Majka, loaded from the local word list.
# NOTE(review): the three `|=` lines below are immediately overridden by
# `morph.flags = 0`, so the lemmatizer ends up with no flags set. Confirm that
# this is intended and that the `|=` lines are only kept as a reference.
morph = majka.Majka('../wordlists/majka.w-lt')
morph.flags |= majka.ADD_DIACRITICS  # Find word forms with diacritics
morph.flags |= majka.DISALLOW_LOWERCASE  # Do not look up lowercase variants (per original note; confirm in Majka docs)
morph.flags |= majka.IGNORE_CASE  # Ignore the word case altogether
morph.flags = 0  # Unset all flags (overrides the three lines above)
morph.tags = False  # Return just the lemma
morph.first_only = True  # Return only the first entry

# Histogram: word -> number of occurrences for which morph.find() found nothing.
unlemmatized_words_histogram = {}
# Minimum histogram count for an unlemmatizable word to be kept by lemmatize().
# The misspelled name ("treshold") is referenced elsewhere, so it is kept as is.
treshold = 2

In [3]:
# Show the final stop-word list (package words plus the custom additions).
print(stop_words)

['ačkoli', 'ahoj', 'ale', 'anebo', 'ano', 'asi', 'aspoň', 'během', 'bez', 'beze', 'blízko', 'bohužel', 'brzo', 'bude', 'budeme', 'budeš', 'budete', 'budou', 'budu', 'byl', 'byla', 'byli', 'bylo', 'byly', 'bys', 'čau', 'chce', 'chceme', 'chceš', 'chcete', 'chci', 'chtějí', 'chtít', "chut'", 'chuti', 'co', 'čtrnáct', 'čtyři', 'dál', 'dále', 'daleko', 'děkovat', 'děkujeme', 'děkuji', 'den', 'deset', 'devatenáct', 'devět', 'do', 'dobrý', 'docela', 'dva', 'dvacet', 'dvanáct', 'dvě', 'hodně', 'já', 'jak', 'jde', 'je', 'jeden', 'jedenáct', 'jedna', 'jedno', 'jednou', 'jedou', 'jeho', 'její', 'jejich', 'jemu', 'jen', 'jenom', 'ještě', 'jestli', 'jestliže', 'jí', 'jich', 'jím', 'jimi', 'jinak', 'jsem', 'jsi', 'jsme', 'jsou', 'jste', 'kam', 'kde', 'kdo', 'kdy', 'když', 'ke', 'kolik', 'kromě', 'která', 'které', 'kteří', 'který', 'kvůli', 'má', 'mají', 'málo', 'mám', 'máme', 'máš', 'máte', 'mé', 'mě', 'mezi', 'mí', 'mít', 'mně', 'mnou', 'moc', 'mohl', 'mohou', 'moje', 'moji', 'možná', 'můj', 'musí

In [4]:
# Input and output locations for each stage of the pipeline.
if DEBUG:
    # Small sample input for quick runs.
    xml_file_path = '../data/test.xml'
else:
    # Full input (presumably the Czech Wikipedia articles dump, judging by the name).
    xml_file_path = '../data/cswiki-latest-pages-articles.xml'

# Intermediate and final CSV outputs, in processing order.
parsed_file_path = '../data/out-parsed.csv'
tokenized_file_path = '../data/out-tokenized.csv'
lemmatized_file_path = '../data/out-lemmatized.csv'

In [5]:
# Parse the XML input at xml_file_path and write the result to parsed_file_path,
# which the next stage reads back as a two-column CSV.
# NOTE(review): parse_xml_file is project-local; the original comment says it
# works line by line — confirm in the parser module.
parse_xml_file(xml_file_path, parsed_file_path)

In [6]:
def clear_data(dataframe):
    """Normalise every cell of ``dataframe`` in place and return it.

    Each column is passed through the same pipeline:

    1. missing values (``None``, ``NaN``, ``pd.NA``) become ``None``;
    2. runs of punctuation/whitespace and ``{...}`` spans become one space;
    3. the literal ``&amp;`` becomes ``&``;
    4. ``decode_html_chars`` is applied;
    5. ``clear_empty_string`` is applied.

    Args:
        dataframe: pandas DataFrame whose cells are strings or missing values.

    Returns:
        The same DataFrame object, with every column replaced by its cleaned
        version.
    """
    for col in dataframe:
        dataframe[col] = dataframe[col]\
            .apply(lambda x: None if pd.isna(x) else x) \
            .apply(replace_string_with_pattern,pattern=r"[\\/:.,\-_#()?“„ ]+|{.*}",replace_with=' ') \
            .apply(replace_string_with_pattern,pattern=r"&amp;",replace_with='&') \
            .apply(decode_html_chars) \
            .apply(clear_empty_string)
        # pd.isna is used instead of `x is not np.NaN`: an identity test only
        # matches the NumPy singleton (not every NaN object), and the np.NaN
        # alias no longer exists in NumPy 2.0.

    return dataframe

In [7]:
def tokenize(dataframe):
    """Tokenize every cell of ``dataframe`` in place and return it.

    Each non-``None`` cell is split with ``nltk.tokenize.word_tokenize``; the
    tokens are lowercased and those found in the module-level ``stop_words``
    list are dropped. ``None`` cells become empty lists.

    Args:
        dataframe: pandas DataFrame whose cells are strings or ``None``.

    Returns:
        The same DataFrame object, each cell replaced by a list of lowercase
        tokens.
    """
    # Set membership is O(1); the stop-word list has a few hundred entries.
    stop_set = set(stop_words)

    def _tokenize_cell(data):
        """Return the lowercase, stop-word-free tokens of one cell."""
        if data is None:
            return []
        # Lowercase BEFORE the stop-word test: the stop words are lowercase, so
        # testing the raw token let capitalised ones ("A", "Na") slip through.
        lowered = (token.lower() for token in nltk.tokenize.word_tokenize(data))
        return [token for token in lowered if token not in stop_set]

    for col in dataframe:
        dataframe[col] = dataframe[col].apply(_tokenize_cell)
    return dataframe

In [8]:
def add_unlemmatized_words(dataframe,histogram):
    """Count in ``histogram`` every token for which ``morph.find`` finds nothing.

    Args:
        dataframe: DataFrame whose cells are iterables of tokens.
        histogram: dict mapping token -> count; updated in place.

    Returns:
        None. The result is the mutation of ``histogram``.
    """
    for column_name in dataframe:
        for tokens in dataframe[column_name]:
            for token in tokens:
                if morph.find(token):
                    # The lemmatizer knows this token; nothing to record.
                    continue
                histogram[token] = histogram.get(token, 0) + 1


In [9]:
def lemmatize(dataframe):
    """Lemmatize every cell of ``dataframe`` in place and return it.

    Each cell is expected to be the string form of a token list (as read back
    from the tokenized CSV) and is parsed with ``ast.literal_eval``. For every
    token:

    * if ``morph.find`` returns entries, the first entry's ``'lemma'`` is used;
    * otherwise the token is kept unchanged when its count in
      ``unlemmatized_words_histogram`` is at least ``treshold``;
    * otherwise the token is dropped.

    Tokens absent from the histogram count as 0 occurrences, so they are
    dropped instead of raising ``KeyError``.

    Args:
        dataframe: DataFrame whose cells are string representations of lists.

    Returns:
        The same DataFrame object, each cell replaced by a list of lemmas.
    """
    def _lemma_or_none(token):
        """Return the lemma, the token itself if frequent enough, or None."""
        matches = morph.find(token)
        if matches:
            return matches[0]['lemma']
        if unlemmatized_words_histogram.get(token, 0) >= treshold:
            return token
        return None

    def _lemmatize_cell(tokens):
        """Parse one cell and return its surviving lemmas."""
        candidates = (_lemma_or_none(token) for token in literal_eval(tokens))
        return [lemma for lemma in candidates if lemma is not None]

    for col in dataframe:
        dataframe[col] = dataframe[col].apply(_lemmatize_cell)
    return dataframe

In [10]:
# Process parsed file chunk by chunk: clean, tokenize, count unlemmatizable
# words, and append each chunk to the tokenized CSV.

# Reset state first so re-running this cell is idempotent: otherwise the
# histogram counts would double and the append-mode writes below would
# duplicate rows in the output file.
unlemmatized_words_histogram.clear()
with open(tokenized_file_path, 'w', encoding='UTF-8'):
    pass  # truncate (or create) the output file

for chunk in pd.read_csv(parsed_file_path,sep=",",header=None,names=['0','1'],encoding='UTF-8',chunksize=50):
    chunk = clear_data(chunk)
    chunk = tokenize(chunk)
    add_unlemmatized_words(chunk,unlemmatized_words_histogram)
    # Append: the file was truncated above, so it only holds this run's chunks.
    chunk.to_csv(tokenized_file_path, mode='a', header=False, index=False,encoding='UTF-8')

In [11]:
# Lemmatize the tokenized file chunk by chunk and write the lemmatized CSV.

# Truncate the output first so re-running this cell does not append a second
# copy of every row to the file left by a previous run.
with open(lemmatized_file_path, 'w', encoding='UTF-8'):
    pass  # truncate (or create) the output file

for chunk in pd.read_csv(tokenized_file_path,sep=",",header=None,names=['0','1'],encoding='UTF-8',chunksize=50):
    chunk = lemmatize(chunk)
    # Append: the file was truncated above, so it only holds this run's chunks.
    chunk.to_csv(lemmatized_file_path, mode='a', header=False, index=False,encoding='UTF-8')