In [1]:
import nltk
import pandas as pd
import numpy as np
from parser import parse_xml_file
from stop_words import get_stop_words
from string_helper import replace_string_with_pattern, clear_empty_string, decode_html_chars
import majka


In [2]:
# configuration
DEBUG = True  # chooses which XML input file the path cell selects

# Fetch the NLTK 'punkt' tokenizer data only when it is not installed yet.
# NOTE(review): recent NLTK releases look up 'punkt_tab' instead of 'punkt' —
# confirm against the NLTK version this notebook is run with.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

stop_words = get_stop_words('cz')

morph = majka.Majka('../wordlists/majka.w-lt')
# All lookup flags are cleared. This cell used to OR in majka.ADD_DIACRITICS,
# majka.DISALLOW_LOWERCASE and majka.IGNORE_CASE first, but they were wiped by
# this reset before any lookup ran, so they never had an effect. To really
# enable one, OR it in *after* this line (e.g. `morph.flags |= majka.IGNORE_CASE`).
morph.flags = 0  # unset all flags
morph.tags = False  # return just the lemma, do not process the tags
morph.first_only = True  # return only the first entry

In [3]:
# Input XML and output CSV locations.
# With DEBUG on, the test XML is used instead of the cswiki pages-articles dump.

if DEBUG:
    xml_file_path = '../data/test.xml'
else:
    xml_file_path = '../data/cswiki-latest-pages-articles.xml'

csv_file_path = '../data/parsed_output.csv'

In [4]:
# Run the project's XML parser on the selected input file.
# NOTE(review): presumably this writes its result to csv_file_path, which the
# final cell reads back with pd.read_csv — confirm in parser.parse_xml_file.
parse_xml_file(xml_file_path, csv_file_path)

In [5]:
def clear_data(dataframe):
    """Clean every cell of ``dataframe`` in place and return the same frame.

    Each cell goes through the following steps, in this order:

    1. a missing value (``None`` or any ``NaN``) is turned into ``None``;
    2. ``replace_string_with_pattern`` is called with the separator/brace
       pattern and ``replace_with=' '``;
    3. ``replace_string_with_pattern`` is called with the ``&amp;`` pattern
       and ``replace_with='&'``;
    4. ``decode_html_chars`` is called;
    5. ``clear_empty_string`` is called.

    The helpers therefore have to accept ``None`` as input.

    Args:
        dataframe: pandas DataFrame whose cells are scalars (text or missing).
            Its columns are overwritten with the cleaned values.

    Returns:
        The very same ``dataframe`` object that was passed in.
    """

    def clean_cell(value):
        # pd.isna recognises None and *every* NaN object. The former identity
        # test against the np.nan singleton missed NaNs that pandas boxes as
        # fresh floats (e.g. cells of a float64 column), and the `np.NaN`
        # alias it relied on no longer exists in NumPy 2.
        if pd.isna(value):
            value = None
        value = replace_string_with_pattern(
            value, pattern=r"[\\/:.,\-_#() ]+|{.*}", replace_with=' ')
        value = replace_string_with_pattern(
            value, pattern=r"&amp;", replace_with='&')
        value = decode_html_chars(value)
        return clear_empty_string(value)

    # One pass per column: running all steps inside a single function keeps
    # pandas from re-inferring the dtype between steps, which could silently
    # turn the None placeholder back into NaN.
    for col in dataframe:
        dataframe[col] = dataframe[col].apply(clean_cell)

    return dataframe

In [6]:
def tokenize(dataframe):
    """Replace every text cell by its list of tokens, minus stop words.

    Cells are split with ``nltk.tokenize.word_tokenize``. A token is dropped
    when it, or its lower-cased form, is contained in the module-level
    ``stop_words`` collection, so a capitalised stop word (for instance at the
    start of a sentence) is removed as well. Tokens that are kept retain their
    original spelling and case.

    Args:
        dataframe: pandas DataFrame whose cells are strings or missing values
            (``None``/``NaN``). Its columns are overwritten in place.

    Returns:
        The same ``dataframe`` object; every cell is now a list of strings,
        and missing cells have become empty lists.
    """
    # Built once per call: hashing beats scanning the stop-word list for
    # every single token.
    known_stop_words = frozenset(stop_words)

    def is_stop_word(token):
        return token in known_stop_words or token.lower() in known_stop_words

    def tokenize_cell(data):
        # Missing cells (None, or NaN where pandas converted it) have no tokens.
        if pd.isna(data):
            return []
        return [
            token
            for token in nltk.tokenize.word_tokenize(data)
            if not is_stop_word(token)
        ]

    for col in dataframe:
        dataframe[col] = dataframe[col].apply(tokenize_cell)
    return dataframe

In [7]:
def lemmatize(dataframe):
    """Map every token list in ``dataframe`` to a list of lemmas.

    For each token, ``morph.find`` is queried once. If it returns a non-empty
    result, the ``'lemma'`` entry of the first item is kept; tokens for which
    the result is empty are left out of the output list.

    Args:
        dataframe: pandas DataFrame whose cells are iterables of tokens.
            Its columns are overwritten in place.

    Returns:
        The same ``dataframe`` object, with every cell replaced by its list
        of lemmas.
    """

    def first_lemmas(tokens):
        lemmas = []
        for token in tokens:
            analyses = morph.find(token)
            if analyses:
                lemmas.append(analyses[0]['lemma'])
        return lemmas

    for column in dataframe.columns:
        dataframe[column] = dataframe[column].apply(first_lemmas)
    return dataframe

In [8]:
# TODO: Uncomment reading by chunks
# for chunk in pd.read_csv(csv_file_path,sep=",",header=None,chunksize=50):
#     chunk = clear_data(chunk)
#     chunk = tokenize(chunk)
#     chunk = lemmatize(chunk)

In [9]:
# TODO: Remove reading into memory
# Read the whole CSV (no header row) and pass it through the three
# preprocessing stages in order: cleaning, tokenizing, lemmatizing.
df = (
    pd.read_csv(csv_file_path, sep=",", header=None)
    .pipe(clear_data)
    .pipe(tokenize)
    .pipe(lemmatize)
)