In [1]:
import nltk
import pandas as pd
import numpy as np
from parser import parse_xml_file
from stop_words import get_stop_words
from string_helper import replace_string_with_pattern, clear_empty_string, decode_html_chars
import majka

In [2]:
# Configuration
DEBUG = True # Whether to use test dataset or full data
PRESERVE_UNRECOGNIZED_WORDS = True # Preserve words not found in lemma wordlist
PRESERVE_NUMBERS = True # Preserve numbers in dataset

# Make sure the Punkt tokenizer data needed by nltk.tokenize.word_tokenize is installed.
# Older NLTK releases load the 'punkt' resource, recent ones load 'punkt_tab' instead,
# so both are checked and only the missing ones are downloaded.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{tokenizer_resource}')
    except LookupError:
        nltk.download(tokenizer_resource)

# Czech stopwords
stop_words = get_stop_words('cz')

# Czech lemmatizer Majka
morph = majka.Majka('../wordlists/majka.w-lt')
# All lookup flags stay disabled. majka.ADD_DIACRITICS, majka.DISALLOW_LOWERCASE or
# majka.IGNORE_CASE can be OR-ed into morph.flags here if such matching is wanted.
morph.flags = 0
morph.tags = False  # Return just the lemma
morph.first_only = True  # Return only the first entry

In [3]:
# Input and output locations; DEBUG selects the small test file instead of the full dump
if DEBUG:
    xml_file_path = '../data/test.xml'
else:
    xml_file_path = '../data/cswiki-latest-pages-articles.xml'

parsed_file_path = '../data/out-parsed.csv'  # written by the parsing step
lemmatized_file_path = '../data/out-lemmatized.csv'  # written by the processing loop

In [4]:
# Parse the XML input file and write the parsed output to parsed_file_path.
# parse_xml_file comes from the local parser module (not shown here); the original
# note states that it processes the input line by line.
parse_xml_file(xml_file_path, parsed_file_path)

In [5]:
def clear_data(dataframe):
    """Clean the text in every column of ``dataframe`` and return it.

    The frame is modified in place; the same object is returned for convenience.
    Each cell is passed through the following steps, in this order:

    1. Missing values (``None``, NaN, ``pd.NA``) are normalised to ``None``.
    2. Runs of separator/punctuation characters, and ``{...}`` spans, are
       replaced with a single space.
    3. ``&amp;`` is replaced with ``&``.
    4. Digits are removed unless ``PRESERVE_NUMBERS`` is set.
    5. ``decode_html_chars`` and then ``clear_empty_string`` are applied.

    Args:
        dataframe: pandas DataFrame whose columns hold the raw text.

    Returns:
        The same DataFrame with every column replaced by its cleaned version.
    """
    # NOTE(review): step 2 removes '#' before decode_html_chars runs, so numeric
    # entities such as '&#39;' may no longer be decodable - confirm the intended order.
    for col in dataframe:
        dataframe[col] = dataframe[col]\
        .apply(lambda x: None if pd.isna(x) else x) \
        .apply(replace_string_with_pattern,pattern=r"[\\/:.,\-_#()“„ ]+|{.*}",replace_with=' ') \
        .apply(replace_string_with_pattern,pattern=r"&amp;",replace_with='&') \
        .apply(lambda x: x if PRESERVE_NUMBERS else replace_string_with_pattern(x,pattern=r"[0-9]",replace_with='')) \
        .apply(decode_html_chars) \
        .apply(clear_empty_string)

    return dataframe

In [6]:
def tokenize(dataframe):
    """Split every cell of ``dataframe`` into word tokens without stop words.

    The frame is modified in place; the same object is returned for convenience.
    Missing cells (``None`` or NaN) become an empty list. Text cells are
    tokenized with NLTK's ``word_tokenize`` using the Czech Punkt model, and
    tokens contained in the module-level ``stop_words`` collection are removed.

    Args:
        dataframe: pandas DataFrame whose cells are strings or missing values.

    Returns:
        The same DataFrame with every cell replaced by a list of tokens.
    """
    # A set makes each membership test constant-time instead of a list scan.
    stop_word_set = set(stop_words)

    def to_tokens(text):
        # pd.isna also catches NaN, which an "is not None" check lets through.
        if pd.isna(text):
            return []
        tokens = nltk.tokenize.word_tokenize(text, language='czech')
        # NOTE(review): the comparison is case-sensitive, so a capitalised stop
        # word is kept unless the stop-word list contains that exact form.
        return [token for token in tokens if token not in stop_word_set]

    for col in dataframe:
        dataframe[col] = dataframe[col].apply(to_tokens)
    return dataframe

In [7]:
def lemmatize(dataframe):
    """Replace every token in ``dataframe`` with its lemma from Majka.

    The frame is modified in place; the same object is returned for convenience.
    Each cell is expected to hold a list of tokens. For every token the first
    result of ``morph.find`` supplies the lemma. A token with no result is kept
    unchanged when ``PRESERVE_UNRECOGNIZED_WORDS`` is true and is dropped from
    the list otherwise (no ``None`` placeholder is left behind).

    Args:
        dataframe: pandas DataFrame whose cells are lists of string tokens.

    Returns:
        The same DataFrame with every cell replaced by a list of lemmas.
    """
    def lemmatize_tokens(tokens):
        lemmas = []
        for token in tokens:
            matches = morph.find(token)
            if matches:
                lemmas.append(matches[0]['lemma'])
            elif PRESERVE_UNRECOGNIZED_WORDS:
                lemmas.append(token)
            # Otherwise the unrecognised token is intentionally skipped.
        return lemmas

    for col in dataframe:
        dataframe[col] = dataframe[col].apply(lemmatize_tokens)
    return dataframe

In [8]:
# Process parsed file chunk by chunk (50 rows at a time) to keep memory use bounded.
# header=2 makes pandas use the third line of the file as the header row and skip
# the two lines before it - NOTE(review): confirm this matches parse_xml_file's output.
chunk_reader = pd.read_csv(parsed_file_path,sep=",",header=2,encoding='UTF-8',chunksize=50)
for chunk_number, chunk in enumerate(chunk_reader):
    chunk = clear_data(chunk)
    chunk = tokenize(chunk)
    chunk = lemmatize(chunk)
    # The first chunk overwrites any output left from an earlier run so that re-running
    # this cell does not append duplicate rows; later chunks are appended.
    write_mode = 'w' if chunk_number == 0 else 'a'
    chunk.to_csv(lemmatized_file_path, mode=write_mode, header=False, index=False,encoding='UTF-8')