**Importy**

In [1]:
import regex
import json, ast
from csv import writer
import pandas as pd 
import numpy as np
from stop_words import get_stop_words

In [2]:
import majka
from majka import Majka
# Create the Majka morphological analyser from the dictionary file ../data/w-lt.sk.fsa.
# `morph.find(word)` is used further down to look up lemmas.
morph = Majka('../data/w-lt.sk.fsa')

# NOTE: the three `|=` lines below are immediately overridden by `morph.flags = 0`,
# so the analyser ends up running with no flags set.
morph.flags |= majka.ADD_DIACRITICS  # find word forms with diacritics
morph.flags |= majka.DISALLOW_LOWERCASE  # do not enable to find lowercase variants
morph.flags |= majka.IGNORE_CASE  # ignore the word case whatsoever
morph.flags = 0  # unset all flags

morph.tags = False  # return just the lemma, do not process the tags
# morph.tags = True  # turn tag processing back on (default)

morph.first_only = True  # return only the first entry
# morph.first_only = False  # return all entries (default)

In [3]:
from corpy.morphodita import Tagger
# Load the MorphoDiTa tagger model from disk.
# NOTE(review): `tagger` is not referenced by any other visible cell - confirm it is still needed.
tagger = Tagger("../data/slovak-morfflex-pdt-170914.tagger")

INFO:corpy.morphodita:Loading tagger.


In [4]:
import nltk
# Download the 'punkt' data package; tokenizer_modded relies on nltk.sent_tokenize /
# nltk.word_tokenize. The call is a no-op when the package is already up to date.
nltk.download('punkt')
# nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rusna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Premenne**

In [5]:
# file containing a single page (for debugging)
debug_filepath = "../data/test_data_1page.xml"
# the complete dump
input_filepath = "../data/skwiki-latest-pages-articles.xml"
# '|'-delimited CSV of extracted links, written by open_read and read back in chunks later
output_filepath = "../data/anchors.csv"
# A word without a lemma is kept by lematizer_modded only when its count in
# hashmap_unlem is greater than this value.
THRESHOLD = 2

**Funkcie**

In [6]:
def open_read(input_filepath,output_filepath,process):
    """Stream a text file line by line through ``process`` into a '|'-delimited CSV.

    The output file is created (or overwritten), the header row
    ``Link|Alt_Text`` is written first, and then ``process(line, csv_writer)``
    is called once for every line of the input file.

    Args:
        input_filepath: path of the UTF-8 encoded input file.
        output_filepath: path of the CSV file to write.
        process: callable taking ``(line, csv_writer)``; it is responsible for
            writing any rows derived from ``line``.
    """
    # newline='' is what the csv module requires for files handed to csv.writer:
    # without it the writer's own '\r\n' terminator is translated again on
    # Windows, which yields '\r\r\n' and therefore empty rows between records.
    with open(input_filepath, encoding='UTF-8') as input_file, \
            open(output_filepath, 'w', encoding='UTF-8', newline='') as output_file:
        csv_writer = writer(output_file, delimiter='|')
        csv_writer.writerow(["Link", "Alt_Text"])
        for line in input_file:
            process(line, csv_writer)

In [7]:
def process(line,csv_writer):
    """Find every ``[[target]]`` / ``[[target|text]]`` link in ``line`` and write it as a CSV row.

    Each match produces one row: ``[target, text]`` when the link carries an
    alternative text, ``[target]`` otherwise.

    Args:
        line: one line of the input text.
        csv_writer: object exposing ``writerow(list)``.
    """
    # Raw string literal: the pattern is full of backslash escapes (\[ \| \]) that
    # are invalid escape sequences in a normal string and trigger a
    # DeprecationWarning / SyntaxWarning on current Python versions.
    regex_patern = r'\[\[([^|\]\]]+)\|?([^|\]\]]+)?\]\]'
    for match in regex.findall(regex_patern, line):
        # findall reports an unmatched optional group as '' - drop it so links
        # without alternative text produce a single-column row.
        csv_writer.writerow([group for group in match if group])

In [8]:
# helper for text clean-up
def replace(text, regex_pattern, substitution):
    """Substitute every match of ``regex_pattern`` in ``text`` with ``substitution``.

    Values that are not strings are returned unchanged.
    """
    return regex.sub(regex_pattern, substitution, text) if isinstance(text, str) else text

In [19]:
# Tokenize text with nltk: text -> sentences -> words (tokens).
def tokenizer_modded(text):
    """Split ``text`` into lower-cased word tokens.

    Args:
        text: a string, a number (converted with ``str``) or a missing value.

    Returns:
        A list of lower-cased tokens with punctuation tokens and the literal
        token ``"nan"`` removed; an empty list when ``text`` is missing
        according to ``pd.isna``.
    """
    if pd.isna(text):
        return []

    if isinstance(text, (int, float)):
        text = str(text)

    # The original filter was `token not in ".,?!...-nan"`, a substring test that
    # also threw away real tokens such as "a", "n", "na" and "an". The "nan"
    # marker is now compared by equality; the substring test is kept for the
    # punctuation part only, so the same punctuation tokens are dropped as before.
    punctuation_run = ".,?!...-"

    return [
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if not (token == "nan" or token in punctuation_run)
    ]

In [10]:
# Lemmatize text and remove stop words, using majka with the Slovak dictionary.
def lematizer_modded(text):
    """Lemmatize a stringified token list and drop Slovak stop words.

    Args:
        text: string holding a Python list literal of tokens (as stored in the
            tokenized CSV); it is parsed with ``ast.literal_eval``.

    Returns:
        List of lemmas. A token without a lemma is kept in its original form
        only when its count in ``hashmap_unlem`` is greater than ``THRESHOLD``.
        Stop words are removed from the result.
    """
    stop_words = set(get_stop_words('sk'))

    lemmas = []
    for word in ast.literal_eval(text):
        morph_word = morph.find(word)
        if morph_word:
            lemmas.append(morph_word[0]['lemma'])
        # .get(): a token that was never counted must be skipped, not raise KeyError.
        elif hashmap_unlem.get(word, 0) > THRESHOLD:
            lemmas.append(word)

    # Build a new list instead of calling remove() while iterating: removing
    # during iteration skips the element following each removed stop word.
    return [lemma for lemma in lemmas if lemma not in stop_words]

In [20]:
def fill_hashmap(text):
    """Count the tokens of ``text`` for which ``morph.find`` returns an empty list.

    Counts are stored in the global ``hashmap_unlem``. A token's counter stops
    growing once it is greater than ``THRESHOLD``.
    """
    for token in text:
        if morph.find(token) != []:
            continue  # a lemma exists, nothing to count

        seen = hashmap_unlem.get(token)
        if seen is None:
            hashmap_unlem[token] = 1
        elif seen <= THRESHOLD:
            hashmap_unlem[token] = seen + 1

    

In [12]:
def process_data(df):
    """Clean every column of ``df`` in place and return ``df``.

    For each cell: missing values become ``None``; the characters
    ``\\ / ( ) : # . _ ? !``, ``{{...}}`` spans and the image-suffix pattern are
    replaced by a space; runs of spaces are then collapsed to a single space.
    Non-string cells pass through ``replace`` unchanged.

    Args:
        df: DataFrame whose columns are overwritten with the cleaned values.

    Returns:
        The same ``df`` object (callers may rely on the in-place update).
    """
    # NOTE(review): the last alternative only matches "px", "png", "svg" and "jpg"
    # appearing together in that order - possibly an alternation was intended.
    noise_pattern = r"[\\\/\(\):#.\_?!]|({{.*}})|\d*px\d*png\d*svg\d*jpg"

    for column in df:
        df[column] = (
            df[column]
            # pd.isna instead of `x is not np.NaN`: the np.NaN alias no longer
            # exists in NumPy 2.0 and an identity test does not catch every NaN.
            .apply(lambda x: None if pd.isna(x) else x)
            .apply(replace, regex_pattern=noise_pattern, substitution=' ')
            .apply(replace, regex_pattern=r"([ ]+)", substitution=' ')
        )

    return df

In [13]:
def tokenize_data(df):
    """Replace every cell of ``df`` with its token list (in place) and return ``df``.

    ``None`` cells become an empty list; every other cell is passed to
    ``tokenizer_modded``.
    """
    def _tokens_or_empty(cell):
        if cell is None:
            return []
        return tokenizer_modded(cell)

    for name in df:
        df[name] = df[name].apply(_tokens_or_empty)

    return df

In [14]:
def lemetize_data(df):
    """Run ``lematizer_modded`` over every cell of ``df`` (in place) and return ``df``."""
    for name in df:
        lemmatized = df[name].apply(lambda cell: lematizer_modded(cell))
        df[name] = lemmatized

    return df

In [15]:
def create_hashmap(df):
    """Feed every cell of every column of ``df`` to ``fill_hashmap``.

    Nothing is returned and ``df`` is not modified; the call exists for the
    counting that ``fill_hashmap`` performs.
    """
    for name in df:
        column_cells = df[name]
        column_cells.apply(lambda cell: fill_hashmap(cell))
        


**Priebeh spracovania**

In [16]:
# Read input_filepath line by line and write the links found by `process` to output_filepath.
open_read(input_filepath,output_filepath,process)

In [22]:
# Process the links CSV chunk by chunk and store the tokenized result incrementally.
hashmap_unlem = {}  # token without a lemma -> occurrence count (filled by fill_hashmap)
for index,chunk in enumerate(pd.read_csv(output_filepath, chunksize=15, delimiter='|')):
    process_data(chunk)  # cleans the chunk in place
    chunk = tokenize_data(chunk)
    create_hashmap(chunk)
    # Start a fresh file on the first chunk and append afterwards; appending on
    # every chunk made each re-run of this cell add duplicate rows to the file.
    chunk.to_csv(
        '../data/tokenized_data.csv',
        mode='w' if index == 0 else 'a',
        index=False,
        header=False,
        encoding='UTF-8',
    )




In [25]:
# Lemmatize the tokenized data chunk by chunk and write the final CSV.
for index,chunk in enumerate(pd.read_csv('../data/tokenized_data.csv',chunksize=15, names=["Link","Alt_Text"])):
    lemetize_data(chunk)  # updates the chunk in place

    # Only the first chunk creates the file and writes the header. With the
    # default header=True on every append, the header row was repeated once per
    # chunk, and re-running the cell appended duplicates.
    first_chunk = index == 0
    chunk.to_csv(
        '../data/done.csv',
        mode='w' if first_chunk else 'a',
        index=False,
        header=first_chunk,
        encoding='UTF-8',
    )