# Preprocessing

In [1]:
import collections.abc
import os
import pickle
from collections import defaultdict as dd
from typing import List, Dict, Set

import nltk
import numpy as np
from termcolor import colored
from tqdm import tqdm_notebook, tnrange


In [2]:
import collections

class OrderedSet(collections.abc.MutableSet):
    """A mutable set that remembers the order in which keys were first added.

    Keys are stored in a circular doubly linked list of ``[key, prev, next]``
    nodes anchored at a sentinel node (``self.end``), plus a dict mapping each
    key to its node, so ``add``, ``discard`` and membership tests do not scan
    the list.  The remaining set operators (``|``, ``&``, ``-``, ``|=`` ...)
    come from ``collections.abc.MutableSet``.

    The base class is ``collections.abc.MutableSet``: the old
    ``collections.MutableSet`` alias no longer exists on Python 3.10+.
    """

    def __init__(self, iterable=None):
        """Create an empty set, then add the items of *iterable* in order."""
        self.end = end = []
        end += [None, end, end]         # sentinel node for doubly linked list
        self.map = {}                   # key --> [key, prev, next]
        if iterable is not None:
            self |= iterable

    def __len__(self):
        return len(self.map)

    def __contains__(self, key):
        return key in self.map

    def add(self, key):
        """Append *key* at the end unless it is already present."""
        if key not in self.map:
            end = self.end
            curr = end[1]
            # Link the new node between the current last node and the sentinel.
            curr[2] = end[1] = self.map[key] = [key, curr, end]

    def discard(self, key):
        """Remove *key* if present; do nothing otherwise."""
        if key in self.map:
            key, prev, nxt = self.map.pop(key)
            prev[2] = nxt
            nxt[1] = prev

    def __iter__(self):
        """Yield keys from first added to last added."""
        end = self.end
        curr = end[2]
        while curr is not end:
            yield curr[0]
            curr = curr[2]

    def __reversed__(self):
        """Yield keys from last added to first added."""
        end = self.end
        curr = end[1]
        while curr is not end:
            yield curr[0]
            curr = curr[1]

    def pop(self, last=True):
        """Remove and return the last key (or the first when *last* is false).

        Raises:
            KeyError: if the set is empty.
        """
        if not self:
            raise KeyError('set is empty')
        key = self.end[1][0] if last else self.end[2][0]
        self.discard(key)
        return key

    def __repr__(self):
        if not self:
            return '%s()' % (self.__class__.__name__,)
        return '%s(%r)' % (self.__class__.__name__, list(self))

    def __eq__(self, other):
        """Compare order-sensitively with another OrderedSet, else as plain sets."""
        if isinstance(other, OrderedSet):
            return len(self) == len(other) and list(self) == list(other)
        try:
            return set(self) == set(other)
        except TypeError:
            # *other* is not iterable (or holds unhashable items): let Python
            # fall back to its default comparison instead of raising.
            return NotImplemented


  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
# Module-level scratch list. Wikipedia.preprocess appends its in-memory index
# structures here (titles, titles_lemmas, articles, articles_lemmas, words).
cache = []

In [7]:
class Wikipedia:
    """Article list held in memory plus a per-word index stored on disk.

    Articles are read from ``Dane/fp_wiki.txt`` and word forms are mapped to
    lemmas using ``Dane/polimorfologik-2.1.txt``.  ``preprocess`` writes one
    file per lower-cased token under ``Database/`` holding four posting lists
    of article ids (positions in ``self.list``):

        0. titles containing the exact word,
        1. titles containing a word with the same lemma,
        2. article bodies containing the exact word,
        3. article bodies containing a word with the same lemma.

    ``search`` reads those files back and ranks matches in that order.

    NOTE(review): tokens are used verbatim as file names under ``Database/``;
    a token containing a path separator will not map to a plain file there.
    """

    def __init__(self):
        """Load the lemma dictionary and the article dump into memory.

        A line starting with ``TITLE: `` opens a new article; every other
        line is appended to the body of the most recently opened article.
        """
        self.lemmas = self._to_lemma_mapping()
        # (title, body_lines) pairs; the position in this list is the article id.
        self.list = []
        # Read as UTF-8, matching how the lemma dictionary is opened.
        with open("Dane/fp_wiki.txt", encoding="utf-8") as f:
            for line in f:
                head, sep, rest = line.partition(": ")
                if sep and head == "TITLE":
                    # Keep the whole remainder so titles that themselves
                    # contain ": " are not truncated.
                    title = rest[:-1] if rest.endswith("\n") else rest
                    self.list.append((title, []))
                    continue
                if self.list:
                    # Lines before the first title belong to no article.
                    self.list[-1][1].append(line)

    def _to_lemma_mapping(self) -> Dict[str, str]:
        """Build a ``word form -> lemma`` mapping (both lower-cased).

        Each dictionary line is ``lemma;form;...``.  The first lemma seen for
        a form is kept, unless a later line maps the form to itself, which
        takes precedence.
        """
        all_lemmas = {}
        with open('Dane/polimorfologik-2.1.txt', encoding='utf-8') as f:
            for line in f:
                fields = line.split(';')
                if len(fields) < 2:
                    # Blank or malformed line: nothing to map.
                    continue
                lemma = fields[0].lower()
                form = fields[1].lower()
                if form not in all_lemmas or lemma == form:
                    all_lemmas[form] = lemma
        return all_lemmas

    def _to_lemmas(self, words):
        """Return the lemma of each word, lower-cased.

        Words missing from the lemma dictionary map to their own lower-cased
        form.
        """
        lowered = (w.lower() for w in words)
        return [self.lemmas.get(w, w) for w in lowered]

    def _load_word(self, w, alternative_load=True):
        """Read the four posting lists stored for *w*.

        Args:
            w: the token whose index file ``Database/<w>`` should be read.
            alternative_load: if true, read the text format (one line of
                space-separated ids per posting list); otherwise unpickle.

        Returns:
            A list of OrderedSets, one per stored posting list (four empty
            sets when no file exists).  A list rather than a tuple, because
            ``_load`` assigns into it.
        """
        path = f"Database/{w}"
        if not os.path.isfile(path):
            return [OrderedSet(), OrderedSet(), OrderedSet(), OrderedSet()]
        if alternative_load:
            with open(path, "r") as ifile:
                # split() without arguments ignores the trailing space and
                # newline written by preprocess and copes with empty lines.
                return [OrderedSet(int(el) for el in line.split())
                        for line in ifile]
        # NOTE: pickle must only be used on index files this program wrote.
        with open(path, "rb") as ifile:
            return list(pickle.load(ifile))

    def _load(self, w, alternative_load=True):
        """Read the posting lists for *w*, widened with those of its lemma.

        If *w* has a dictionary lemma, the lemma's title hits are merged into
        posting list 1 and its body hits into posting list 3.
        """
        res = self._load_word(w, alternative_load)
        if w not in self.lemmas:
            return res
        tmp = self._load_word(self.lemmas[w], alternative_load)
        res[1] |= tmp[0] | tmp[1]
        res[3] |= tmp[2] | tmp[3]
        return res

    def search(self, query):
        """Return ids of articles matching every token of *query*, best first.

        Ranking tiers, in order: all tokens appear exactly in the title; all
        appear in the title up to lemma; all appear in title or body exactly;
        all appear in title or body up to lemma.  An article is listed once,
        in its best tier.
        """
        tokenized_query = nltk.word_tokenize(query)
        vbs = [self._load(t.lower()) for t in tokenized_query]
        if not vbs:
            return []
        first = vbs[0]
        # Each tier is cumulative: it contains everything from stricter tiers.
        res1 = first[0]
        res2 = first[1] | first[0]
        res3 = first[2] | first[1] | first[0]
        res4 = first[3] | first[2] | first[1] | first[0]

        for el in vbs[1:]:
            res1 &= el[0]
            res2 &= el[1] | el[0]
            res3 &= el[2] | el[1] | el[0]
            res4 &= el[3] | el[2] | el[1] | el[0]

        # res1 <= res2 <= res3 <= res4, so removing the previous tier is
        # enough to drop everything already reported.
        return (list(res1) + list(res2 - res1)
                + list(res3 - res2) + list(res4 - res3))

    def _print_highlighted(self, text, query_lemmas):
        """Print the tokens of *text* on one line, colouring query matches green.

        A token matches when its lemma (or the token itself, lower-cased, if
        it has no dictionary lemma) is one of *query_lemmas*.
        """
        for w in nltk.word_tokenize(text):
            key = w.lower()
            if self.lemmas.get(key, key) in query_lemmas:
                print(colored(w, "green"), end=" ")
            else:
                print(w, end=" ")

    def search_and_print(self, query):
        """Run ``search`` and print each hit's title and body with matches highlighted."""
        articles = self.search(query)
        tokens = nltk.word_tokenize(query)
        lemmas = set(self._to_lemmas(tokens))
        for el in articles:
            title, article = self.list[el]
            self._print_highlighted(title, lemmas)
            print("\n")
            for line in article:
                self._print_highlighted(line, lemmas)
                print("")
            print("")

    def _index_tokens(self, tokens, doc_id, exact, by_lemma, words):
        """Record *doc_id* under every token (lower-cased) and every token's lemma."""
        for w in tokens:
            w = w.lower()
            words.add(w)
            exact[w].add(doc_id)
        for lemma in self._to_lemmas(tokens):
            by_lemma[lemma].add(doc_id)

    def preprocess(self, alternative_dump=True):
        """Build the index from ``self.list`` and write one file per word.

        Args:
            alternative_dump: if true, write the text format (four lines of
                space-separated article ids); otherwise pickle a 4-tuple of
                OrderedSets.

        Side effects:
            Appends the four in-memory indexes and the vocabulary set to the
            module-level ``cache`` list and writes files under ``Database/``
            (the directory is created if missing).
        """
        global cache
        titles = dd(OrderedSet)
        titles_lemmas = dd(OrderedSet)
        articles = dd(OrderedSet)
        articles_lemmas = dd(OrderedSet)
        words = set()

        for i in tnrange(len(self.list)):
            title, article = self.list[i]
            self._index_tokens(nltk.word_tokenize(title), i,
                               titles, titles_lemmas, words)
            self._index_tokens(nltk.word_tokenize("".join(article)), i,
                               articles, articles_lemmas, words)

        cache.append(titles)
        cache.append(titles_lemmas)
        cache.append(articles)
        cache.append(articles_lemmas)
        cache.append(words)

        os.makedirs("Database", exist_ok=True)
        for w in tqdm_notebook(words):
            lemma_w = self.lemmas.get(w, w)
            # Both lemma posting lists are keyed by the word's lemma, so the
            # text and pickle formats store the same data.
            postings = (
                titles[w],
                titles_lemmas[lemma_w],
                articles[w],
                articles_lemmas[lemma_w],
            )
            if alternative_dump:
                with open(f"Database/{w}", "w") as ofile:
                    for ids in postings:
                        ofile.write("".join(f"{el} " for el in ids))
                        ofile.write("\n")
            else:
                with open(f"Database/{w}", "wb") as ofile:
                    pickle.dump(postings, ofile)
            
        

In [8]:
# Construct the searcher; the constructor reads the lemma dictionary and the
# article dump from the Dane/ directory.
wiki = Wikipedia()

In [None]:
# Build the per-word index files under Database/ (text format by default).
wiki.preprocess()



HBox(children=(FloatProgress(value=0.0, max=1208362.0), HTML(value='')))

In [None]:
# Bare expression: evaluates the instance (a notebook shows it as cell output).
wiki