In [55]:
import bisect
import collections.abc
import pickle
import sqlite3
from collections import defaultdict as dd
from typing import List, Dict, Set

import nltk
import numpy as np
from termcolor import colored
from tqdm import tqdm_notebook, tnrange

In [56]:
import collections

class OrderedSet(collections.abc.MutableSet):
    """Mutable set that iterates over its elements in insertion order.

    Elements are kept in a circular doubly linked list whose nodes are
    three-item lists ``[key, prev, next]``; ``self.map`` maps each key to its
    node so membership tests, insertion and removal need no list scan.

    The base class is ``collections.abc.MutableSet``: the ``collections.MutableSet``
    alias no longer exists on Python 3.10+.
    """

    def __init__(self, iterable=None):
        """Create the set, optionally filled with the items of ``iterable``."""
        self.end = end = []
        end += [None, end, end]         # sentinel node for doubly linked list
        self.map = {}                   # key --> [key, prev, next]
        if iterable is not None:
            self |= iterable

    def __len__(self):
        """Return the number of stored elements."""
        return len(self.map)

    def __contains__(self, key):
        """Return True when ``key`` is an element of the set."""
        return key in self.map

    def add(self, key):
        """Append ``key`` at the end of the order; no-op if already present."""
        if key not in self.map:
            end = self.end
            curr = end[1]
            # Link the new node between the current last node and the sentinel.
            curr[2] = end[1] = self.map[key] = [key, curr, end]

    def discard(self, key):
        """Remove ``key`` if present; silently do nothing otherwise."""
        if key in self.map:
            key, prev, nxt = self.map.pop(key)
            # Unlink the node by joining its neighbours.
            prev[2] = nxt
            nxt[1] = prev

    def __iter__(self):
        """Yield the elements from oldest to newest."""
        end = self.end
        curr = end[2]
        while curr is not end:
            yield curr[0]
            curr = curr[2]

    def __reversed__(self):
        """Yield the elements from newest to oldest."""
        end = self.end
        curr = end[1]
        while curr is not end:
            yield curr[0]
            curr = curr[1]

    def pop(self, last=True):
        """Remove and return the newest element (or the oldest if ``last`` is false).

        Raises:
            KeyError: if the set is empty.
        """
        if not self:
            raise KeyError('set is empty')
        key = self.end[1][0] if last else self.end[2][0]
        self.discard(key)
        return key

    def __repr__(self):
        """Return ``ClassName()`` when empty, else ``ClassName([items...])``."""
        if not self:
            return '%s()' % (self.__class__.__name__,)
        return '%s(%r)' % (self.__class__.__name__, list(self))

    def __eq__(self, other):
        """Compare order-sensitively with an OrderedSet, order-insensitively otherwise.

        Non-iterable operands yield ``NotImplemented`` (so ``==`` evaluates to
        False) instead of raising ``TypeError`` from ``set(other)``.
        """
        if isinstance(other, OrderedSet):
            return len(self) == len(other) and list(self) == list(other)
        if not isinstance(other, collections.abc.Iterable):
            return NotImplemented
        return set(self) == set(other)

In [57]:
# Open the SQLite database file; the cells below create the index tables in it
# and the query helpers read from it through a cursor on this connection.
conn = sqlite3.connect("Dane/wikipedyjka.db")

In [58]:
def to_lemma_mapping(path='Dane/polimorfologik-2.1.txt'):
    """Load a ``;``-separated dictionary file into a lookup table.

    For every line, the lowercased first field is appended to the list stored
    under the lowercased second field (presumably ``lemma;form;...`` -- confirm
    against the dictionary's documentation).

    Args:
        path: Location of the UTF-8 encoded dictionary file.

    Returns:
        A ``defaultdict(list)``; looking up an unknown key yields ``[]``.
    """
    all_lemmas = dd(list)
    # The context manager closes the file even if parsing fails part-way.
    with open(path, encoding='utf-8') as dictionary_file:
        for line in dictionary_file:
            fields = line.rstrip('\n').split(';')
            if len(fields) < 2:
                # Blank or malformed line: nothing to map.
                continue
            all_lemmas[fields[1].lower()].append(fields[0].lower())
    return all_lemmas
# Build the dictionary lookup once; later cells index it by lowercased token.
lemma_mapping = to_lemma_mapping()

In [59]:
def _load_wiki_articles(path):
    """Read the article dump at ``path`` into a list of ``(title, lines)`` tuples.

    A line starting with ``"TITLE: "`` opens a new article; everything after
    that prefix (minus the trailing newline) is the title, so titles that
    themselves contain ``": "`` are kept whole.  Every other line is appended,
    unmodified, to the most recently opened article.  Lines that appear before
    the first title have no article to belong to and are skipped.
    """
    articles = []
    # NOTE(review): assumes the dump is UTF-8 like the dictionary file -- confirm.
    with open(path, encoding="utf-8") as wiki_file:
        for line in wiki_file:
            marker, separator, rest = line.partition(": ")
            if marker == "TITLE" and separator:
                articles.append((rest.rstrip("\n"), []))
            elif articles:
                articles[-1][1].append(line)
    return articles


wiki_list = _load_wiki_articles("Dane/fp_wiki.txt")

In [60]:
# Build the positional index: term -> list of global token positions.
# `cntr` is a single running position across the whole corpus; it advances by
# one at the start of every article (that value is recorded in `articles_pos`)
# and by one after every token.
cntr = 0
positional_index = dd(list)
articles_pos = []
for title, article in wiki_list:
    cntr += 1
    articles_pos.append(cntr)
    for line in article:
        for w in nltk.word_tokenize(line):
            w = w.lower()
            # .get() avoids inserting an empty entry into the defaultdict for
            # every token that is missing from the dictionary.
            lemmas = lemma_mapping.get(w)
            if lemmas:
                # Only tokens with a dictionary entry are indexed.  The surface
                # form and each distinct lemma get this position exactly once.
                for term in dict.fromkeys([w, *lemmas]):
                    positional_index[term].append(cntr)
            cntr += 1

In [None]:
# Persist the index: one table `tab_<term>_` per indexed term, holding that
# term's token positions.  `c` is reused by the query helpers further down.
c = conn.cursor()
for k, v in positional_index.items():
    # The term becomes part of a table name, so only purely alphanumeric terms
    # are stored; everything else is left out of the database.
    if not k.isalnum():
        continue
    c.execute(f'''DROP TABLE IF EXISTS tab_{k}_''')
    c.execute(f'''CREATE TABLE tab_{k}_
             (position INTEGER)''')
    # Bind the values as parameters and insert them in one batch instead of
    # formatting every position into its own SQL statement.
    c.executemany(f"INSERT INTO tab_{k}_ VALUES (?)", ((ind,) for ind in v))
conn.commit()

In [None]:
# Store the starting token position of every article in `articles_tab`.
c.execute('''DROP TABLE IF EXISTS articles_tab''')
c.execute('''CREATE TABLE articles_tab
             (position INTEGER)''')
c.executemany("INSERT INTO articles_tab VALUES (?)", ((ind,) for ind in articles_pos))
# Without a commit the inserted rows are not written to the database file.
conn.commit()

In [80]:
def word_query(w, i = 0):
    """Collect the stored positions of ``w`` and of its dictionary lemmas.

    Args:
        w: Lowercased query token.
        i: Offset subtracted from every position, so that hits for the i-th
           word of a phrase line up with the position of the phrase's first word.

    Returns:
        An ``OrderedSet`` of shifted positions; empty when nothing is stored
        for the token or any of its lemmas.
    """
    res = OrderedSet()
    # .get() keeps the lookup from adding unknown tokens to the defaultdict;
    # dict.fromkeys() drops repeated terms so each table is read only once.
    for term in dict.fromkeys([w, *lemma_mapping.get(w, ())]):
        # Tables are only created for alphanumeric terms, and the term is
        # spliced into the SQL text, so anything else is skipped outright.
        if not term.isalnum():
            continue
        try:
            for el in c.execute(f'SELECT * FROM tab_{term}_'):
                res.add(el[0] - i)
        except sqlite3.Error:
            # Typically "no such table": the term was never indexed.
            pass
    return res

In [None]:
def phrase_query(tokens):
    """Return the start positions at which all ``tokens`` occur consecutively.

    The hits of the first token seed the result; the hits of each following
    token, shifted back by its index in the phrase, are intersected into it.
    An empty token list yields an empty ``OrderedSet``.
    """
    matches = OrderedSet()
    for offset, token in enumerate(tokens):
        if offset:
            # Later words: keep only starts that also match at this offset.
            matches &= word_query(token, offset)
        else:
            # First word: every occurrence is a candidate phrase start.
            matches |= word_query(token)
    return matches

In [None]:
def binary_search(array, target):
    """Return the index of the last element of sorted ``array`` that is <= ``target``.

    Used to map a token position to the article whose start position precedes
    it.  Returns 0 when ``array`` is empty or when ``target`` is smaller than
    every element.

    Args:
        array: Sequence sorted in ascending order.
        target: Value to locate.

    Returns:
        An integer index into ``array`` (0 for the degenerate cases above).
    """
    # bisect_right gives the insertion point after any equal elements, so the
    # element just before it is the greatest one not exceeding the target.
    return max(bisect.bisect_right(array, target) - 1, 0)

In [None]:
# Spot check: show the (title, lines) entry of the article that binary_search
# selects for token position 35048823.
wiki_list[binary_search(articles_pos, 35048823)]

In [78]:
def to_lemmas(tokens):
    """Expand ``tokens`` into a flat list of lowercased tokens and their lemmas.

    Each token contributes itself (lowercased) followed by every entry that
    ``lemma_mapping`` holds for it, in input order.
    """
    expanded = []
    for token in tokens:
        lowered = token.lower()
        expanded.extend([lowered, *lemma_mapping[lowered]])
    return expanded

def in_lemmas(w, lemmas):
    """Tell whether ``w`` (case-insensitively) or one of its lemmas is in ``lemmas``."""
    word = w.lower()
    # The dictionary is consulted only when the word itself is not a match.
    return word in lemmas or any(base in lemmas for base in lemma_mapping[word])


def _print_highlighted(text, lemmas):
    """Print the tokens of ``text`` separated by spaces, query matches in green."""
    for w in nltk.word_tokenize(text):
        if in_lemmas(w, lemmas):
            print(colored(w, "green"), end=" ")
        else:
            print(w, end=" ")


def search_and_print(query):
    """Run a phrase query and print every matching article with highlights.

    The query is tokenized and lowercased, matched as a phrase against the
    positional index, and each hit position is mapped to its article.  The
    list of article indices is printed first, then each article's title and
    lines, with tokens that match a query word (or share a lemma with one)
    coloured green.

    Args:
        query: Phrase to search for.
    """
    tokens = [t.lower() for t in nltk.word_tokenize(query)]
    article_set = phrase_query(tokens)
    # A set makes the per-token membership test constant-time.
    lemmas = set(to_lemmas(tokens))
    # dict.fromkeys de-duplicates while keeping the order in which hits were
    # found, so the articles are printed in a stable order.
    articles = list(dict.fromkeys(binary_search(articles_pos, el) for el in article_set))
    print(articles)
    for el in articles:
        title, article = wiki_list[el]
        _print_highlighted(title, lemmas)
        print("\n")
        for line in article:
            _print_highlighted(line, lemmas)
            print("")
        print("")

In [81]:
# Example phrase query; the printed output lists the matching article indices
# followed by each article, with matching tokens coloured green.
search_and_print("ciągła zmienna losowa")

[901961, 364059]
Funkcja osobliwa 

Funkcja osobliwa 
Funkcja osobliwa ( określana również jako ) – dowolna funkcja ƒ ( `` x `` ) , określona dla przedziału [ `` a `` , `` b `` ] , posiadająca następujące właściwości : 
Klasycznym przykładem funkcji osobliwej jest funkcja Cantora , nazywana czasami diabelskimi schodami . Istnieją jednak również inne funkcje tak nazywane . Jedna z nich jest określona przez odwzorowanie koliste . 
Jeśli ƒ ( `` x `` ) = 0 dla wszystkich `` x `` ≤ `` a `` oraz ƒ ( `` x `` ) = 1 dla wszystkich `` x `` ≥ `` b `` , to można założyć , że dana funkcja przedstawia dystrybuantę dla [32mzmiennej[0m [32mlosowej[0m , która ani nie jest cząstkową [32mzmienną[0m [32mlosową[0m ( gdyż prawdopodobieństwo wynosi zero w każdym punkcie ) ani absolutnie [32mciągłą[0m [32mzmienną[0m [32mlosową[0m ( gdyż gęstość prawdopodobieństwa jest zerowa wszędzie , gdzie jest określona ) . 


Dyskretyzacja ( statystyka ) 

Dyskretyzacja ( statystyka ) 
Dyskretyzacja – przeks