In [None]:
import numpy as np
from collections import defaultdict as dd
from typing import List, Dict, Set
import nltk
from tqdm import tqdm_notebook, tnrange
import pickle
from termcolor import colored
import sqlite3

In [None]:
import collections

class OrderedSet(collections.MutableSet):

    def __init__(self, iterable=None):
        self.end = end = [] 
        end += [None, end, end]         # sentinel node for doubly linked list
        self.map = {}                   # key --> [key, prev, next]
        if iterable is not None:
            self |= iterable

    def __len__(self):
        return len(self.map)

    def __contains__(self, key):
        return key in self.map

    def add(self, key):
        if key not in self.map:
            end = self.end
            curr = end[1]
            curr[2] = end[1] = self.map[key] = [key, curr, end]

    def discard(self, key):
        if key in self.map:        
            key, prev, next = self.map.pop(key)
            prev[2] = next
            next[1] = prev

    def __iter__(self):
        end = self.end
        curr = end[2]
        while curr is not end:
            yield curr[0]
            curr = curr[2]

    def __reversed__(self):
        end = self.end
        curr = end[1]
        while curr is not end:
            yield curr[0]
            curr = curr[1]

    def pop(self, last=True):
        if not self:
            raise KeyError('set is empty')
        key = self.end[1][0] if last else self.end[2][0]
        self.discard(key)
        return key

    def __repr__(self):
        if not self:
            return '%s()' % (self.__class__.__name__,)
        return '%s(%r)' % (self.__class__.__name__, list(self))

    def __eq__(self, other):
        if isinstance(other, OrderedSet):
            return len(self) == len(other) and list(self) == list(other)
        return set(self) == set(other)

In [None]:
conn = sqlite3.connect("Dane/wikipedyjka2.db")

In [None]:
def to_lemma_mapping():
    all_lemmas = dd(list)
    for line in open('Dane/polimorfologik-2.1.txt', encoding='utf-8'):
        L = line.split(';')[:2]
        all_lemmas[L[1].lower()].append(L[0].lower())
    return all_lemmas
lemma_mapping = to_lemma_mapping()

In [54]:
wiki_list = []
key = ""
with open("Dane/fp_wiki.txt") as f:
    cntr = -1
    for line in f:
        tokenized_line = line.split(": ")
        if tokenized_line[0] == "TITLE":
            if tokenized_line[-1][-1] == "\n":
                tokenized_line[-1] = tokenized_line[-1][:-1]
            cntr+=1
            wiki_list.append((tokenized_line[-1], []))
            continue
        wiki_list[-1][1].append(line)

In [55]:
cntr1 = 0
title_positional_index = dd(list)
article_positional_index = dd(list)
title_lemma_positional_index = dd(list)
article_lemma_positional_index = dd(list)

for title, article in wiki_list:
    tokenized_line = nltk.word_tokenize(title)
    cntr2 = 0
    for w in tokenized_line:
        w = w.lower()
        title_positional_index[w].append((cntr1, cntr2))
        for lemma in lemma_mapping[w]:
            title_lemma_positional_index[lemma].append((cntr1, cntr2))
        cntr2 +=1
    cntr2 = 0
    for line in article:
        tokenized_line = nltk.word_tokenize(line)
        for w in tokenized_line:
            w = w.lower()
            article_positional_index[w].append((cntr1, cntr2))
            for lemma in lemma_mapping[w]:
                article_lemma_positional_index[lemma].append((cntr1, cntr2))
            cntr2 += 1
    cntr1 += 1

In [None]:
c = conn.cursor()
for k, v in title_positional_index.items():
    if not k.isalnum():
        continue
    try:
        c.execute(f'''DROP TABLE IF EXISTS tab_{k}_title''')
        c.execute(f'''CREATE TABLE tab_{k}_title
             (title_position INTEGER, word_position INTEGER)''')
        for ind in v:
            c.execute(f"INSERT INTO tab_{k}_title VALUES ({ind[0]}, {ind[1]})")
    except:
        pass
conn.commit()
for k, v in article_positional_index:
    if not k.isalnum():
        continue
    try:
        c.execute(f'''DROP TABLE IF EXISTS tab_{k}_article''')
        c.execute(f'''CREATE TABLE tab_{k}_article
                 (title_position INTEGER, word_position INTEGER)''')
        for ind in v:
            c.execute(f"INSERT INTO tab_{k}_article VALUES ({ind[0]}, {ind[1]})")
    except:
        pass
conn.commit()
for k, v in title_lemma_positional_index:
    if not k.isalnum():
        continue
    try:
        c.execute(f'''DROP TABLE IF EXISTS tab_{k}_lemma_title''')
        c.execute(f'''CREATE TABLE tab_{k}_lemma_title
                 (title_position INTEGER, word_position INTEGER)''')
        for ind in v:
            c.execute(f"INSERT INTO tab_{k}_lemma_title VALUES ({ind[0]}, {ind[1]})")
    except:
        pass
conn.commit()
for k, v in article_lemma_positional_index:
    if not k.isalnum():
        continue
    try:
        c.execute(f'''DROP TABLE IF EXISTS tab_{k}_lemma_article''')
        c.execute(f'''CREATE TABLE tab_{k}_lemma_article
                 (title_position INTEGER, word_position INTEGER)''')
        for ind in v:
            c.execute(f"INSERT INTO tab_{k}_lemma_article VALUES ({ind[0]}, {ind[1]})")
    except:
        pass
conn.commit()

In [None]:
conn.commit()

In [65]:
print(k)

11


In [None]:
def word_query_alt(w):
    w = w.lower()
    res = [OrderedSet(), OrderedSet(), OrderedSet(), OrderedSet()]
    try:
        for el in title_positional_index[w]:
            res[0].add(el[0])
    except:
        pass
    try:
        for el in article_positional_index[w]:
            res[1].add(el[0])
    except:
        pass
    for lemma in lemma_mapping[w]:
        try:
            for el in title_lemma_positional_index[w]:
                res[2].add(el[0])
        except:
            pass
        try:
            for el in article_lemma_positional_index[w]:
                res[3].add(el[0])
        except:
            pass
    res[1] |= res[0]
    res[2] |= res[1]
    res[3] |= res[2]

#     print(res)
    return res

In [89]:
def word_query(w):
    w = w.lower()
    res = [OrderedSet(), OrderedSet(), OrderedSet(), OrderedSet()]
    try:
        for el in c.execute(f'SELECT * FROM tab_{w}_title'):
            res[0].add(el[0])
    except:
        pass
    try:
        for el in c.execute(f'SELECT * FROM tab_{w}_article'):
            res[1].add(el[0])
    except:
        pass
    for lemma in lemma_mapping[w]:
        try:
            for el in c.execute(f'SELECT * FROM tab_{lemma}_lemma_title'):
                res[2].add(el[0])
        except:
            pass
        try:
            for el in c.execute(f'SELECT * FROM tab_{lemma}_lemma_article'):
                res[3].add(el[0])
        except:
            pass
    res[1] |= res[0]
    res[2] |= res[1]
    res[3] |= res[2]

#     print(res)
    return res

In [90]:
def phrase_word_query(w, i):
    w = w.lower()
    res = [OrderedSet(), OrderedSet(), OrderedSet(), OrderedSet()]
    try:
        for el in c.execute(f'SELECT * FROM tab_{w}_title'):
            res[0].add((el[0], el[1] - i))
    except:
        pass
    try:
        for el in c.execute(f'SELECT * FROM tab_{w}_article'):
            res[1].add((el[0], el[1] - i))
    except:
        pass
    for lemma in lemma_mapping[w]:
        try:
            for el in c.execute(f'SELECT * FROM tab_{lemma}_lemma_title'):
                res[2].add((el[0], el[1] - i))
        except:
            pass
        try:
            for el in c.execute(f'SELECT * FROM tab_{lemma}_lemma_article'):
                res[3].add((el[0], el[1] - i))
        except:
            pass
    res[1] |= res[0]
    res[2] |= res[1]
    res[3] |= res[2]
    return res

In [None]:
def phrase_word_query_alt(w, i):
    w = w.lower()
    res = [OrderedSet(), OrderedSet(), OrderedSet(), OrderedSet()]
    try:
        for el in title_positional_index[w]:
            res[0].add((el[0], el[1] - i))
    except:
        pass
    try:
        for el in article_positional_index[w]:
            res[1].add((el[0], el[1] - i))
    except:
        pass
    for lemma in lemma_mapping[w]:
        try:
            for el in title_lemma_positional_index[w]:
                res[2].add((el[0], el[1] - i))
        except:
            pass
        try:
            for el in article_lemma_positional_index[w]:
                res[3].add((el[0], el[1] - i))
        except:
            pass
    res[1] |= res[0]
    res[2] |= res[1]
    res[3] |= res[2]
    return res

In [91]:
def query(words):
    res = [OrderedSet(), OrderedSet(), OrderedSet(), OrderedSet()]
    for i, w in enumerate(words):
        w = w.lower()
        if i == 0:
            res1 = word_query(w)
            for j in range(4):
                res[j] |= res1[j]
            continue
        res1 = word_query(w)
        for j in range(4):
            res[j] &= res1[j]
    res = phrase_query(words) + res
    for i in range(1,8):
        for j in range(i):
            res[i] -= res[j]
    return res

In [92]:
def phrase_query(words):
    res = [OrderedSet(), OrderedSet(), OrderedSet(), OrderedSet()]
    for i, w in enumerate(words):
        w = w.lower()
        if i == 0:
            res1 = phrase_word_query(w, i)
            for j in range(4):
                res[j] |= res1[j]
            continue
        res1 = phrase_word_query(w, i)
        for j in range(4):
            res[j] &= res1[j]
    res1 = res
    res = [OrderedSet(), OrderedSet(), OrderedSet(), OrderedSet()]
    for i in range(4):
        for el in res1[i]:
            res[i].add(el[0])
    res[1] -= res[0]
    res[2] -= res[0] | res[1]
    res[3] -= res[0] | res[1] | res[2]
    return res

In [93]:
def to_lemmas(tokens):
    res = []
    for t in tokens:
        t = t.lower()
        res.append(t)
        res += lemma_mapping[t]
    return res

def in_lemmas(w, lemmas):
    w = w.lower()
    if w in lemmas:
        return True
    for l in lemma_mapping[w]:
        if l in lemmas:
            return True
    return False


def search_and_print(words):
    is_phrase = words[0] == "\""
    if is_phrase:
        words = words[1:-1]
    tokens = [t.lower() for t in nltk.word_tokenize(words)]
    article_sets = phrase_query(tokens) if is_phrase else query(tokens)
    lemmas = to_lemmas(tokens)
    for articles in article_sets:
        for el in articles:
            title, article = wiki_list[el]
            for w in nltk.word_tokenize(title):
                if in_lemmas(w, lemmas):
                    print(colored(w, "green"), end=" ")
                else:
                    print(w, end=" ")
            print("\n")
            for line in article:
                for w in nltk.word_tokenize(line):
                    if in_lemmas(w, lemmas):
                        print(colored(w, "green"), end=" ")
                    else:
                        print(w, end=" ")
                print("")
            print("")

In [88]:
print(phrase_query(["gdzie", "spadają", "anioły"]))

[OrderedSet([142904, 144706, 217530, 224912, 225556, 232112, 241795, 264899, 297171, 312466, 322199, 344328, 352551, 364502, 366502, 431377, 443190, 444386, 454455, 456995, 457557, 479521, 497695, 504927, 532489, 535170, 567409, 580103, 589441, 593781, 598143, 626817, 627196, 628082, 628902, 637343, 646220, 674781, 693514, 709152, 717983, 757099, 797260, 797970, 801165, 803027, 810814, 813638, 840147, 856247, 897691, 903595, 917405, 987861, 996769, 1006809, 1006825, 1043201, 1045513, 1057625, 1063380, 1074372, 1157406, 1166427]), OrderedSet(), OrderedSet(), OrderedSet()]


In [102]:
search_and_print("doroty terakowskiej")

In [106]:
phrase_word_query("doroty", 0)[1]

OrderedSet([(138801, 7), (218403, 3), (235461, 2), (236277, 3), (283884, 1), (389343, 3), (437310, 3), (440105, 3), (614850, 3), (634595, 3), (636835, 3), (640710, 7), (649709, 3), (652408, 3), (665922, 3), (668221, 3), (669730, 7), (710052, 3), (713852, 3), (716236, 3), (731481, 3), (741762, 3), (751005, 7), (760171, 3), (813581, 2), (816782, 5), (873831, 3), (874840, 3), (897133, 3), (914564, 3), (923389, 3), (928653, 3), (944005, 3), (947024, 5), (993638, 3), (997309, 3), (1001749, 3), (1005370, 3), (1049294, 3), (1077587, 3), (1080463, 3), (1092081, 3), (1170911, 3), (1172584, 3), (1172737, 4), (1195907, 3), (1205204, 8)])

In [107]:
phrase_word_query("doroty", 0)[0]

OrderedSet([(138801, 7), (218403, 3), (235461, 2), (236277, 3), (283884, 1), (389343, 3), (437310, 3), (440105, 3), (614850, 3), (634595, 3), (636835, 3), (640710, 7), (649709, 3), (652408, 3), (665922, 3), (668221, 3), (669730, 7), (710052, 3), (713852, 3), (716236, 3), (731481, 3), (741762, 3), (751005, 7), (760171, 3), (813581, 2), (816782, 5), (873831, 3), (874840, 3), (897133, 3), (914564, 3), (923389, 3), (928653, 3), (944005, 3), (947024, 5), (993638, 3), (997309, 3), (1001749, 3), (1005370, 3), (1049294, 3), (1077587, 3), (1080463, 3), (1092081, 3), (1170911, 3), (1172584, 3), (1172737, 4), (1195907, 3), (1205204, 8)])

In [None]:
c.execute(f'SELECT * FROM {"doroty"}_article')

In [110]:
article_positional_index["doroty"]

[(7392, 77),
 (12368, 112),
 (21125, 104),
 (22443, 52),
 (23289, 30),
 (24174, 85),
 (27537, 67),
 (33858, 84),
 (36959, 150),
 (37605, 48),
 (45237, 37),
 (47392, 111),
 (48213, 66),
 (51285, 24),
 (51481, 40),
 (51674, 97),
 (55761, 85),
 (55924, 59),
 (57459, 210),
 (57843, 46),
 (58951, 183),
 (59650, 63),
 (59890, 60),
 (61842, 15),
 (64400, 85),
 (64400, 113),
 (64403, 118),
 (64502, 86),
 (67888, 45),
 (68153, 71),
 (68153, 100),
 (70007, 125),
 (71169, 100),
 (76959, 70),
 (77739, 54),
 (79195, 169),
 (95885, 120),
 (116538, 36),
 (135752, 80),
 (136598, 244),
 (136604, 53),
 (136604, 82),
 (138801, 7),
 (138801, 23),
 (143464, 85),
 (149149, 48),
 (155462, 26),
 (156426, 30),
 (157819, 132),
 (158026, 29),
 (158526, 32),
 (162944, 41),
 (170888, 75),
 (171919, 42),
 (174801, 38),
 (180102, 105),
 (183428, 99),
 (184682, 141),
 (188887, 116),
 (193103, 81),
 (195890, 18),
 (195890, 72),
 (196478, 33),
 (199528, 165),
 (200879, 309),
 (203692, 41),
 (206152, 41),
 (208727, 99),