# Creation


In [1]:
documentDir = "test"

### 1. Tokenization

In [2]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from collections import defaultdict, Counter
from string import punctuation
import os

tokenizer = TweetTokenizer()

def tokenize_document(content):
    
    sentences = sent_tokenize(content)
    tokens = []
    for _sent in sentences:
        sent_tokens = tokenizer.tokenize(_sent)
        sent_tokens = [_tok.lower() for _tok in sent_tokens if _tok not in punctuation]
        tokens += sent_tokens
    
    return tokens

def tokenize_document_with_sentences(content):

    sentences = sent_tokenize(content)
    tokens = []
    for _sent in sentences:
        sent_tokens = tokenizer.tokenize(_sent)
        sent_tokens = [_tok.lower() for _tok in sent_tokens if _tok not in punctuation.replace(".","")]
        tokens += sent_tokens
    
    return tokens

In [3]:
print(punctuation.replace(".",""))
print(punctuation)

!"#$%&'()*+,-/:;<=>?@[\]^_`{|}~
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


### 2. Linguistic modules


Here there are no linguistic modules yet
Possible are:
- stemmer
- lemmatizer 

In [4]:
def preprocess_document(content):
    return tokenize_document_with_sentences(content)

In [5]:
def prepare_dataset(documents_dir):

    tokenized_documents = []
    for document in os.listdir(documents_dir):
        with open(os.path.join(documents_dir, document), errors='ignore',encoding='utf8') as outf:
            tokenized_documents.append(preprocess_document(outf.read()))
    print("Found documents: ", len(tokenized_documents))
    
    return tokenized_documents 

In [6]:

tokenized_documents = prepare_dataset(documentDir)
print(tokenized_documents)

Found documents:  3
[['.', 'one', '.', 'one'], ['two', 'one', 'two', 'two'], ['two', '.', 'tree']]


### 3. Indexer

In [7]:
from os import scandir # can be used for easier iteration of documents in a folder
# can check is_file() on the objects returned by scan_dir 
# contain whole document path, so no need to join with the directory

def get_document_tokens(document_tokens,doc_id):
    #res = [(token, i) for token in document_tokens]
    res = []
    sentence_id = 0
    for position,token in enumerate(document_tokens):
        if token == ".":
            sentence_id=sentence_id+1
        else:
            res.append(((token,position,sentence_id),doc_id))

   # res=[((token,i),doc_id) for i,token in enumerate(document_tokens)]
    return res

def get_token_doc_id_pairs(category_dir):

    token_docid = []
    doc_ids = {}

    for i, document in enumerate(scandir(category_dir)):
        if document.is_file():
            doc_ids[i] = document.name
            with open(document,encoding='utf8') as out_fp:
                document_tokens = preprocess_document(out_fp.read())
                token_docid += get_document_tokens(document_tokens,i)
    return token_docid, doc_ids

In [8]:
token_docid, doc_ids = get_token_doc_id_pairs(documentDir)
print(doc_ids)
token_docid

{0: 'one.txt', 1: 'one_two.txt', 2: 'two_tree.txt'}


[(('one', 1, 1), 0),
 (('one', 3, 2), 0),
 (('two', 0, 0), 1),
 (('one', 1, 0), 1),
 (('two', 2, 0), 1),
 (('two', 3, 0), 1),
 (('two', 0, 0), 2),
 (('tree', 2, 1), 2)]

Sort by tokens to form the dictionary?

In [9]:
from operator import itemgetter
sorted_token_docid = sorted(token_docid, key=lambda el: el[0][0])
sorted_token_docid

[(('one', 1, 1), 0),
 (('one', 3, 2), 0),
 (('one', 1, 0), 1),
 (('tree', 2, 1), 2),
 (('two', 0, 0), 1),
 (('two', 2, 0), 1),
 (('two', 3, 0), 1),
 (('two', 0, 0), 2)]

In [10]:
def merge_token_in_doc(sorted_token_docid):
    """
    Returns a list of (token, doc_id, term_freq) tuples from a sorted list of (token, doc_id) list, 
    where if a token appears n times in a doc_id, we merge it in a tuple (toke, doc_id, n).
    """
    merged_tokens_in_doc = []
    for combined_token, doc_id in sorted_token_docid:
        (token,position,sentence) = combined_token
        if merged_tokens_in_doc:
            prev_tok, prev_doc_id, prev_freq,prev_positions,prev_sentences = merged_tokens_in_doc[-1]
            if prev_tok == token and prev_doc_id == doc_id:     
                merged_tokens_in_doc[-1] = (token, doc_id, prev_freq+1,prev_positions + [position],prev_sentences+[sentence])
            else:
                merged_tokens_in_doc.append((token, doc_id, 1,[position],[sentence]))
        else:
            merged_tokens_in_doc.append((token, doc_id, 1,[position],[sentence]))
    return merged_tokens_in_doc

In [11]:
merged_tokens_in_doc = merge_token_in_doc(sorted_token_docid)
merged_tokens_in_doc

[('one', 0, 2, [1, 3], [1, 2]),
 ('one', 1, 1, [1], [0]),
 ('tree', 2, 1, [2], [1]),
 ('two', 1, 3, [0, 2, 3], [0, 0, 0]),
 ('two', 2, 1, [0], [0])]

In [12]:
from collections import defaultdict
dictionary = defaultdict(lambda: (0, 0)) # term : doc_freq, tot freq
postings = defaultdict(lambda: []) # term: doc_ids, doc_freq

for token, doc_id, doc_freq,positions,sentences in merged_tokens_in_doc:
    dictionary[token] = (dictionary[token][0]+1, dictionary[token][1
    ]+doc_freq)

# usually implemented as linked lists
for token, doc_id, doc_freq,positions,sentences in merged_tokens_in_doc:
    postings[token].append((doc_id, doc_freq,positions,sentences)) 

In [13]:
doc_ids

{0: 'one.txt', 1: 'one_two.txt', 2: 'two_tree.txt'}

In [14]:
dictionary["one"],dictionary['two'],dictionary['tree'],dictionary['zero']

((2, 3), (2, 4), (1, 1), (0, 0))

In [15]:
print(postings["one"])
print(postings['two'])
print(postings['tree'])
print(postings['zero'])

[(0, 2, [1, 3], [1, 2]), (1, 1, [1], [0])]
[(1, 3, [0, 2, 3], [0, 0, 0]), (2, 1, [0], [0])]
[(2, 1, [2], [1])]
[]


# Search

### 1. Opearations functions

In [16]:
def and_query(postings_word1, postings_word2):
 
    documents_results = []
    
    postings_ind1, postings_ind2 = 0, 0
    while postings_ind1 < len(postings_word1) and postings_ind2 < len(postings_word2):
        doc_id1, doc_id2 = postings_word1[postings_ind1][0], postings_word2[postings_ind2][0]
        if doc_id1 == doc_id2:
            documents_results.append((doc_id1,0))
            postings_ind1 += 1
            postings_ind2 += 1
        elif doc_id1 < doc_id2:
            postings_ind1 += 1
        elif doc_id1 > doc_id2:
            postings_ind2 += 1
    return documents_results

def and_multipar(self,lists) -> list:
    prev = lists[0]
    for i in range(0,len(lists)-1):
        prev = and_query(prev,lists[i+1])
    return prev

In [17]:
def or_query(postings_word1, postings_word2):

    documents_results = []
    
    postings_ind1, postings_ind2 = 0, 0
    while postings_ind1 < len(postings_word1) and postings_ind2 < len(postings_word2):
        doc_id1, doc_id2 = postings_word1[postings_ind1][0], postings_word2[postings_ind2][0]
        if doc_id1 == doc_id2:
            documents_results.append((doc_id1,0))
            postings_ind1 += 1
            postings_ind2 += 1
        elif doc_id1 < doc_id2:
            documents_results.append((doc_id1,0))
            postings_ind1 += 1
        elif doc_id1 > doc_id2:
            documents_results.append((doc_id2,0))
            postings_ind2 += 1
    if postings_ind1 == len(postings_word1):
        for i in range(postings_ind2,len(postings_word2)):
            documents_results.append((postings_word2[i][0],0))
    if postings_ind2 == len(postings_word2):
        for i in range(postings_ind1,len(postings_word1)):
            documents_results.append((postings_word1[i][0],0))
    return documents_results

def or_multipar(self,lists) -> list:
    prev = lists[0]
    for i in range(0,len(lists)-1):
        prev = or_query(prev,lists[i+1])
    return prev

In [18]:
def not_query(postings_word):
    document_count = len(tokenized_documents)
    documents_results = []

    prev = 0
    for i in range(0,len(postings_word)):
        for not_doc in range(prev,postings_word[i][0]):
            documents_results.append((not_doc,0))
        prev = postings_word[i][0]+1
    for not_doc in range(prev,document_count):
        documents_results.append((not_doc,0))
    
    return documents_results
        

In [19]:
print(not_query(postings["one"]))
print(not_query(postings["two"]))
print(not_query(postings["tree"]))

[(2, 0)]
[(0, 0)]
[(0, 0), (1, 0)]


### 2. Parsing classes

In [20]:

from typing import Callable, Iterable

class BoolRetrievalOperand:
    def __init__(self, t):
        self.label = t[0]
        print("Creating BoolRetrievalOperand" + str(t))
    
    def process(self) -> list:
        print("Processing BoolRetrievalOperand "+ self.label)
        self.value = postings[self.label]
        print(self.value)
        return self.value

    def __str__(self) -> str:
        return self.label

    __repr__ = __str__


In [21]:
class BoolRetrievalNot:
    def __init__(self, t):
        print("Creating BoolRetrievalNot"+str(t))
        self.arg = t[0][1]
        print(self.arg)


    def process(self) -> list:
        res = not_query(self.arg.process())
        print("Processing "+str(self))
        print(res)
        return res

    def __str__(self) -> str:
        return "~" + str(self.arg)

    __repr__ = __str__


In [22]:
class BoolRetrievalBinOp:
    repr_symbol: str = ""
    eval_fn: Callable[
        [list[list]], list
    ] = lambda _: []

    def __init__(self, t):
        print("Creating BoolRetrievalBinOp "+self.repr_symbol)
        print(t)
        self.args = t[0][0::2]

    def __str__(self) -> str:
        sep = " %s " % self.repr_symbol
        return "(" + sep.join(map(str, self.args)) + ")"


    def process(self) -> list:
        res = self.eval_fn([a.process() for a in self.args])
        print("Processing "+str(self))
        print(res)
        return res

    __repr__ = __str__


class BoolRetrievalAnd(BoolRetrievalBinOp):
    repr_symbol = "&"
    eval_fn = and_multipar


class BoolRetrievalOr(BoolRetrievalBinOp):
    repr_symbol = "|"
    eval_fn = or_multipar


### 3. Parsing grammar

In [23]:

from pyparsing import infixNotation, opAssoc, Keyword, Word, alphas, ParserElement,pyparsing_unicode, nums

NOT = Keyword("not")
AND = Keyword("and")
OR = Keyword("or")
token = Word(pyparsing_unicode.printables,exclude_chars=punctuation)
token.setParseAction(BoolRetrievalOperand).setName("token")

boolOperand = token
boolOperand.setName("bool_operand")

# define expression, based on expression operand and
# list of operations in precedence order
boolExpr = infixNotation(
    boolOperand,
    [
        (NOT, 1, opAssoc.RIGHT,BoolRetrievalNot),
        (AND, 2, opAssoc.LEFT,BoolRetrievalAnd),
        (OR, 2, opAssoc.LEFT,BoolRetrievalOr),
    ],
).setName("boolean_expression")


### 4. Testing

In [24]:
tests = [
    ("not not one",True),
    ("one", True),
    ("tree", True),
    ("one and tree",True),
    ("one and two",True),
    ("two and two",True),
    ("two and (two and one)",True),
    ("one and tree and two",True),
    ("one or tree",True),
    ("(one or tree) and two",True),
    ("not ((one or tree) and two)",True),
]


for test_string, expected in tests:
    res = boolExpr.parseString(test_string)[0]
    success = "test"#"PASS" if bool(res) == expected else "FAIL"
    print("Query: "+test_string, "\n", res, "=", str(res.process()), "\n", success, "\n")

Creating BoolRetrievalOperand['one']
Creating BoolRetrievalNot[['not', one]]
one
Creating BoolRetrievalNot[['not', ~one]]
~one
Processing BoolRetrievalOperand one
[(0, 2, [1, 3], [1, 2]), (1, 1, [1], [0])]
Processing ~one
[(2, 0)]
Processing ~~one
[(0, 0), (1, 0)]
Query: not not one 
 ~~one = [(0, 0), (1, 0)] 
 test 

Creating BoolRetrievalOperand['one']
Processing BoolRetrievalOperand one
[(0, 2, [1, 3], [1, 2]), (1, 1, [1], [0])]
Query: one 
 one = [(0, 2, [1, 3], [1, 2]), (1, 1, [1], [0])] 
 test 

Creating BoolRetrievalOperand['tree']
Processing BoolRetrievalOperand tree
[(2, 1, [2], [1])]
Query: tree 
 tree = [(2, 1, [2], [1])] 
 test 

Creating BoolRetrievalOperand['one']
Creating BoolRetrievalOperand['tree']
Creating BoolRetrievalBinOp &
[[one, 'and', tree]]
Processing BoolRetrievalOperand one
[(0, 2, [1, 3], [1, 2]), (1, 1, [1], [0])]
Processing BoolRetrievalOperand tree
[(2, 1, [2], [1])]
Processing (one & tree)
[]
Query: one and tree 
 (one & tree) = [] 
 test 

Creating Bool

In [25]:
def get_wildcard_words(word):
    return ["one","two"]

In [26]:
def woldcard_process(word):
    wildcard_words = get_wildcard_words(word)
    print(wildcard_words[0])
    prew = postings[wildcard_words[0]]
    for i in range(1,len(wildcard_words)):
        prew = or_query(prew,postings[wildcard_words[i]])
    return prew

In [27]:

class BoolRetrievalWildcard:
    def __init__(self, t):
        self.label = t[0]
        print("Creating BoolRetrievalWildcard" + str(t))
    
    def process(self) -> list:
        print("Processing BoolRetrievalWildcard " + str(self))
        self.value = woldcard_process(self.label)
        print(self.value)
        return [self.value]

    def __str__(self) -> str:
        return self.label+"*"

    __repr__ = __str__

class BoolRetrievalProximity:
    def __init__(self, t):
        self.A = t[0]
        self.positions = t[2]
        self.B = t[3]
        print("Creating BoolRetrievalProximity" + str(t))
    
    def process(self) -> list:
        print("Processing BoolRetrievalProximity " + self.A + " \\" + self.positions + " " + self.B)
       # self.value = postings[ self.label]
      #  print(self.value)
        return []

    def __str__(self) -> str:
        return "" + self.A + " /" + self.positions + " " + self.B

    __repr__ = __str__

def process_sentence(words):
    common_docs = and_multipar("",words)
    print(common_docs)
    return common_docs

class BoolRetrievalSentence:
    def __init__(self, t):
        self.words = t[1:]
        print("Creating BoolRetrievalSentence" + str(t))
    
    def process(self) -> list:
        print("Processing BoolRetrievalSentence " + str(self.words))
       # self.value = postings[ self.label]

        print(self.words)
        return process_sentence(self.words)

    def __str__(self) -> str:
        return "/s r"+str(self.words)

    __repr__ = __str__


token = Word(alphas)
token.setParseAction(BoolRetrievalOperand).setName("token")

wildcard =  Word(alphas) + "*"
wildcard.setParseAction(BoolRetrievalWildcard).setName("wildcard")

#   Word(alphas) + ("/" + Word(nums) + Word(alphas))[1,...]
proximity = Word(alphas) + "/" + Word(nums) + Word(alphas)
proximity.setParseAction(BoolRetrievalProximity).setName("proximity")

sentence = "/s" + Word(alphas)[1,...]
sentence.setParseAction(BoolRetrievalSentence).setName("proximity")

boolOperand = sentence | proximity | wildcard | token

# define expression, based on expression operand and
# list of operations in precedence order
boolExpr = infixNotation(
    boolOperand,
    [
        (NOT, 1, opAssoc.RIGHT,BoolRetrievalNot),
        (AND, 2, opAssoc.LEFT,BoolRetrievalAnd),
        (OR, 2, opAssoc.LEFT,BoolRetrievalOr),
    ],
)

tests = [
    ("asdasda*",True),
    ("one", True),
    ("tree", True),
    ("one /4 tree", True),
    ("one /4 tree /5 five", True),
    ("/s one tree five six", True),
   # ("one or tree*",True),
    ("(one* and (/s one two)) or (will /3 be)",True),
]


for test_string, expected in tests:
    res = boolExpr.parseString(test_string)[0]
    success = "test"#"PASS" if bool(res) == expected else "FAIL"
    print("Query: "+test_string, "\n", res, "=", str(res.process()), "\n", success, "\n")


Creating BoolRetrievalWildcard['asdasda', '*']
Processing BoolRetrievalWildcard asdasda*
one
[(0, 0), (1, 0), (2, 0)]
Query: asdasda* 
 asdasda* = [[(0, 0), (1, 0), (2, 0)]] 
 test 

Creating BoolRetrievalOperand['one']
Processing BoolRetrievalOperand one
[(0, 2, [1, 3], [1, 2]), (1, 1, [1], [0])]
Query: one 
 one = [(0, 2, [1, 3], [1, 2]), (1, 1, [1], [0])] 
 test 

Creating BoolRetrievalOperand['tree']
Processing BoolRetrievalOperand tree
[(2, 1, [2], [1])]
Query: tree 
 tree = [(2, 1, [2], [1])] 
 test 

Creating BoolRetrievalProximity['one', '/', '4', 'tree']
Processing BoolRetrievalProximity one \4 tree
Query: one /4 tree 
 one /4 tree = [] 
 test 

Creating BoolRetrievalProximity['one', '/', '4', 'tree']
Processing BoolRetrievalProximity one \4 tree
Query: one /4 tree /5 five 
 one /4 tree = [] 
 test 

Creating BoolRetrievalSentence['/s', 'one', 'tree', 'five', 'six']
Processing BoolRetrievalSentence ['one', 'tree', 'five', 'six']
['one', 'tree', 'five', 'six']
[]
Query: /s one 