## Queries

In [1]:
import os
import re
import matplotlib.pyplot as plt
import numpy as np
import random

### Generate simple queries

Queries of the form:
    
* $term_1 \ AND/OR \ term_2$, which runs in $O(n+m)$, where $n$ is the length of the posting list of $term_1$ and $m$ is the length of the posting list of $term_2$


* $NOT \ term$

In [2]:
# Load the vocabulary: every whitespace-separated token found in the words file.
path = 'data/words.txt'
with open(path, 'r') as f:
    text = f.read()
words = text.split()

# Collapse repeated tokens; the resulting order is whatever the set yields.
all_words = list(set(words))

In [3]:
# Remove stopwords from the vocabulary.
stopwords = 'data/stopwords'
with open(stopwords, 'r') as f:
    stop = f.read()
    stops = stop.split()

# Filter against a set: each membership test is O(1), whereas the previous
# `in` + `list.remove` pair rescanned the whole vocabulary for every stopword.
# Filtering also drops *every* occurrence of a stopword, not only the first,
# and keeps the remaining words in their existing order.
stop_set = set(stops)
all_words = [w for w in all_words if w not in stop_set]

In [4]:
# Build 1000 operand pairs [word1, word2] with word1 != word2.
# Duplicates are dropped first so that sampling two positions always yields
# two different words.
distinct_words = list(dict.fromkeys(all_words))

# Fail fast: with fewer than two distinct words no valid pair exists (the
# previous rejection loop would spin forever on a one-word vocabulary).
if len(distinct_words) < 2:
    raise ValueError("need at least two distinct words to build query pairs")

# random.sample draws without replacement, so no retry loop is required.
# Each pair is a fresh 2-element list.
pairs = [random.sample(distinct_words, 2) for _ in range(1000)]

In [5]:
# Put a randomly chosen boolean operator between the two operands of each pair.
operators = ["AND", "OR"]

# Rebuild every entry from its first and last token instead of inserting in
# place: running this cell again then replaces the operator rather than
# stacking a second one into the query (e.g. "a OR AND b").
pairs = [[p[0], random.choice(operators), p[-1]] for p in pairs]

In [6]:
# Draw 150 random operands, each prefixed with the unary NOT operator.
not_pairs = [("NOT", random.choice(all_words)) for _ in range(150)]

# Remove duplicates (the same word may have been drawn more than once);
# entries end up as tuples and fewer than 150 may remain.
not_pairs = list(set(not_pairs))

In [7]:
# Render each token sequence as a single space-separated query string.
strings = [' '.join(map(str, tokens)) for tokens in pairs]

not_strings = [' '.join(map(str, tokens)) for tokens in not_pairs]

In [8]:
# Add the NOT queries after the binary ones.
strings.extend(not_strings)

In [9]:
# Persist the simple queries, one per line.
with open('data/simple_queries.txt', 'w') as f:
    f.writelines("%s\n" % query for query in strings)

### Complex queries

In [10]:
ops = ["AND", "OR", "AND NOT", "OR NOT"]

def n_distinct_words(n, all_words, count=200):
    """Draw ``count`` random groups of ``n`` pairwise-distinct words.

    Parameters
    ----------
    n : int
        Number of distinct words in each group.
    all_words : iterable of hashable
        Vocabulary to draw from; repeated entries are ignored.
    count : int, optional
        Number of groups to generate (defaults to 200, the value that was
        previously hard-coded).

    Returns
    -------
    list of list
        ``count`` lists, each holding ``n`` different words from ``all_words``.

    Raises
    ------
    ValueError
        If ``n`` is negative or larger than the number of distinct words.
        Such requests can never be satisfied and previously caused an
        endless loop.
    """
    # De-duplicate while keeping first-seen order so that sampling n
    # positions is guaranteed to yield n different words.
    pool = list(dict.fromkeys(all_words))
    if n < 0 or n > len(pool):
        raise ValueError(
            "cannot draw %d distinct words from a vocabulary of %d distinct words"
            % (n, len(pool))
        )
    # random.sample draws without replacement, so no retry loop is needed.
    return [random.sample(pool, n) for _ in range(count)]

1. Queries of the form $a \ op_1 \ (b \ op_2 \ c)$

In [11]:
words_3 = n_distinct_words(3, all_words)

# Token lists shaped  a op1 ( b op2 c ) ; the operators are drawn left to right.
queries_1 = [
    [a, random.choice(ops), "(", b, random.choice(ops), c, ")"]
    for a, b, c in words_3
]

# Space-joined textual form of every query.
form_1 = [' '.join(map(str, tokens)) for tokens in queries_1]

2. Queries of the form $NOT \ a \ op_1 \ (b \ op_2 \ c)$

In [12]:
# Token lists shaped  NOT a op1 ( b op2 c ) , built from the same word triples
# as the first form; the operators are drawn left to right.
queries_2 = [
    ["NOT", a, random.choice(ops), "(", b, random.choice(ops), c, ")"]
    for a, b, c in words_3
]

# Space-joined textual form of every query.
form_2 = [' '.join(map(str, tokens)) for tokens in queries_2]

3. Queries of the form $(a \ op_1 \ b) \ op_2 \ (c \ op_3 \ d)$

In [13]:
words_4 = n_distinct_words(4, all_words)

# Token lists shaped  ( a op1 b ) op2 ( c op3 d ) ; operators drawn left to right.
queries_3 = [
    [
        "(", a, random.choice(ops), b, ")",
        random.choice(ops),
        "(", c, random.choice(ops), d, ")",
    ]
    for a, b, c, d in words_4
]

# Space-joined textual form of every query.
form_3 = [' '.join(map(str, tokens)) for tokens in queries_3]

4. Queries of the form $a \ op_1 \ ( b \ op_2 \ (c \ op_3 \ d))$

In [14]:
# Token lists shaped  a op1 ( b op2 ( c op3 d ) ) , built from the same word
# quadruples as the third form; operators drawn left to right.
queries_4 = [
    [
        a, random.choice(ops),
        "(", b, random.choice(ops),
        "(", c, random.choice(ops), d, ")",
        ")",
    ]
    for a, b, c, d in words_4
]

# Space-joined textual form of every query.
form_4 = [' '.join(map(str, tokens)) for tokens in queries_4]

5. Queries of the form $(a \ op_1 \ (b \ op_2 \ c)) \ op_3 \ ((d \ op_4 \ e) \ op_5 \ f)$

In [15]:
words_6 = n_distinct_words(6, all_words)

# Token lists shaped  ( a op1 ( b op2 c ) ) op3 ( ( d op4 e ) op5 f ) ;
# operators drawn left to right.
queries_5 = [
    [
        "(", a, random.choice(ops), "(", b, random.choice(ops), c, ")", ")",
        random.choice(ops),
        "(", "(", d, random.choice(ops), e, ")", random.choice(ops), g, ")",
    ]
    for a, b, c, d, e, g in words_6
]

# Space-joined textual form of every query.
form_5 = [' '.join(map(str, tokens)) for tokens in queries_5]

In [16]:
# Pool the five query shapes into one list, then mix them in place.
all_queries = [*form_1, *form_2, *form_3, *form_4, *form_5]
random.shuffle(all_queries)

In [17]:
# Persist the shuffled complex queries, one per line.
with open('data/complex_queries.txt', 'w') as f:
    f.writelines("%s\n" % query for query in all_queries)