forked from daniellan2002/Search-Engine-Project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
BooleanQuery.py
78 lines (53 loc) · 1.87 KB
/
BooleanQuery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import sys
import tokenizer
import index
def get_query_cli() -> list:
    """
    Collect the query terms passed on the command line.

    Can only take words or numbers without punctuations.

    Returns
    -------
    list
        The command-line arguments that follow the script name
        (``sys.argv[1:]``), one string per argument. Empty when no
        arguments were given.
    """
    try:
        # Slicing never raises on a short list; no arguments yields [].
        return sys.argv[1:]
    except AttributeError:
        # Only reachable if the interpreter did not populate sys.argv.
        # Return an empty query rather than falling through to an
        # unbound local, which is what the old bare `except` did.
        print("Unexpected Input: Aborting...")
        return []
def boolean_search(user_input, index_manager) -> list:
    """
    Run a boolean AND query and return the URLs of the matching documents.

    Parameters
    ----------
    user_input: str
        Raw query text; it is split into unique tokens by ``parse_input``.
    index_manager: index.IndexManager
        Object providing ``get_postings(token)`` and ``get_url(doc_id)``.

    Returns
    -------
    list
        a list of strings, each string being an URL, one per document
        that appears in the postings of every query token. Empty when the
        query yields no tokens or the tokens share no document. The order
        of the URLs is unspecified (it follows set iteration order).
    """
    # 2.Parse the string into tokens
    tokens = parse_input(user_input)
    if not tokens:
        return []

    # 3.Collect the doc ids of every token; the doc id is the first
    # element of each posting returned by the index manager.
    doc_id_sets = [
        {posting[0] for posting in index_manager.get_postings(token)}
        for token in tokens
    ]

    # Intersect from the smallest set upwards so the running result can
    # only shrink. Sort by size, not by the ids themselves.
    doc_id_sets.sort(key=len)
    intersection_ids = doc_id_sets[0]
    for doc_ids in doc_id_sets[1:]:
        if not intersection_ids:
            # Nothing left in common; further intersections cannot help.
            break
        intersection_ids = intersection_ids & doc_ids

    return [index_manager.get_url(d_id) for d_id in intersection_ids]
def parse_input(user_input: str) -> list:
    """
    Turn a raw query string into a list of its distinct tokens.

    The text is tokenized with ``tokenizer.tokenize`` and the tokens are
    passed to ``tokenizer.computeWordFrequencies``; the keys of that
    result (presumably a token -> count mapping; confirm in tokenizer)
    are returned as a list.
    """
    frequencies = tokenizer.computeWordFrequencies(
        tokenizer.tokenize(user_input)
    )
    return [*frequencies.keys()]
if __name__ == '__main__':
    # 1.Get the query as a string
    # get_query_cli() returns a list of command-line arguments, while
    # boolean_search() documents its input as a single string, so the
    # arguments are joined with spaces before searching.
    user_query = " ".join(get_query_cli())
    # Fixed sample query containing repeated words and punctuation.
    user_query_custom = "This this this string's filled with punctuations."

    print("Initializing index... ", end="")
    index_manager = index.IndexManager(root="./storage")
    print("done")

    print(boolean_search(user_query, index_manager))
    print(boolean_search(user_query_custom, index_manager))