/
searchText_for_nodejs.py
82 lines (67 loc) · 2.75 KB
/
searchText_for_nodejs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# -*- coding: UTF-8 -*-
from whoosh.filedb.filestore import FileStorage
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.qparser import MultifieldParser
import whoosh.qparser as qparser
import chinese
import os, glob, codecs, sys
analyzer = chinese.ChineseAnalyzer()
schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
sub_title=TEXT(stored=True, analyzer=analyzer),
author=TEXT(stored=True, analyzer=analyzer),
content=TEXT(stored=True, analyzer=analyzer))
storage = FileStorage("indexdir")
ix = storage.open_index()
writer = ix.writer()
_string = sys.argv[1]
_mode = sys.argv[2]
normal = (_mode == "normal")
_distance = 0
if(normal is False):
_distance = int(sys.argv[3])
with ix.searcher() as searcher:
# og = qparser.OrGroup.factory(0.9)
parser = MultifieldParser(["title", "sub_title", "author", "content"], schema=ix.schema)
# parser = qparser.QueryParser("content", ix.schema)
parser.remove_plugin_class(qparser.PhrasePlugin)
parser.add_plugin(qparser.SequencePlugin())
if (normal):
string = _string
query = parser.parse(string)
else:
# proximity
distance = _distance
proximty_query = "\"" + _string + "\"" + '~' + str((1+distance)*3)
query = parser.parse(proximty_query)
# sys.stdout.buffer.write(query)
sys.stdout.buffer.write(">>>>>>OUTPUT start<<<<<<".encode('utf-8'))
results = searcher.search(query, limit=20)
results.fragmenter.maxchars = 100
# Show more context before and after
results.fragmenter.surround = 40
for result in results:
sys.stdout.buffer.write(">>>>>> data start:".encode('utf-8'))
sys.stdout.buffer.write(">>>>>> title start:".encode('utf-8'))
txt = result["title"].encode('utf-8')
sys.stdout.buffer.write(txt)
sys.stdout.buffer.write(">>>>>> title end:".encode('utf-8'))
sys.stdout.buffer.write(">>>>>> sub_title start:".encode('utf-8'))
txt = result["sub_title"].encode('utf-8')
sys.stdout.buffer.write(txt)
sys.stdout.buffer.write(">>>>>> sub_title end:".encode('utf-8'))
sys.stdout.buffer.write(">>>>>> author start:".encode('utf-8'))
txt = result["author"].encode('utf-8')
sys.stdout.buffer.write(txt)
sys.stdout.buffer.write(">>>>>> author end:".encode('utf-8'))
sys.stdout.buffer.write(">>>>>> content start:".encode('utf-8'))
txt = result["content"].encode('utf-8')
sys.stdout.buffer.write(txt)
sys.stdout.buffer.write(">>>>>> content end:".encode('utf-8'))
sys.stdout.buffer.write(">>>>>> highlights start:".encode('utf-8'))
txt = result.highlights("content").encode('utf-8')
sys.stdout.buffer.write(txt)
sys.stdout.buffer.write(">>>>>> highlights end:".encode('utf-8'))
sys.stdout.buffer.write(">>>>>> data end:".encode('utf-8'))
sys.stdout.buffer.write(">>>>>>OUTPUT end<<<<<<".encode('utf-8'))