This repository has been archived by the owner on Nov 10, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 43
/
lucene_cmd.py
executable file
·173 lines (160 loc) · 5.89 KB
/
lucene_cmd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
from hoaxy.commands import HoaxyCommand
from hoaxy.database import Session
from hoaxy.database.functions import get_or_create_m
from hoaxy.database.models import MetaInfo
from hoaxy.utils.log import configure_logging
from hoaxy.ir.index import Indexer
from hoaxy.ir.search import Searcher
from schema import Schema, And, Use, Or
from schema import SchemaError
import lucene
import logging
import sys
import sqlalchemy
import pprint
# Module-level logger, named after this module, used by the Lucene command below.
logger = logging.getLogger(__name__)
class Lucene(HoaxyCommand):
    """
usage:
  hoaxy lucene --index [--mode=<mode>]
  hoaxy lucene --search --query=<q> [--top=<n>]
  hoaxy lucene -h | --help

Using Apache Lucene to build index from the parsed articles. And also
provide a simple interface to query the indexed articles.

--index             Create, append and update index.
--search            Do lucene search

Options:
  --mode=<mode>     Mode for create index, available choices are:
                    create_or_append, create, append
                    [default: create_or_append]
  --query=<q>       String to query.
  --top=<n>         Number of top results to show.
                    [default: 5]
  -h --help         Show help.

Examples:
  1. Create index of all non-index documents
    hoaxy lucene --index --mode=create_or_append

  2. If you want to replace the old indexes and create a new one:
    hoaxy lucene --index --mode=create

  3. Search top 5 most relavant article containing keywords 'trump'
    hoaxy lucene --search --query=trump
    """
    # NOTE: the class docstring above is the command's usage text (docopt
    # style), so it is runtime data rather than free-form documentation.
    # Keep the pattern lines under "usage:" and the option lines under
    # "Options:" indented.

    name = 'lucene'
    short_description = 'Lucene Indexing and Searching'

    # Validation applied to the parsed command-line arguments in `run`:
    #   --query  absent, or a non-empty value
    #   --mode   absent, or (case-insensitively) one of the three modes
    #   --top    absent, or something `int()` turns into a positive integer
    # Every other key/value pair is accepted unchanged (`object: object`).
    args_schema = Schema({
        '--query': Or(None, lambda s: len(s) > 0),
        '--mode': Or(
            None,
            And(Use(str.lower),
                lambda s: s in ('create_or_append', 'create', 'append'))),
        '--top': Or(None, And(Use(int), lambda x: x > 0)),
        object: object,
    })

    @classmethod
    def prepare_article(cls, article_data):
        """Turn one article row into the dict handed to the indexer.

        Parameters
        ----------
        article_data : sequence
            Exactly nine values, in this order: article id, group id,
            canonical URL, title, meta, content, publication date, site
            domain and site type.

        Returns
        -------
        dict
            The nine values under descriptive keys, where `meta` is
            converted with `str()`, a `None` content is replaced by the
            string 'NULL', and `uq_id_str` is `str(group_id) + title`.
        """
        (article_id, group_id, canonical_url, title, meta, content,
         date_published, domain, site_type) = article_data
        return dict(
            article_id=article_id,
            group_id=group_id,
            canonical_url=canonical_url,
            title=title,
            # Placeholder text so the content field is never None.
            content='NULL' if content is None else content,
            date_published=date_published,
            domain=domain,
            site_type=site_type,
            meta=str(meta),
            uq_id_str=str(group_id) + title,
        )

    @classmethod
    def index(cls, session, mode, articles_iter, mgid):
        """Index every article of `articles_iter` and record the progress.

        After all articles are indexed, the `html` column of the indexed
        article groups is set to NULL and `mgid.value` is set to the group
        id of the last indexed article, so that the next run can start
        after it.

        Parameters
        ----------
        session : object
            Database session used to execute the UPDATE and to commit.
        mode : str
            Index mode, passed to `Indexer` unchanged.
        articles_iter : iterable
            Rows in the nine-column layout `prepare_article` expects.
        mgid : object
            Record whose `value` attribute stores the last indexed
            article group id (as a string).
        """
        lucene.initVM()
        lucene_conf = cls.conf['lucene']
        indexer = Indexer(
            lucene_conf['index_dir'],
            mode,
            date_format=lucene_conf['date_format'])
        article_group_ids = []
        try:
            # Looked up once instead of on every iteration.
            window_size = cls.conf['window_size']
            # Count from 1 so the progress message reports how many
            # articles have actually been indexed so far.
            for n_indexed, data in enumerate(articles_iter, start=1):
                article = cls.prepare_article(data)
                indexer.index_one(article)
                article_group_ids.append(article['group_id'])
                if n_indexed % window_size == 0:
                    logger.info('Indexed %s articles', n_indexed)
        finally:
            # Close the indexer even when preparing or indexing an article
            # fails, so it is not left open.
            indexer.close()

        if article_group_ids:
            q = """UPDATE article AS a
            SET html=NULL
            FROM UNNEST(:gids) AS t(group_id)
            WHERE a.group_id=t.group_id
            """
            session.execute(sqlalchemy.text(q), {"gids": article_group_ids})
            session.commit()
            # The group id of the last article becomes the new pointer.
            mgid.value = str(article_group_ids[-1])
            session.commit()
            logger.info('Indexed article pointer updated!')
        else:
            logger.warning('No new articles are found!')
        logger.info('Done!')

    @classmethod
    def search(cls, query, n):
        """Search the configured index and pretty-print the result.

        Parameters
        ----------
        query : str
            Query string, passed to `Searcher.search` unchanged.
        n : int
            Second argument of `Searcher.search` (the `--top` option).
        """
        lucene.initVM()
        searcher = Searcher(cls.conf['lucene']['index_dir'])
        pprint.pprint(searcher.search(query, n))

    @classmethod
    def run(cls, args):
        """Entry point of the command: validate `args` and dispatch.

        Parameters
        ----------
        args : dict
            Parsed command-line arguments. Reads '--index', '--search',
            '--mode', '--query', '--top' and '--console-log-level'.

        Exits the process through `sys.exit` when validation fails, and
        with status 2 when neither '--index' nor '--search' is set.
        """
        try:
            args = cls.args_schema.validate(args)
        except SchemaError as e:
            sys.exit(e)
        session = Session()
        # make sure lucene be inited
        lucene.initVM()
        lucene.getVMEnv().attachCurrentThread()
        if args['--index'] is True:
            configure_logging(
                'lucene.index', console_level=args['--console-log-level'])
            # Record holding the group id of the last indexed article; it
            # is created with value '0' when it does not exist yet.
            mgid = get_or_create_m(
                session,
                MetaInfo,
                data=dict(
                    name='article_group_id_lucene_index',
                    value='0',
                    value_type='int',
                    description='article.group_id used for lucene index'),
                fb_uk='name')
            if args['--mode'] == 'create':
                # Rebuilding from scratch: reset the pointer so that every
                # article group is selected again.
                mgid.set_value(0)
                session.commit()
            logger.debug('Indexing started.. Getting articles..')
            # One row per article group (the one with the earliest date),
            # restricted to enabled sites and to groups after the pointer.
            q = """
            SELECT DISTINCT ON (a.group_id) a.id, a.group_id,
                a.canonical_url,
                a.title, a.meta, a.content,
                coalesce(a.date_published, a.date_captured) AS pd,
                s.domain, s.site_type
            FROM article AS a
                JOIN site AS s ON s.id=a.site_id
            WHERE a.site_id IS NOT NULL AND s.is_enabled IS TRUE
                AND a.group_id>:gid
            ORDER BY group_id, pd ASC
            """
            articles_iter = session.execute(
                sqlalchemy.text(q).bindparams(gid=mgid.get_value()))
            cls.index(session, args['--mode'], articles_iter, mgid)
        elif args['--search'] is True:
            configure_logging(
                'lucene.search', console_level=args['--console-log-level'])
            cls.search(args['--query'], args['--top'])
        else:
            print("Unrecognized command!")
            sys.exit(2)