forked from jpatokal/mediawiki-gateway
/
gateway.rb
307 lines (249 loc) · 10.9 KB
/
gateway.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
require 'logger'
require 'rest_client'
require 'rexml/document'
require 'uri'
module MediaWiki
class Gateway
# Product token that #initialize appends to every User-Agent header it builds.
# Inside the class body +self+ is the Gateway class, so its name is
# interpolated here. VERSION is not defined in this file -- presumably it is
# MediaWiki::VERSION from elsewhere in the gem (TODO confirm).
USER_AGENT = "#{self}/#{VERSION}"
# Names recognised as token types / token-requiring actions:
# * #get_token sends the 'type' parameter only for names in this list;
# * #make_api_request automatically attaches a token to requests whose
#   'action' is in this list.
# Frozen so the shared list cannot be modified by accident at runtime.
META_TOKEN_TYPES = %w[
  createaccount
  csrf
  deleteglobalaccount
  login
  patrol
  rollback
  setglobalaccountstatus
  userrights
  watch
].freeze
# Class-level setting: #initialize reads +default_user_agent+ as the default
# value of the :user_agent option.
class << self
  attr_reader :default_user_agent
  attr_writer :default_user_agent
end
# Create a MediaWiki::Gateway bound to one MediaWiki installation.
#
# [url] Path to API of target MediaWiki (eg. 'http://en.wikipedia.org/w/api.php')
# [options] Hash of options, merged over the defaults listed below
# [http_options] Hash of options for RestClient::Request (via http_send)
#
# Options:
# [:bot] When set to true, executes API queries with the bot parameter (see http://www.mediawiki.org/wiki/API:Edit#Parameters). Defaults to false.
# [:ignorewarnings] Log API warnings and invalid page titles instead of raising MediaWiki::APIError.
# [:limit] Maximum number of results returned per search (see http://www.mediawiki.org/wiki/API:Query_-_Lists#Limits), defaults to the MediaWiki default of 500.
# [:logdevice] Log device to use. Defaults to STDERR.
# [:loglevel] Log level to use, defaults to Logger::WARN. Set to Logger::DEBUG to dump every request and response to the log.
# [:max_results] Defaults to 500. Not read anywhere in this file -- presumably consumed by the gateway/* extensions (TODO confirm).
# [:maxlag] Maximum allowed server lag (see http://www.mediawiki.org/wiki/Manual:Maxlag_parameter), defaults to 5 seconds.
# [:retry_count] Number of times to try before giving up when a request has to be retried, defaults to 3 (original request plus two retries).
# [:retry_delay] Minimum number of seconds to wait before a retry, defaults to 10 seconds.
# [:user_agent] Prefix for the User-Agent header, defaults to ::default_user_agent or nil. The header sent is this value (when present) followed by USER_AGENT.
def initialize(url, options = {}, http_options = {})
  defaults = {
    bot: false,
    limit: 500,
    logdevice: STDERR,
    loglevel: Logger::WARN,
    max_results: 500,
    maxlag: 5,
    retry_count: 3,
    retry_delay: 10,
    user_agent: self.class.default_user_agent
  }
  @options = defaults.merge(options)

  @log = Logger.new(@options[:logdevice])
  @log.level = @options[:loglevel]

  @http_options = http_options
  @wiki_url = url
  @cookies = {}

  # A nil :user_agent is dropped, leaving just the library's own token.
  agent = [@options[:user_agent], USER_AGENT].compact.join(' ')
  @headers = {
    'User-Agent' => agent,
    'Accept-Encoding' => 'gzip'
  }
end
# Read-only access to the logger, the API URL, the cookie hash and the
# default request headers, all of which are set up in #initialize.
attr_reader :log, :wiki_url, :cookies, :headers
# Send a single request to the API and hand back only the parsed document.
#
# [form_data] hash of attributes to post
# [continue_xpath] XPath selector for query continue parameter
#
# Returns XML document (the first element of what make_api_request returns;
# the continuation value is discarded).
def send_request(form_data, continue_xpath = nil)
  doc_and_continue = make_api_request(form_data, continue_xpath)
  doc_and_continue.first
end
# Fetch a token through action=query&meta=tokens.
#
# [type] Token type. Types listed in META_TOKEN_TYPES are requested explicitly through the 'type' parameter; for any other type (e.g. 'edit', 'delete', 'move') no 'type' is sent and whichever token the wiki returns by default is used.
# [page_titles] Unused; kept so existing callers that pass it keep working.
#
# Returns the token string.
#
# Raises Unauthorized if the response contains no usable token.
def get_token(type, page_titles = nil)
  params = { 'action' => 'query', 'meta' => 'tokens' }
  params['type'] = type if META_TOKEN_TYPES.include?(type)

  # Tolerate a response without a <tokens> element instead of crashing with
  # NoMethodError; it is reported as Unauthorized below.
  tokens_element = send_request(params).elements['query/tokens']
  tokens = tokens_element ? tokens_element.attributes : {}

  # The API names each attribute "<type>token" (e.g. 'watchtoken',
  # 'patroltoken'), so look for the requested type first and then fall back
  # to the attribute names that were checked historically.
  token = tokens["#{type}token"] ||
          tokens['logintoken'] ||
          tokens['csrftoken'] ||
          tokens['token']

  raise Unauthorized, "User is not permitted to perform this operation: #{type}" unless token

  token
end
private
# Run a list query, following continuation until the API stops returning one.
#
# [list] list name to query
# [res_xpath] XPath selector for results; taken relative to //query/<list>/ unless it starts with '/'
# [attr] attribute name to extract, if any
# [param] parameter name to continue query; its first two characters are used as the prefix of the 'from'/'continue' attributes searched for in query-continue
# [options] additional query options (not modified; a merged copy is sent)
#
# Yields each attribute value, or, if +attr+ is nil, each REXML::Element.
#
# Returns an array of everything that would have been yielded when no block
# is given, otherwise nil.
def iterate_query(list, res_xpath, attr, param, options, &block)
  collected = nil
  if block.nil?
    collected = []
    block = lambda { |item| collected << item }
  end

  prefix = param[0, 2]
  continue_tests = ["name()='#{prefix}from'", "name()='#{prefix}continue'"]
  req_xpath = "//query-continue/#{list}/@*[#{continue_tests.join(' or ')}]"
  res_xpath = "//query/#{list}/#{res_xpath}" unless res_xpath.start_with?('/')

  query = options.merge('action' => 'query', 'list' => list)

  loop do
    doc, continue = make_api_request(query, req_xpath)

    REXML::XPath.match(doc, res_xpath).each do |element|
      block.call(attr ? element.attributes[attr] : element)
    end

    break unless continue

    # Feed the continuation value back in for the next page.
    query[param] = continue
  end

  collected
end
# Make generic request to API, retrying on HTTP 503 and on maxlag errors.
#
# [form_data] hash of attributes to post; modified in place ('format' and 'maxlag' are set, plus 'token' or 'lgtoken' when the action is listed in META_TOKEN_TYPES)
# [continue_xpath] XPath selector for query continue parameter
# [retry_count] Counter for retries; 1 on the first attempt, incremented by each recursive retry
#
# Returns array of XML document and query continue parameter. The second
# element is nil when +continue_xpath+ is nil or the document has no
# 'query-continue' element.
#
# Raises MediaWiki::Exception when +retry_count+ exceeds
# @options[:retry_count], or when the HTTP status is neither 503 nor 2xx.
# Anything raised by check_response propagates to the caller.
def make_api_request(form_data, continue_xpath = nil, retry_count = 1)
# If this is our first try, then reset our warnings
@warnings = [] if retry_count == 1
if retry_count > @options[:retry_count]
raise MediaWiki::Exception.new("Retries exceeded: Terminating after #{retry_count - 1} retries")
end
# NOTE: Hash#update changes the caller's hash in place.
form_data.update('format' => 'xml', 'maxlag' => @options[:maxlag])
# some actions require a second request with a token received on the first request
if META_TOKEN_TYPES.include?(action = form_data['action'])
if action == 'login'
form_data['lgtoken'] = get_token('login')
else
form_data['token'] = get_token(action)
end
end
# http_send invokes this block with block.call, i.e. as a plain (non-lambda)
# proc, so every `return` below returns from make_api_request itself.
http_send(@wiki_url, form_data, @headers.merge(cookies: @cookies)) do |response|
if response.code == 503
# Wait at least the configured :retry_delay; a larger Retry-After header
# value takes precedence.
retry_delay = @options[:retry_delay]
if response.headers.has_key?(:retry_after)
retry_delay = [@options[:retry_delay], response.headers[:retry_after].to_i].max
end
# If it's a maxlag error, parse the maxlag message to get the
# maxlag, since on Wikipedia they don't pass the maxlag through the
# header.
match = response.body.match /Retry in (\d+) seconds/
if match
retry_delay = [retry_delay, match[1].to_i].max
end
warning = "503 Service Unavailable: #{response.body}. Retry in #{retry_delay} seconds."
warning += " headers.Retry-After=#{response.headers[:retry_after]}" if response.headers.has_key?(:retry_after)
@warnings.push(warning)
log.warn(warning)
sleep(retry_delay)
# Retry the same request; the retry limit is enforced at the top of the method.
return make_api_request(form_data, continue_xpath, retry_count + 1)
end
# Check response for errors and return XML
unless response.code >= 200 && response.code < 300
raise MediaWiki::Exception.new("Bad response: #{response}")
end
# Parse the XML (raises exception if invalid XML)
# From here on +response+ is the MediaWiki::Response wrapper.
response = MediaWiki::Response.new(response)
# Note: Although the documentation suggests that a maxlag error should
# return HTTP 503, in practice some wikis (such as en.wikipedia.org)
# actually return HTTP 200 with an XML (as opposed to text/plain)
# response.
if response.has_error? && response.error_code == 'maxlag'
retry_delay = @options[:retry_delay]
# For Wikipedia, it seems like this header value is always 5, so
# it's not useful.
if response.headers.has_key?(:retry_after)
retry_delay = [retry_delay, response.headers[:retry_after].to_i].max
end
# Parse the maxlag message to get the maxlag, since on Wikipedia
# they don't pass the maxlag through the header.
match = response.error_info.match /(\d+).*? seconds lagged/
if match
retry_delay = [retry_delay, match[1].to_i].max
end
warning = "maxlag exceeded: #{response.body}. Retry in #{retry_delay} seconds."
warning += " headers.Retry-After=#{response.headers[:retry_after]}" if response.headers.has_key?(:retry_after)
@warnings.push(warning)
# Logged at debug level, unlike the 503 branch above which logs at warn.
log.debug(warning)
sleep(retry_delay)
return make_api_request(form_data, continue_xpath, retry_count + 1)
end
# Handle any errors or warnings that were included in the response
check_response(response)
doc = response.doc
log.debug("RES: #{doc}")
# Remember cookies from this response; they are sent with later requests.
@cookies.update(response.cookies)
# The continuation value is looked up only when the caller supplied an XPath
# and the document actually has a 'query-continue' element.
return [doc, (continue_xpath && doc.elements['query-continue']) ?
REXML::XPath.first(doc, continue_xpath) : nil]
end
end
# Execute the HTTP request using either GET or POST as appropriate.
#
# [url] URL the request is sent to
# [form_data] Hash of API parameters. Sent via GET (as headers[:params]) when 'action' is 'query', otherwise as the POST payload.
# [headers] Hash of request headers; not modified, a copy is sent
#
# Options from @http_options are passed through to RestClient::Request.
# Certificate verification is off by default (as before), but an explicit
# :verify_ssl given in the http_options is now respected.
# NOTE(review): consider making verification the default in a future release.
#
# @yieldparam response the RestClient response (after following a GET redirect, if any)
#
# Returns the value of the given block.
def http_send(url, form_data, headers = {}, &block)
  # See https://github.com/rest-client/rest-client/issues/722
  headers = headers.merge('Accept-Encoding' => '')

  opts = { verify_ssl: false }.merge(@http_options).merge(url: url, headers: headers)
  opts[:method] = form_data['action'] == 'query' ? :get : :post

  if opts[:method] == :get
    headers[:params] = form_data
  else
    opts[:payload] = form_data
  end

  log.debug("#{opts[:method].upcase}: #{form_data.inspect}, #{@cookies.inspect}")

  RestClient::Request.execute(opts) do |response, request, result|
    # When a block is passed to RestClient::Request.execute, we must
    # manually handle response codes ourselves. If no block is passed,
    # then redirects are automatically handled, but HTTP errors also
    # result in exceptions being raised. For now, we manually check for
    # HTTP 503 errors (see: #make_api_request), but we must also manually
    # handle HTTP redirects.
    if [301, 302, 307].include?(response.code) && request.method == :get
      response = response.follow_redirection(request, result)
    end

    block.call(response)
  end
end
# Validate a parsed API response:
# 1. Raises MediaWiki::Exception if the response is not a valid MediaWiki XML
#    response (root element is neither 'api' nor 'mediawiki')
# 2. Raises APIError (built from the error's 'code' and 'info' attributes) if
#    the response contains errors
# 3. Reports warnings through #warning, which either raises or logs
def check_response(res)
  root_name = res.doc.name
  if !%w[api mediawiki].include?(root_name)
    raise MediaWiki::Exception.new("Response does not contain Mediawiki API XML: #{res}")
  end

  raise APIError.new(*res.error.attributes.values_at('code', 'info')) if res.has_error?

  return unless res.has_warnings?

  messages = res.warnings.children.map { |child| child.text }
  warning("API warning: #{messages.join(', ')}")
end
# Ensure every key of +options+ is one of the permitted option names.
#
# [options] Hash of options to check; keys are compared via +to_s+
# [valid_options] collection of allowed option names (strings)
#
# Returns +options+.
#
# Raises ArgumentError for the first key that is not in +valid_options+.
def validate_options(options, valid_options)
  options.each_key do |key|
    next if valid_options.include?(key.to_s)

    raise ArgumentError, "Unknown option '#{key}'", caller(1)
  end
end
# Decide whether +page+ refers to an existing, validly titled page.
#
# [page] page element (anything responding to +attributes+), or nil
#
# Returns +page+ itself when it is nil/false, false when the page carries a
# 'missing' attribute, true when it carries no 'invalid' attribute, and
# otherwise the result of #warning (which raises unless :ignorewarnings is
# set, per #warning).
def valid_page?(page)
  return page unless page

  attrs = page.attributes
  return false if attrs['missing']
  return true unless attrs['invalid']

  warning("Invalid title '#{attrs['title']}'")
end
# Report a non-fatal problem.
#
# [msg] warning text
#
# With the :ignorewarnings option set, the message is logged at warn level
# and false is returned; otherwise APIError is raised with code 'warning'.
def warning(msg)
  if @options[:ignorewarnings]
    log.warn(msg)
    return false
  end

  raise APIError.new('warning', msg)
end
end
end
require_relative 'gateway/files'
require_relative 'gateway/pages'
require_relative 'gateway/query'
require_relative 'gateway/site'
require_relative 'gateway/users'