This repository has been archived by the owner on Jan 12, 2022. It is now read-only.
/
request_rewriter.py
executable file
·377 lines (302 loc) · 13.5 KB
/
request_rewriter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Provides WSGI middleware for rewriting HTTP responses from the runtime.
The rewriting is used for various sanitisation and processing of the response
from the user's application, including:
- Removing disallowed HTTP response headers.
- Setting several response headers to their correct values (e.g.,
  Content-Length).
- Rewriting responses with X-AppEngine-BlobKey with the full blob download.
- Rewriting fatal response errors (such as a response that is too large) with
  a 500 error page.

The middleware also rewrites the request before the application sees it, by
removing request headers that should not be sent to the application.

The rewriter is runtime-agnostic. It can be applied to any WSGI application
representing an App Engine runtime.
"""
import calendar
import cStringIO
import email
import functools
import logging
import time
import wsgiref.headers
from google.appengine.tools.devappserver2 import blob_download
from google.appengine.tools.devappserver2 import constants
def _ignore_request_headers_rewriter(environ):
  """Strip request headers that should not be sent to the application.

  Every name in constants.IGNORED_REQUEST_HEADERS is converted to its environ
  key ('HTTP_' prefix, dashes replaced by underscores, upper-cased) and that
  key is removed from environ. For a complete list of these headers please
  see:
  https://developers.google.com/appengine/docs/python/runtime#Request_Headers

  Args:
    environ: An environ dict for the current request as defined in PEP-333.
      It is modified in place.
  """
  for header_name in constants.IGNORED_REQUEST_HEADERS:
    environ_key = 'HTTP_' + header_name.replace('-', '_').upper()
    # Keys that are not present are skipped silently.
    environ.pop(environ_key, None)
# Request rewriters. Each entry is a function that takes a PEP-333 environ dict
# and possibly modifies it in place. The functions are applied to the request
# in list order, before the wrapped application is called.
_REQUEST_REWRITER_CHAIN = [
_ignore_request_headers_rewriter,
]
class RewriterState(object):
  """Mutable view of a WSGI response while the rewriters process it.

  A rewriter is a function that takes a RewriterState as an argument, and
  possibly modifies it. Holding the whole response in one object makes it easy
  to pass data from one rewriter to the next.

  Attributes:
    environ: An environ dict for the current request as defined in PEP-333.
    status: A status code and message as a string. (e.g., '200 OK'.)
    headers: A wsgiref.headers.Headers containing the response headers.
    body: An iterable of strings containing the response body.
    allow_large_response: A Boolean value. If True, there is no limit to the
      size of the response body. Defaults to False.
  """

  def __init__(self, environ, status, headers, body):
    """Create a new RewriterState.

    Args:
      environ: An environ dict for the current request as defined in PEP-333.
      status: A status code and message as a string. (e.g., '200 OK'.)
      headers: A list of tuples containing the response headers. It is wrapped
        in a wsgiref.headers.Headers.
      body: An iterable of strings containing the response body.
    """
    self.allow_large_response = False
    self.body = body
    self.headers = wsgiref.headers.Headers(headers)
    self.status = status
    self.environ = environ

  @property
  def status_code(self):
    """The integer value of the response status."""
    # The code is everything before the first space of the status line.
    code, _, _ = self.status.partition(' ')
    return int(code)
# Characters allowed in a header name: every character code from 33 to 127
# other than ':'. Space (32) and the control characters below it are excluded;
# note that the range does admit code 127. RFC 2616 prohibits other separator
# characters, but this is consistent with HTTPProto::IsValidHeader.
ALLOWED_HEADER_NAME_CHARACTERS = frozenset(
    chr(code) for code in range(33, 128) if chr(code) != ':')

# Characters allowed in a header value: every character code from 32 (space)
# to 127.
ALLOWED_HEADER_VALUE_CHARACTERS = frozenset(map(chr, range(32, 128)))
def _ignore_response_headers_rewriter(ignored_response_headers, state):
  """Drop response headers the application may not set, or that are malformed.

  Certain response headers cannot be modified by an application. For a complete
  list of these headers please see:
  https://developers.google.com/appengine/docs/python/runtime#Responses

  A header is deleted from state.headers when any of the following holds:
  - its lower-cased name is in ignored_response_headers;
  - its name or value is a unicode string that cannot be encoded as ASCII;
  - its name contains a character outside ALLOWED_HEADER_NAME_CHARACTERS, or
    its value contains a character outside ALLOWED_HEADER_VALUE_CHARACTERS.

  Args:
    ignored_response_headers: A list of header names to remove. Names are
      compared against the lower-cased header name.
    state: A RewriterState to modify.
  """
  headers = state.headers
  for name, value in headers.items():
    if name.lower() in ignored_response_headers:
      del headers[name]

    # Unicode names and values are only acceptable if they are pure ASCII.
    try:
      if isinstance(name, unicode):
        name = name.encode('ascii')
      if isinstance(value, unicode):
        value = value.encode('ascii')
    except UnicodeEncodeError:
      # Contains non-ASCII Unicode characters.
      del headers[name]

    # Delete a header if its name or value contains non-allowed characters.
    # The value is only inspected when the name is acceptable.
    name_ok = ALLOWED_HEADER_NAME_CHARACTERS.issuperset(name)
    if not (name_ok and ALLOWED_HEADER_VALUE_CHARACTERS.issuperset(value)):
      del headers[name]
def _default_content_type_rewriter(state):
  """Give the response a Content-Type of 'text/html' if it has none.

  A Content-Type header that is already present is left unchanged.

  Args:
    state: A RewriterState to modify.
  """
  headers = state.headers
  if 'Content-Type' in headers:
    return
  headers['Content-Type'] = 'text/html'
def _cache_rewriter(state):
"""Set the default Cache-Control and Expires headers, and sanitize them.
The default values ('no-cache' for Cache-Control and
'Fri, 01 Jan 1990 00:00:00 GMT' for Expires) are only set if the response
status allows a body, and only if the headers have not been explicitly set by
the application.
If the Set-Cookie response header is set, sanitizes the Cache-Control and
Expires headers to avoid public caching: Expires is reset to the current time
unless it already parses to a date in the past, the 'public' Cache-Control
directive is removed, and 'private' is added unless one of
constants.NON_PUBLIC_CACHE_CONTROLS is already present.
Args:
state: A RewriterState to modify.
"""
# If the response is cacheable, we need to be concerned about the
# Cache-Control and Expires headers. Responses whose status does not allow a
# body are returned untouched.
if state.status_code in constants.NO_BODY_RESPONSE_STATUSES:
return
# NOTE(review): indentation is not visible in this copy of the file, so it is
# unclear whether the Expires default below is nested under the Cache-Control
# check or is an independent check -- confirm against upstream before editing.
if not 'Cache-Control' in state.headers:
state.headers['Cache-Control'] = 'no-cache'
if not 'Expires' in state.headers:
state.headers['Expires'] = 'Fri, 01 Jan 1990 00:00:00 GMT'
if 'Set-Cookie' in state.headers:
# It is a security risk to have any caching with Set-Cookie.
# If Expires is omitted or set to a future date, and response code is
# cacheable, set Expires to the current date.
current_date = time.time()
expires = state.headers.get('Expires')
# Reset by default; only a parseable Expires value in the past turns it off.
reset_expires = True
if expires:
# parsedate returns None when the value cannot be parsed as a date.
expires_time = email.Utils.parsedate(expires)
if expires_time:
# An Expires value equal to the current time also counts as not yet expired.
reset_expires = calendar.timegm(expires_time) >= current_date
if reset_expires:
state.headers['Expires'] = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
time.gmtime(current_date))
# Remove 'public' cache-control directive, and add 'private' if it (or a
# more restrictive directive) is not already present.
cache_directives = []
# Every Cache-Control header is split on commas into individual directives.
for header in state.headers.get_all('Cache-Control'):
cache_directives.extend(v.strip() for v in header.split(','))
cache_directives = [d for d in cache_directives if d != 'public']
if not constants.NON_PUBLIC_CACHE_CONTROLS.intersection(cache_directives):
cache_directives.append('private')
# Assigning the header replaces all Cache-Control headers with a single one.
state.headers['Cache-Control'] = ', '.join(cache_directives)
def _content_length_rewriter(state):
  """Make Content-Length and the body agree with what will actually be sent.

  Even though Content-Length is not a user modifiable header, App Engine
  sends a correct Content-Length to the user based on the actual response.

  The outcome depends on the response:
  - If the status code does not allow a body, the body and the Content-Length
    header are both removed.
  - For a HEAD request, a non-empty body is dropped with a warning and the
    Content-Length header is left as the application set it.
  - If the body is longer than constants.MAX_RUNTIME_RESPONSE_SIZE and
    state.allow_large_response is false, the response is replaced by a
    500 Internal Server Error.
  - Otherwise Content-Length is set to the total length of the body.

  Args:
    state: A RewriterState to modify.
  """
  # Materialise the body as a list so that it can be traversed more than once.
  # This is the only way to get the Content-Length before streaming the output.
  chunks = list(state.body)
  state.body = chunks
  length = sum(len(chunk) for chunk in chunks)

  if state.status_code in constants.NO_BODY_RESPONSE_STATUSES:
    # Delete the body and Content-Length response header.
    state.body = []
    del state.headers['Content-Length']
    return

  if state.environ.get('REQUEST_METHOD') == 'HEAD':
    if length:
      # Delete the body, but preserve the Content-Length response header.
      logging.warning('Dropping unexpected body in response to HEAD request')
      state.body = []
    return

  limit = constants.MAX_RUNTIME_RESPONSE_SIZE
  if state.allow_large_response or length <= limit:
    state.headers['Content-Length'] = str(length)
    return

  # NOTE: This response is too small to be visible in IE, as it replaces any
  # error page with <512 bytes with its own.
  # http://en.wikipedia.org/wiki/HTTP_404#Custom_error_pages
  logging.error('Response too large: %d, max is %d', length, limit)
  new_response = ('HTTP response was too large: %d. The limit is: %d.\n' %
                  (length, limit))
  state.status = '500 Internal Server Error'
  state.headers['Content-Type'] = 'text/html'
  state.headers['Content-Length'] = str(len(new_response))
  state.body = [new_response]
# Response rewriter chains. Each entry is a function that takes a RewriterState
# and possibly modifies it. The functions are applied to the response in list
# order.
# Chain used by frontend_rewriter_middleware. _content_length_rewriter is the
# last entry, so it measures the body left behind by the earlier rewriters.
_FRONTEND_RESPONSE_REWRITER_CHAIN = [
blob_download.blobstore_download_rewriter,
# The header filter is bound to the frontend's list of ignored headers.
functools.partial(_ignore_response_headers_rewriter,
constants.FRONTEND_IGNORED_RESPONSE_HEADERS),
_default_content_type_rewriter,
_cache_rewriter,
_content_length_rewriter,
]
# Chain used by runtime_rewriter_middleware. Here _content_length_rewriter
# runs first and the header filter, bound to the runtime's list of ignored
# headers, runs second.
_RUNTIME_RESPONSE_REWRITER_CHAIN = [
_content_length_rewriter,
functools.partial(_ignore_response_headers_rewriter,
constants.RUNTIME_IGNORED_RESPONSE_HEADERS),
]
def _rewriter_middleware(request_rewriter_chain, response_rewriter_chain,
application, environ, start_response):
"""Wraps an application and applies a chain of rewriters to its response.
This first applies each function in request_rewriter_chain to the environ. It
then executes the application, and applies each function in
response_rewriter_chain to the response.
Args:
request_rewriter_chain: A chain of functions to apply to the environ.
response_rewriter_chain: A chain of functions to apply to the response.
application: The WSGI application to wrap as defined in PEP-333.
environ: An environ dict for the current request as defined in PEP-333.
start_response: A function with semantics defined in PEP-333.
Returns:
An iterable of strings containing the body of an HTTP response.
"""
response_dict = {'headers_sent': False}
write_body = cStringIO.StringIO()
def wrapped_start_response(status, response_headers, exc_info=None):
if exc_info and response_dict['headers_sent']:
# Headers have already been sent. PEP 333 mandates that this is an error.
raise exc_info[0], exc_info[1], exc_info[2]
response_dict['status'] = status
response_dict['response_headers'] = response_headers
return write_body.write
for rewriter in request_rewriter_chain:
rewriter(environ)
response_body = iter(application(environ, wrapped_start_response))
# Get the first non-empty string from the application's response. This ensures
# that the application has called wrapped_start_response, and allows us to
# treat future calls to wrapped_start_response as errors.
first = write_body.getvalue()
while not first:
try:
first = response_body.next()
except StopIteration:
break
# A conformant application must have called wrapped_start_response by this
# point, and should not call it again unless there is an unrecoverable error.
response_dict['headers_sent'] = True
try:
status = response_dict['status']
response_headers = response_dict['response_headers']
except KeyError:
raise AssertionError('Application yielded before calling start_response.')
# Prepend first onto response_body.
def reconstructed_body():
yield first
for string in response_body:
yield string
body = reconstructed_body()
state = RewriterState(environ, status, response_headers, body)
for rewriter in response_rewriter_chain:
rewriter(state)
start_response(state.status, state.headers.items())
return state.body
def frontend_rewriter_middleware(application):
  """Wraps a WSGI application with the frontend rewriter chains.

  Args:
    application: The WSGI application to wrap as defined in PEP-333.

  Returns:
    A WSGI application that applies _REQUEST_REWRITER_CHAIN to the request and
    _FRONTEND_RESPONSE_REWRITER_CHAIN to the response of the inner application.
  """
  wrapped = functools.partial(
      _rewriter_middleware,
      _REQUEST_REWRITER_CHAIN,
      _FRONTEND_RESPONSE_REWRITER_CHAIN,
      application)
  return wrapped
def runtime_rewriter_middleware(application):
  """Wraps a WSGI application with the runtime rewriter chains.

  Args:
    application: The WSGI application to wrap as defined in PEP-333.

  Returns:
    A WSGI application that applies _REQUEST_REWRITER_CHAIN to the request and
    _RUNTIME_RESPONSE_REWRITER_CHAIN to the response of the inner application.
  """
  wrapped = functools.partial(
      _rewriter_middleware,
      _REQUEST_REWRITER_CHAIN,
      _RUNTIME_RESPONSE_REWRITER_CHAIN,
      application)
  return wrapped