
lots of fixes for requests breakage

1 parent 716574f commit a5a73bab78a45315346a77a813cced249dd104a2 jamesturk committed May 22, 2013
Showing with 41 additions and 23 deletions.
  1. +30 −12 scrapelib/__init__.py
  2. +2 −2 scrapelib/__main__.py
  3. +1 −1 scrapelib/cache.py
  4. +8 −8 scrapelib/tests/test_scraper.py
42 scrapelib/__init__.py
@@ -1,14 +1,11 @@
-import datetime
-import json
import logging
import os
import sys
import tempfile
import time
-import warnings
import requests
-from .cache import CachingSession, FileCache
+from .cache import CachingSession, FileCache # noqa
if sys.version_info[0] < 3: # pragma: no cover
from urllib2 import urlopen as urllib_urlopen
@@ -169,23 +166,37 @@ def request(self, method, url, **kwargs):
self.headers.get('User-Agent'))
# robots.txt is http-only
if (parsed_url.scheme in ('http', 'https') and
- self.follow_robots and
- not self._robot_allowed(user_agent, parsed_url)):
+ self.follow_robots and
+ not self._robot_allowed(user_agent, parsed_url)):
raise RobotExclusionError(
"User-Agent '%s' not allowed at '%s'" % (
user_agent, url), url, user_agent)
return super(RobotsTxtSession, self).request(method, url, **kwargs)
+# this object exists because Requests assumes it can call
+# resp.raw._original_response.msg.getheaders() and we need to cope with that
+class DummyObject(object):
+ def getheaders(self, name):
+ return ''
+
+ def get_all(self, name, default):
+ return default
+
+_dummy = DummyObject()
+_dummy._original_response = DummyObject()
+_dummy._original_response.msg = DummyObject()
+
+
class FTPAdapter(requests.adapters.BaseAdapter):
def send(self, request, stream=False, timeout=None, verify=False,
cert=None, proxies=None):
if request.method != 'GET':
raise HTTPMethodUnavailableError(
"FTP requests do not support method '%s'" % request.method,
- request.method)
+ request.method)
try:
real_resp = urllib_urlopen(request.url, timeout=timeout)
# we're going to fake a requests.Response with this
@@ -194,6 +205,7 @@ def send(self, request, stream=False, timeout=None, verify=False,
resp.url = request.url
resp.headers = {}
resp._content = real_resp.read()
+ resp.raw = _dummy
return resp
except urllib_URLError:
raise FTPError(request.url)
@@ -283,8 +295,7 @@ def __init__(self,
follow_robots=True,
retry_attempts=0,
retry_wait_seconds=5,
- header_func=None,
- ):
+ header_func=None):
super(Scraper, self).__init__()
self.mount('ftp://', FTPAdapter())
@@ -343,9 +354,16 @@ def request(self, method, url, **kwargs):
self._header_func(url))
else:
headers = {}
- headers = requests.sessions.merge_kwargs(headers, self.headers)
- headers = requests.sessions.merge_kwargs(kwargs.pop('headers', {}),
- headers)
+ try:
+ # requests < 1.2.2
+ headers = requests.sessions.merge_kwargs(headers, self.headers)
+ headers = requests.sessions.merge_kwargs(kwargs.pop('headers', {}),
+ headers)
+ except AttributeError:
+ # requests >= 1.2.2
+ headers = requests.sessions.merge_setting(headers, self.headers)
+ headers = requests.sessions.merge_setting(
+ kwargs.pop('headers', {}), headers)
return super(Scraper, self).request(method, url, timeout=timeout,
headers=headers, **kwargs)
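
For context on the hunk above: requests renamed its internal header-merging helper around version 1.2.2 (merge_kwargs became merge_setting), so the code probes for the old name and falls back to the new one. A minimal standalone sketch of that fallback pattern; the header values here are invented for illustration, and per-request values take precedence over session defaults:

    import requests

    def merge_headers(request_headers, session_headers):
        # Mirror the shim from the diff: try the pre-1.2.2 helper first,
        # then fall back to the name used by requests >= 1.2.2.
        try:
            merge = requests.sessions.merge_kwargs       # requests < 1.2.2
        except AttributeError:
            merge = requests.sessions.merge_setting      # requests >= 1.2.2
        return merge(request_headers, session_headers)

    # Per-request headers override session-level defaults.
    print(merge_headers({'User-Agent': 'example/1.0'},
                        {'User-Agent': 'default-agent', 'Accept': '*/*'}))
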
4 scrapelib/__main__.py
@@ -33,15 +33,15 @@ def scrapeshell(): # pragma: no cover
args = parser.parse_args(orig_argv)
scraper = Scraper(follow_robots=args.robots)
- scraper.user_agent=args.user_agent
+ scraper.user_agent = args.user_agent
url = args.url
if args.postdata:
html = scraper.urlopen(args.url, 'POST', args.postdata)
else:
html = scraper.urlopen(args.url)
if USE_LXML:
- doc = lxml.html.fromstring(html.bytes)
+ doc = lxml.html.fromstring(html.bytes) # noqa
print('local variables')
print('---------------')
2 scrapelib/cache.py
@@ -125,7 +125,7 @@ def get(self, orig_key):
# need to split spaces out of status to get code (e.g. '200 OK')
resp.status_code = int(resp.headers.pop('status').split(' ')[0])
resp.encoding = resp.headers.pop('encoding')
- resp.url = resp.headers['content-location'] or orig_key
+ resp.url = resp.headers.get('content-location', orig_key)
#TODO: resp.request = request
return resp
except IOError:
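
The one-line change above matters when a cached response was stored without a content-location header: indexing with ['content-location'] raises KeyError, whereas .get() falls back to the original cache key. A tiny illustration, with a plain dict standing in for the cached header mapping (values are hypothetical):

    # hypothetical cached headers with no 'content-location' entry
    headers = {'status': '200 OK', 'encoding': 'utf-8'}
    orig_key = 'http://example.com/page'

    # old form: headers['content-location'] or orig_key  -> KeyError here
    url = headers.get('content-location', orig_key)      # new form falls back
    assert url == orig_key
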
16 scrapelib/tests/test_scraper.py
@@ -16,7 +16,7 @@
from .. import (Scraper, HTTPError, HTTPMethodUnavailableError,
RobotExclusionError, urllib_URLError, FTPError)
from .. import _user_agent as default_user_agent
-from ..cache import FileCache, MemoryCache
+from ..cache import MemoryCache
HTTPBIN = 'http://httpbin.org/'
@@ -44,8 +44,8 @@ def test_fields():
retry_attempts=-1, # will be 0
retry_wait_seconds=100)
assert s.requests_per_minute == 100
- assert s.follow_robots == False
- assert s.raise_errors == False
+ assert s.follow_robots is False
+ assert s.raise_errors is False
assert s.retry_attempts == 0 # -1 becomes 0
assert s.retry_wait_seconds == 100
@@ -64,7 +64,7 @@ def test_post():
resp_json = json.loads(resp)
assert_equal(resp_json['form']['woo'], 'woo')
assert_equal(resp_json['headers']['Content-Type'],
- 'application/x-www-form-urlencoded')
+ 'application/x-www-form-urlencoded')
def test_request_throttling():
@@ -109,7 +109,7 @@ def test_user_agent():
def test_user_agent_from_headers():
s = Scraper(requests_per_minute=0, follow_robots=False)
- s.headers = {'User-Agent':'from headers'}
+ s.headers = {'User-Agent': 'from headers'}
resp = s.urlopen(HTTPBIN + 'user-agent')
ua = json.loads(resp)['user-agent']
assert_equal(ua, 'from headers')
@@ -136,7 +136,8 @@ def test_follow_robots():
# turn off follow_robots, everything works
s.follow_robots = False
- assert_equal(200,
+ assert_equal(
+ 200,
s.urlopen("http://dummy/private/secret.html").response.code)
@@ -258,8 +259,7 @@ def test_timeout():
s.timeout = 0.001
s.follow_robots = False
with assert_raises(requests.Timeout):
- x = s.urlopen(HTTPBIN + 'delay/1')
-
+ s.urlopen(HTTPBIN + 'delay/1')
def test_timeout_retry():
