
all tests pass with requests 1.0

1 parent 078befb commit 315f02dc75515894f08f2ca1e0c20dfc2cbd0119 @jamesturk committed Dec 21, 2012
Showing with 57 additions and 52 deletions.
  1. +45 −25 scrapelib/__init__.py
  2. +5 −19 scrapelib/cache.py
  3. +2 −3 scrapelib/tests/test_cache.py
  4. +5 −5 scrapelib/tests/test_scraper.py
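
Context for the changes below: requests 1.0 removed the configuration arguments that Session.__init__() previously accepted, so sessions are now created bare and configured through attributes, and the default headers use canonically cased keys such as 'User-Agent'. A minimal sketch of that pattern, outside this commit (the values here are illustrative only):

    import requests

    session = requests.Session()
    # default header keys use canonical casing, e.g. 'User-Agent'
    session.headers['User-Agent'] = 'my-scraper/1.0'
    session.verify = True
    session.proxies = {'http': 'http://localhost:8080'}  # example proxy, not from the commit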
70 scrapelib/__init__.py
@@ -154,8 +154,8 @@ def request(self, method, url, **kwargs):
class RobotsTxtSession(requests.Session):
- def __init__(self, *args, **kwargs):
- super(RobotsTxtSession, self).__init__(*args, **kwargs)
+ def __init__(self):
+ super(RobotsTxtSession, self).__init__()
self._robot_parsers = {}
def _robot_allowed(self, user_agent, parsed_url):
@@ -177,8 +177,8 @@ def _robot_allowed(self, user_agent, parsed_url):
def request(self, method, url, **kwargs):
parsed_url = urlparse.urlparse(url)
- user_agent = (kwargs.get('headers', {}).get('user-agent') or
- self.headers.get('user-agent'))
+ user_agent = (kwargs.get('headers', {}).get('User-Agent') or
+ self.headers.get('User-Agent'))
# robots.txt is http-only
if (parsed_url.scheme in ('http', 'https') and
self.config.get('obey_robots_txt', False) and
@@ -191,8 +191,6 @@ def request(self, method, url, **kwargs):
class FTPSession(requests.Session):
- # HACK: add FTP to allowed schemas
- requests.defaults.SCHEMAS.append('ftp')
def request(self, method, url, **kwargs):
if url.startswith('ftp://'):
@@ -297,7 +295,7 @@ def __init__(self,
hooks=None,
params=None,
config=None,
- prefetch=False,
+ prefetch=None,
verify=True,
cert=None,
# scrapelib-specific params
@@ -327,16 +325,46 @@ def __init__(self,
else:
self._header_func = None
- super(Scraper, self).__init__(headers, cookies, auth, timeout, proxies,
- hooks, params, config, prefetch, verify,
- cert, cache_storage=cache_obj)
+ super(Scraper, self).__init__()
+
+ # attributes not provided by requests
+ self.cache_storage = cache_obj
+ self.timeout = timeout
+
+ if headers is not None:
+ self.headers = headers
+ if cookies is not None:
+ self.cookies = cookies
+ if auth is not None:
+ self.auth = auth
+ if timeout:
+ self.timeout = timeout
+ if proxies is not None:
+ self.proxies = proxies
+ if hooks is not None:
+ self.hooks = hooks
+ if params is not None:
+ self.params = params
+ if config is not None:
+ # TODO: error out immediately (config is gone in requests 1.0)
+ pass
+ if prefetch is not None:
+ # TODO: error out immediately (prefetch is gone in requests 1.0)
+ pass
+ if verify is not True:
+ self.verify = verify
+ if cert is not None:
+ self.cert = cert
+
+ # scrapelib's own config dict (requests 1.0 removed Session.config)
+ self.config = {}
# scrapelib-specific settings
self.raise_errors = raise_errors
self.follow_redirects = follow_redirects
self.requests_per_minute = requests_per_minute
# properties (pass through to config/headers)
- if user_agent != _user_agent or 'user-agent' not in self.headers:
+ if not headers or 'User-Agent' not in headers:
self.user_agent = user_agent
self.follow_robots = follow_robots
self.retry_attempts = retry_attempts
@@ -360,11 +388,11 @@ def __init__(self,
@property
def user_agent(self):
- return self.headers['user-agent']
+ return self.headers['User-Agent']
@user_agent.setter
def user_agent(self, value):
- self.headers['user-agent'] = value
+ self.headers['User-Agent'] = value
@property
def follow_robots(self):
@@ -391,25 +419,17 @@ def retry_wait_seconds(self, value):
self.config['retry_wait_seconds'] = value
@property
- def cache_write_only(self):
- return self.config['cache_write_only']
-
- @cache_write_only.setter
- def cache_write_only(self, value):
- self.config['cache_write_only'] = value
-
- @property
def disable_compression(self):
- return self.headers['accept-encoding'] == 'text/*'
+ return self.headers['Accept-Encoding'] == 'text/*'
@disable_compression.setter
def disable_compression(self, value):
# disabled: set encoding to text/*
if value:
- self.headers['accept-encoding'] = 'text/*'
+ self.headers['Accept-Encoding'] = 'text/*'
# enabled: if set to text/* pop, otherwise leave unmodified
- elif self.headers.get('accept-encoding') == 'text/*':
- self.headers.pop('accept-encoding', None)
+ elif self.headers.get('Accept-Encoding') == 'text/*':
+ self.headers['Accept-Encoding'] = 'gzip, deflate, compress'
def urlopen(self, url, method='GET', body=None, retry_on_404=False):
"""
24 scrapelib/cache.py
@@ -11,24 +11,10 @@
class CachingSession(requests.Session):
- def __init__(self,
- headers=None,
- cookies=None,
- auth=None,
- timeout=None,
- proxies=None,
- hooks=None,
- params=None,
- config=None,
- prefetch=False,
- verify=True,
- cert=None,
- cache_storage=None,
- ):
- super(CachingSession, self).__init__(headers, cookies, auth, timeout,
- proxies, hooks, params, config,
- prefetch, verify, cert)
+ def __init__(self, cache_storage=None):
+ super(CachingSession, self).__init__()
self.cache_storage = cache_storage
+ self.cache_write_only = False
def key_for_request(self, method, url, **kwargs):
""" Return a cache key from a given set of request parameters.
@@ -42,7 +28,7 @@ def key_for_request(self, method, url, **kwargs):
return None
return requests.Request(url=url,
- params=kwargs.get('params', {})).full_url
+ params=kwargs.get('params', {})).prepare().url
def should_cache_response(self, response):
""" Check if a given Response object should be cached.
@@ -69,7 +55,7 @@ def request(self, method, url, **kwargs):
request_key = self.key_for_request(method, url, **kwargs)
- if request_key and not self.config.get('cache_write_only'):
+ if request_key and not self.cache_write_only:
resp = self.cache_storage.get(request_key)
if resp:
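
The key_for_request() change above follows requests 1.0 dropping Request.full_url: a fully qualified URL with encoded query parameters now comes from a PreparedRequest. A minimal sketch of the same idea (URL and params are examples only):

    import requests

    req = requests.Request(method='GET',
                           url='http://httpbin.org/get',
                           params={'q': 'scrapelib'})
    print(req.prepare().url)  # http://httpbin.org/get?q=scrapelib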
5 scrapelib/tests/test_cache.py
@@ -63,9 +63,8 @@ def test_simple_cache_request():
def test_cache_write_only():
- cs = CachingSession(cache_storage=MemoryCache(),
- config={'cache_write_only': True}
- )
+ cs = CachingSession(cache_storage=MemoryCache())
+ cs.cache_write_only = True
url = HTTPBIN + 'get'
# first response not from cache
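
As the updated test shows, cache_write_only moves from the removed config dict to a plain attribute on CachingSession. A usage sketch along the same lines (imports assumed to match the test module above):

    from scrapelib.cache import CachingSession, MemoryCache

    cs = CachingSession(cache_storage=MemoryCache())
    cs.cache_write_only = True  # responses are stored in the cache but never served from it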
10 scrapelib/tests/test_scraper.py
@@ -122,7 +122,7 @@ def test_user_agent():
def test_user_agent_from_headers():
s = Scraper(requests_per_minute=0, follow_robots=False,
- headers={'user-agent':'from headers'})
+ headers={'User-Agent':'from headers'})
resp = s.urlopen(HTTPBIN + 'user-agent')
ua = json.loads(resp)['user-agent']
assert_equal(ua, 'from headers')
@@ -192,10 +192,10 @@ def test_follow_redirect():
def test_caching():
cache_dir = tempfile.mkdtemp()
- s = Scraper(requests_per_minute=0, follow_robots=False,
- cache_obj=FileCache(cache_dir), cache_write_only=False)
#s = Scraper(requests_per_minute=0, follow_robots=False,
- # cache_obj=MemoryCache(), cache_write_only=False)
+ # cache_obj=FileCache(cache_dir), cache_write_only=False)
+ s = Scraper(requests_per_minute=0, follow_robots=False,
+ cache_obj=MemoryCache(), cache_write_only=False)
resp = s.urlopen(HTTPBIN + 'status/200')
assert not resp.response.fromcache
@@ -334,7 +334,7 @@ def test_disable_compression():
# A supplied Accept-Encoding headers overrides the
# disable_compression option
- s.headers['accept-encoding'] = '*'
+ s.headers['Accept-Encoding'] = '*'
data = s.urlopen(HTTPBIN + 'headers')
assert_equal(json.loads(data)['headers']['Accept-Encoding'], '*')
