# Python Web Crawler  4 - Urllib

## Request

In [96]:
import urllib.request
from bs4 import BeautifulSoup

def getPage(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    The HTTP status code of the response is printed before the body is read.

    Args:
        url: URL string (or ``urllib.request.Request``) handed to ``urlopen``.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` (``HTTPError`` for
            4xx/5xx responses).
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read()/decode()
    # raises; previously the response object was left open.
    with urllib.request.urlopen(url) as page:  # <class 'http.client.HTTPResponse'>
        print(page.status)
        # print(page.getheaders())
        return page.read().decode('utf-8')
 
# Download the Bing home page with getPage() and parse it with the lxml parser.
tree = BeautifulSoup(getPage("https://www.bing.com/"),"lxml")
# Look up the element with id "bgDiv" below the first <div>. Per the author's
# note it is rendered by JavaScript, so the static HTML only holds an empty div
# (see the recorded output below).
tree.div.select('#bgDiv')  # JS rendered
# 200
# [<div data-minhdhor="" data-minhdver="" data-priority="0" id="bgDiv"></div>]

200


[<div data-minhdhor="" data-minhdver="" data-priority="0" id="bgDiv"></div>]

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, context=None)
urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)

In [89]:
import socket
from urllib import request, parse,error

def getInfo(url, data="", headers=None, method="GET", timeout=1):
    """Send an HTTP request and print the response body decoded as UTF-8.

    Args:
        url: Target URL.
        data: Mapping (or sequence of two-item tuples) that is form-encoded
            and sent as the request body. ``None`` is treated like the empty
            default.
        headers: Optional mapping of extra request headers.
        method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
        timeout: Timeout in seconds passed to ``urlopen``.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` (``HTTPError`` for
            4xx/5xx responses).
    """
    # `data or ""` keeps the original behaviour for the empty default while
    # also accepting None, which urlencode() itself would reject.
    payload = parse.urlencode(data or "").encode('utf8')
    # `headers or {}` avoids a shared mutable default argument.
    req = request.Request(url=url, data=payload, headers=headers or {}, method=method)
    # Separate name for the response (the original rebound `req`) and a
    # context manager so the connection is always closed.
    with request.urlopen(req, timeout=timeout) as resp:
        print(resp.read().decode('utf-8'))

# Request headers sent along with the POST; httpbin echoes them back.
headers = {
    'User-Agent':' Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Host': 'httpbin.org'
}
# Form fields to submit. Named `form_data` rather than `dict` so the built-in
# `dict` type is not shadowed for the rest of the kernel session.
form_data = {
    'words1': 'you\'re a miracle',
    'words2': 'what do you fear'
}
getInfo("http://httpbin.org/post", form_data, headers, "POST", 5)
# {
#   "args": {}, 
#   "data": "", 
#   "files": {}, 
#   "form": {
#     "words1": "you're a miracle", 
#     "words2": "what do you fear"
#   }, 
#   "headers": {
#     "Accept-Encoding": "identity", 
#     "Connection": "close", 
#     "Content-Length": "49", 
#     "Content-Type": "application/x-www-form-urlencoded", 
#     "Host": "httpbin.org", 
#     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
#   }, 
#   "json": null, 
#   "origin": "183.246.20.118", 
#   "url": "http://httpbin.org/post"
# }

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "words1": "you're a miracle", 
    "words2": "what do you fear"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Content-Length": "49", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
  }, 
  "json": null, 
  "origin": "183.246.20.118", 
  "url": "http://httpbin.org/post"
}

{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.6"
  }, 
  "origin": "183.246.20.118", 
  "url": "http://httpbin.org/get"
}



## ERROR

In [None]:
def getInfo(url, data="", headers=None, method="GET", timeout=1):
    """Send an HTTP request and print the body, or a short error report.

    On success the UTF-8 decoded response body is printed. Failures are
    reported on stdout instead of being raised:

    * ``HTTPError``: reason, status code and response headers, one per line.
    * timeout (while connecting or while reading): ``TIME OUT``.
    * any other ``URLError``: its ``reason``.

    Args:
        url: Target URL.
        data: Mapping (or sequence of two-item tuples) that is form-encoded
            and sent as the request body. ``None`` is treated like the empty
            default.
        headers: Optional mapping of extra request headers.
        method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
        timeout: Timeout in seconds passed to ``urlopen``.
    """
    try:
        payload = parse.urlencode(data or "").encode('utf8')
        # `headers or {}` avoids a shared mutable default argument.
        req = request.Request(url=url, data=payload, headers=headers or {}, method=method)
        # Context manager: the connection is closed even if read() fails.
        with request.urlopen(req, timeout=timeout) as resp:
            print(resp.read().decode('utf-8'))
    except error.HTTPError as e:
        # HTTPError subclasses URLError, so it has to be handled first.
        print(e.reason, e.code, e.headers, sep='\n')
    except error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print('TIME OUT')
        else:
            # Previously every non-timeout URLError vanished without a trace.
            print(e.reason)
    except socket.timeout:
        # A timeout that fires while the body is being read is raised
        # directly rather than wrapped in URLError.
        print('TIME OUT')
getInfo('http://httpbin.org/index.htm')
# NOT FOUND
# 404
# Connection: close
# Server: meinheld/0.6.1
# Date: Sun, 11 Mar 2018 06:25:37 GMT
# Content-Type: text/html
# Content-Length: 233
# Access-Control-Allow-Origin: *
# Access-Control-Allow-Credentials: true
# X-Powered-By: Flask
# X-Processed-Time: 0
# Via: 1.1 vegur
getInfo('http://httpbin.org/get',timeout=.1)
# TIME OUT
getInfo('http://httpbin.org/get')
# {
#   "args": {}, 
#   "headers": {
#     "Accept-Encoding": "identity", 
#     "Connection": "close", 
#     "Content-Type": "application/x-www-form-urlencoded", 
#     "Host": "httpbin.org", 
#     "User-Agent": "Python-urllib/3.6"
#   }, 
#   "origin": "183.246.20.118", 
#   "url": "http://httpbin.org/get"
# }

## Parse
The `urllib.parse` module supports the following URL schemes: file, ftp, gopher, hdl, http, https, imap, mailto, mms, news, nntp, prospero, rsync, rtsp, rtspu, sftp, shttp, sip, sips, snews, svn, svn+ssh, telnet, wais, ws, wss.

### Split & Combine

In [119]:
from urllib.parse import urlparse as pr
from urllib.parse import urlunparse as upr

# scheme://netloc/path;parameters?query#fragment
# Split a full URL into its six ParseResult components.
result = pr('http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt')
print(type(result), '\n',result)
# <class 'urllib.parse.ParseResult'> 
#  ParseResult(scheme='http', netloc='www.xiami.com', path='/play', \
#                      params='', query='ids=/song/playlist/id/1/type/9', fragment='loadedt')

# ParseResult is a tuple, so it can be unpacked directly: one component per
# line (replaces a list comprehension that was used only for its side effects).
print(*result, sep='\n')
# http
# www.xiami.com
# /play
#
# ids=/song/playlist/id/1/type/9
# loadedt

# Without a leading "scheme://" the host is not recognised as netloc and ends
# up in `path`; the `scheme` argument only supplies the missing scheme.
print( pr('www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt',scheme="https"))
# ParseResult(scheme='https', netloc='', path='www.xiami.com/play',\
#                     params='', query='ids=/song/playlist/id/1/type/9', fragment='loadedt')

# A scheme present in the URL wins over the `scheme` default, and with
# allow_fragments=False the "#..." part stays inside `query`.
print( pr('https://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt',scheme="http",allow_fragments=False))
# ParseResult(scheme='https', netloc='www.xiami.com', path='/play', \
#             params='', query='ids=/song/playlist/id/1/type/9#loadedt', fragment='')

# Reassemble the URL from its six components.
data = [result.scheme, result.netloc, result.path,result.params, result.query,result.fragment]
print(upr(data))
# http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt

<class 'urllib.parse.ParseResult'> 
 ParseResult(scheme='http', netloc='www.xiami.com', path='/play', params='', query='ids=/song/playlist/id/1/type/9', fragment='loadedt')
http
www.xiami.com
/play

ids=/song/playlist/id/1/type/9
loadedt
ParseResult(scheme='https', netloc='', path='www.xiami.com/play', params='', query='ids=/song/playlist/id/1/type/9', fragment='loadedt')
ParseResult(scheme='https', netloc='www.xiami.com', path='/play', params='', query='ids=/song/playlist/id/1/type/9#loadedt', fragment='')
http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt


In [121]:
from urllib.parse import urlsplit as sp
from urllib.parse import urlunsplit as usp

# # scheme://netloc/path?query#fragment
# urlsplit yields five components; unlike urlparse there is no separate
# ";params" field.
result = sp(
    'http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt'
)
print(type(result), result, sep=' \n ')
# <class 'urllib.parse.SplitResult'> 
#  SplitResult(scheme='http', netloc='www.xiami.com', path='/play', \
#              query='ids=/song/playlist/id/1/type/9', fragment='loadedt')

# SplitResult is a tuple of (scheme, netloc, path, query, fragment), so it
# converts straight into the list that urlunsplit expects.
data = list(result)
print(usp(data))
# http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt

<class 'urllib.parse.SplitResult'> 
 SplitResult(scheme='http', netloc='www.xiami.com', path='/play', query='ids=/song/playlist/id/1/type/9', fragment='loadedt')
http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt


In [None]:
### More

In [128]:
from urllib.parse import urljoin as jo
 
# Three (base, reference) pairs that all resolve to the same absolute URL:
# a site root plus a relative path, a base whose last path segment and query
# are replaced, and a bare scheme plus a network-path ("//host/...") reference.
print(
    *(
        jo(base, ref)
        for base, ref in (
            ("http://www.xiami.com/", "play?ids=/song/playlist/id/1/type/9#loadedt"),
            ("http://www.xiami.com/play?ids=/song/playlist/", "play?ids=/song/playlist/id/1/type/9#loadedt"),
            ("http:", "//www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt"),
        )
    ),
    sep='\n',
)
# http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt

http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt
http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt
http://www.xiami.com/play?ids=/song/playlist/id/1/type/9#loadedt


In [145]:
from urllib.parse import urlencode,parse_qs,quote,unquote

# Query parameters to serialise; urlencode turns the space into "+".
params = {
    'tn':'baidu',
    'wd': 'google chrome',
}
base_url = 'http://www.baidu.com/s?'
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and then discarded, so its value would never be shown.
print(base_url + urlencode(params))
# http://www.baidu.com/s?tn=baidu&wd=google+chrome

# parse_qs is the inverse of urlencode; every value comes back as a list.
print(parse_qs( urlencode(params)))
# {'tn': ['baidu'], 'wd': ['google chrome']}


# quote percent-encodes the UTF-8 bytes of non-ASCII text.
print('https://www.baidu.com/s?wd=' + quote("百度"))
# https://www.baidu.com/s?wd=%E7%99%BE%E5%BA%A6

# unquote reverses the percent-encoding.
url = 'https://www.baidu.com/s?wd=%E7%99%BE%E5%BA%A6'
print(unquote(url))
# https://www.baidu.com/s?wd=百度

{'tn': ['baidu'], 'wd': ['google chrome']}
https://www.baidu.com/s?wd=百度


## Handler

`BaseHandler`[¶](https://docs.python.org/3/library/urllib.request.html#urllib.request.BaseHandler)

- `HTTPDefaultErrorHandler`


- `HTTPRedirectHandler`


- `HTTPCookieProcessor`(*cookiejar=None*)


- `ProxyHandler`(*proxies=None*)


- `HTTPPasswordMgr`


- `HTTPPasswordMgrWithDefaultRealm`


- `HTTPPasswordMgrWithPriorAuth`
- ` ...`

### Cookies

In [68]:
import http.cookiejar, urllib.request
 
# An in-memory cookie jar, filled by the HTTPCookieProcessor handler that is
# installed in the opener.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Only the cookies are of interest here; the context manager closes the
# connection, which the original left open.
with opener.open('http://www.baidu.com') as response:
    print(response)
# <http.client.HTTPResponse object at 0x04D421F0>
for item in cookie:
    print(item.name+"="+item.value)
# BAIDUID=7A55D7DB4ECB570361D1D1186DD85275:FG=1
# ...

<http.client.HTTPResponse object at 0x054EBED0>
BAIDUID=7A55D7DB4ECB570361D1D1186DD85275:FG=1
BIDUPSID=7A55D7DB4ECB570361D1D1186DD85275
H_PS_PSSID=1425_25810_21117_17001_20719
PSTM=1520746720
BDSVRTM=0
BD_HOME=0


In [71]:
# File-backed cookie jar: cookies collected during the request are written to
# `filename` in LWP format (MozillaCookieJar is the alternative file format).
filename = 'cookies.txt'
cookie = http.cookiejar.LWPCookieJar(filename) # cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# The body is not needed, so release the connection right away
# (the original never closed it).
response.close()
# ignore_discard / ignore_expires: also persist session and expired cookies.
cookie.save(ignore_discard=True, ignore_expires=True)
## LWP-Cookies-2.0
# Set-Cookie3: BAIDUID="990E47C14A144D813BB6629BEA0D1BEF:FG=1"; path="/"; domain=".baidu.com"; path_spec; domain_dot; expires="2086-03-29 08:56:02Z"; version=0
# ...

In [None]:
# Reload the cookies saved to cookies.txt and send them with a new request.
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookies.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Context manager so the connection is closed once the body has been read
# (the original left it open).
with opener.open('http://www.baidu.com') as response:
    print(response.read().decode('utf-8'))
# <!DOCTYPE html>
# <!--STATUS OK-->
# ...

### Password

In [None]:
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError
 
# Placeholders: replace with real values before running. The literal 'url' is
# not a valid URL; never commit real credentials to the notebook.
username = 'username'
password = 'password'
url = 'url'
 
# realm=None registers the credentials as the default for `url`, whatever
# realm the server announces.
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)
 
try:
    # Context manager so the connection is closed once the body has been
    # read (the original left it open).
    with opener.open(url) as result:
        html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)

### Proxy

In [None]:
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener
# Map each URL scheme to the proxy that should carry it. 'url' is a
# placeholder: replace it with a real proxy address before running.
proxy_handler = ProxyHandler({
    'http': 'url',
    'https': 'url'
})
opener = build_opener(proxy_handler)
try:
    # Context manager so the connection is closed once the body has been
    # read (the original left it open).
    with opener.open('https://www.baidu.com') as response:
        print(response.read().decode('utf-8'))
except URLError as e:
    print(e.reason)

In [None]:
Using requests

In [None]:
import requests
 
# Three alternative ways to write the `proxies` mapping for requests. Each
# assignment rebinds `proxies`, so only the last one (SOCKS5) is actually
# passed to requests.get below. 'url', 'host:port' and 'user:password' are
# placeholders to be replaced with real values.

# Variant 1: plain proxy URL per scheme.
proxies = {
    'http': 'url',
    'https': 'url'
}

# Variant 2: proxy that requires authentication, written as
# http://user:password@host:port

proxies = {
    "http": "http://user:password@10.10.1.10:3128/",
}

# Variant 3: SOCKS5 proxy.
# NOTE(review): SOCKS support in requests presumably needs the optional
# `requests[socks]` extra; confirm it is installed.

proxies = {
    'http': 'socks5://user:password@host:port',
    'https': 'socks5://user:password@host:port'
}

# Last expression of the cell, so the returned Response object is displayed.
requests.get("https://www.baidu.com", proxies=proxies)

## Robots

### Example: robots.txt

https://www.taobao.com/robots.txt


In [None]:
User-agent:  Baiduspider
Allow:  /article
Allow:  /oshtml
Disallow:  /product/
Disallow:  /

User-Agent:  Googlebot
Allow:  /article
Allow:  /oshtml
Allow:  /product
Allow:  /spu
Allow:  /dianpu
Allow:  /oversea
Allow:  /list
Disallow:  /

User-agent:  Bingbot
Allow:  /article
Allow:  /oshtml
Allow:  /product
Allow:  /spu
Allow:  /dianpu
Allow:  /oversea
Allow:  /list
Disallow:  /

User-Agent:  360Spider
Allow:  /article
Allow:  /oshtml
Disallow:  /

User-Agent:  Yisouspider
Allow:  /article
Allow:  /oshtml
Disallow:  /

User-Agent:  Sogouspider
Allow:  /article
Allow:  /oshtml
Allow:  /product
Disallow:  /

User-Agent:  Yahoo!  Slurp
Allow:  /product
Allow:  /spu
Allow:  /dianpu
Allow:  /oversea
Allow:  /list
Disallow:  /

User-Agent:  *
Disallow:  /

### RobotFileParser

In [160]:
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

# The URL must not contain stray whitespace: the original literal ended with a
# space, which current http.client versions reject as an invalid URL.
url = "http://httpbin.org/robots.txt"

rp = RobotFileParser(url)
# Download and parse robots.txt, then ask whether any user agent ('*') may
# fetch the two example paths.
rp.read()
print(rp.can_fetch('*', 'http://httpbin.org/deny'))
print(rp.can_fetch('*', "http://httpbin.org/image"))
# False
# True

False
True


## REFERENCES

In [None]:
- https://docs.python.org/3/library/urllib.html
- http://httpbin.org/