In [ ]:
import urllib.request
import urllib.parse
import socket
import traceback

In [ ]:
# Get your first web page ---------------------------------------------------------------
# Open the URL inside a `with` block so the underlying connection is closed
# as soon as the status line and headers have been reported.
with urllib.request.urlopen("https://www.python.org") as response:
    print("The status code of response is >>> ", response.status)
    print("The header of response is >>> ", response.getheaders())
    print("The server of response is >>> ", response.getheader("Server"))

In [ ]:
# Request a web page with argument "data" ------------------------------------------------
# The form is URL-encoded to a str and then converted to bytes, because
# urlopen() expects the request body as a bytes object.
encode = urllib.parse.urlencode({'Name':'Junkai'})  # dict -> query string
data   = bytes(encode, encoding = 'utf8')            # str  -> byte stream
print("The encode is >>> ", encode, ">>> and the type of encode is >>> ", type(encode))
print("The data in utf8 format is >>> ", data, " >>> and the type of the data is >>> ", type(data))

# The `with` block guarantees the response (and its socket) is closed once
# the body has been read.
with urllib.request.urlopen("http://httpbin.org/post", data = data) as response:
    print("The response body is >>> ", response.read().decode('utf-8'))

In [ ]:
# Request a web page with argument "timeout" ---------------------------------------
# A very small timeout (0.1 s) is used on purpose so the timeout path is easy
# to trigger. Every outcome is reported: success, timeout, or another failure.
try:
    with urllib.request.urlopen("http://httpbin.org/get", timeout = 0.1) as response:
        print("The status code of response is >>> ", response.status)
except socket.timeout:
    # A timeout that happens while waiting for the response is raised as a
    # bare socket.timeout instead of being wrapped in URLError.
    print("TIME OUT!!!")
except urllib.error.URLError as err:
    if isinstance(err.reason, socket.timeout):
        # A timeout while connecting/sending arrives wrapped in URLError.
        print("TIME OUT!!!")
    else:
        # Any other failure (DNS, refused connection, HTTP error status) is
        # shown instead of being silently dropped.
        print("The request failed >>> ", err.reason)


In [ ]:
# Request a web page in an advanced way ------------------------------------------------
# Build an explicit Request object first; urlopen() accepts it in place of a
# plain URL string.
request = urllib.request.Request('https://www.python.org')

# Close the response deterministically once the body has been read.
with urllib.request.urlopen(request) as response:
    # The body is bytes; decode it with the charset you expect (UTF-8 here).
    print("The response is >>> ", response.read().decode('utf-8'))

In [ ]:
# Request a web page with all the arguments --------------------------------------------
url    = "http://httpbin.org/post"
header = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
form = {
    'name': 'HelloWorld'
}
# URL-encode the form and convert it to bytes for use as the POST body.
data = bytes(urllib.parse.urlencode(form), encoding = 'utf-8')

# Method 1 (alternative): pass every header at construction time with
# Request(url = url, data = data, headers = header, method = 'POST').

# Method 2: create the request first, then attach headers one by one.
# The headers come from the `header` dict above so the values are defined in
# exactly one place.
req = urllib.request.Request(url = url, data = data, method = "POST")
for header_name, header_value in header.items():
    req.add_header(header_name, header_value)

# Close the response deterministically once the body has been read.
with urllib.request.urlopen(req) as response:
    # The body is bytes; decode it with the charset you expect (UTF-8 here).
    print("The response is >>> ", response.read().decode('utf-8'))

In [1]:
# Request a web page in a more advanced way -------------------------------------------------
# Requesting a page is not only about sending form data or setting request
# headers; urllib also offers a more advanced level of configuration.
# It covers authentication, proxy settings, cookie handling, redirects,
# error handling and other aspects of the request process.
# The handler classes under urllib are what provide these features.

# ---------- Authentication ----------
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

url      = "http://localhost:5000/"
# Placeholder credentials for a local test server; do not hardcode real ones.
username = "username"
password = "password"

# Register the credentials for `url`; passing None as the realm makes them
# apply to whatever realm the server announces.
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)

try:
    # Close the response as soon as the body has been read.
    with opener.open(url) as result:
        html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    # Report why the request failed (e.g. connection refused).
    print(e.reason)

[WinError 10061] 由于目标计算机积极拒绝，无法连接。


In [2]:

# ---------- Proxy settings ----------
from urllib.request import ProxyHandler, build_opener
from urllib.error import URLError

url = "https://www.baidu.com"

# Map each URL scheme to the proxy that should carry it.
# NOTE(review): the "https" entry uses an https:// proxy URL; many local
# proxies are addressed as http:// even for HTTPS traffic - confirm against
# the proxy actually listening on port 9999.
proxy_config = {
    "http": "http://127.0.0.1:9999",
    "https": "https://127.0.0.1:9999"
}
proxy_handler = ProxyHandler(proxy_config)
opener = build_opener(proxy_handler)

try:
    # Close the response as soon as the body has been read.
    with opener.open(url) as result:
        html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    # Report why the request failed (e.g. the proxy is not reachable).
    print(e.reason)

In [3]:

# ---------- Cookie handling ----------
from urllib.request import build_opener
from urllib.error import URLError  # imported here so this cell does not rely on an earlier cell
import urllib.request
import http.cookiejar

# The CookieJar collects cookies set by responses that pass through the
# HTTPCookieProcessor attached to the opener.
cookie  = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener  = build_opener(handler)

try:
    # Only the cookies are of interest, so the response is closed right away.
    with opener.open("https://www.baidu.com") as result:
        pass
    for item in cookie:
        print(item.name + " = " + item.value)
except URLError as e:
    # Report why the request failed.
    print(e.reason)



BAIDUID = AB458A2CD0FE60050169C3EBDF9ED36B:FG=1
BIDUPSID = AB458A2CD0FE6005EB4C3E5413BE02F5
PSTM = 1575738590
BD_NOT_HTTPS = 1
