In [ ]:
import urllib.request
import urllib.parse
import socket
import traceback

In [ ]:
# Get your first web page ---------------------------------------------------------------
# The object returned by urlopen works as a context manager; `with` closes the
# connection as soon as the response has been inspected.
with urllib.request.urlopen("https://www.python.org") as response:
    print("The status code of response is >>> ", response.status)
    print("The header of response is >>> ", response.getheaders())
    print("The server of response is >>> ", response.getheader("Server"))

In [ ]:
# Request a web page with argument "data" ------------------------------------------------
# Supplying `data` makes urlopen send a POST request; the payload has to be bytes.
encode   = urllib.parse.urlencode({'Name':'Junkai'}) # Transform the dict to string
data     = bytes(encode, encoding = 'utf8')          # Transform the string to byte stream
# `with` closes the connection once the body has been read.
with urllib.request.urlopen("http://httpbin.org/post", data = data) as response:
    print("The encode is >>> ", encode, ">>> and the type of encode is >>> ", type(encode))
    print("The data in utf8 format is >>> ", data, " >>> and the type of the data is >>> ", type(data))
    print("The response body is >>> ", response.read().decode('utf-8'))

In [ ]:
# Request a web page with argument "timeout" --------------------------------------------
# The timeout is deliberately tiny so that the timeout handling can be demonstrated.
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout = 0.1)
except urllib.error.URLError as err:
    # err.reason is not necessarily a string; it may be an exception object
    if isinstance(err.reason, socket.timeout):
        print("TIME OUT!!!")
    else:
        # Do not swallow other failures (DNS error, refused connection, HTTP error ...)
        print("Request failed >>> ", err.reason)
except socket.timeout:
    # A timeout that fires after the request has been sent (while waiting for the
    # response) is raised as-is instead of being wrapped in URLError.
    print("TIME OUT!!!")
else:
    # The request unexpectedly beat the timeout; release the connection.
    response.close()


In [ ]:
# Request a web page in an advanced way ------------------------------------------------
# A Request object allows the content and method of the request to be specified in more detail

request  = urllib.request.Request('https://www.python.org')
# `with` closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    print("The response is >>> ", response.read().decode('utf-8')) # The body is bytes; decode it as UTF-8 text

In [ ]:
# Request a web page with all the arguments --------------------------------------------
url    = "http://httpbin.org/post"
header = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org' 
}
form = {
    'name': 'HelloWorld'
}
data = bytes(urllib.parse.urlencode(form), encoding = 'utf-8')

# Method 1: hand every argument to the Request constructor
# req  = urllib.request.Request(url = url, data = data, headers = header, method = 'POST')

# Method 2: build the Request first, then attach the headers one at a time.
# The headers come from `header` so both methods send exactly the same request.
req = urllib.request.Request(url = url, data = data, method = "POST")
for name, value in header.items():
    req.add_header(name, value)

# `with` closes the connection once the body has been read.
with urllib.request.urlopen(req) as response:
    print("The response is >>> ", response.read().decode('utf-8')) # The body is bytes; decode it as UTF-8 text

In [ ]:
# Request a web page in a more advanced way -------------------------------------------------
# Besides sending form data or setting request headers, a request may need more advanced
# configuration: authentication, proxies, cookies, redirects, error handling and so on.
# The handler classes in urllib.request are introduced here to provide those features.

# >>>>>>>>>> Authentication <<<<<<<
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

url      = "http://localhost:5000/"
username = "username"
password = "password"

p = HTTPPasswordMgrWithDefaultRealm()
# realm=None registers the credentials as the catch-all for this URL
p.add_password(None, url, username, password)
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)

try:
    # `with` closes the connection once the body has been read
    with opener.open(url) as result:
        html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)

In [ ]:

# >>>>>>>>>> Proxy settings <<<<<<<<
from urllib.request import ProxyHandler, build_opener
from urllib.error import URLError

url = "https://www.baidu.com"

# Maps URL scheme -> proxy URL.
# NOTE(review): presumably a local proxy is expected on 127.0.0.1:9999 — confirm;
# without one the request fails and the URLError branch below prints the reason.
proxy_config = {
    "http": "http://127.0.0.1:9999",
    "https": "https://127.0.0.1:9999"
}
proxy_handler = ProxyHandler(proxy_config)
opener = build_opener(proxy_handler)

try:
    # `with` closes the connection once the body has been read
    with opener.open(url) as result:
        html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)

In [ ]:

# >>>>>>>>>> Cookie handling <<<<<<<<
from urllib.request import HTTPCookieProcessor, build_opener
from urllib.error import URLError  # needed by the except clause below
import http.cookiejar

cookie  = http.cookiejar.CookieJar()

# Cookies can also be saved to a file; different CookieJar subclasses handle different cookie file formats
# cookie  = http.cookiejar.MozillaCookieJar("MozillaCookies.txt")
# cookie  = http.cookiejar.LWPCookieJar("LWPCookies.txt")

# An existing cookies file can be loaded as well
# cookie.load("cookies.txt", ignore_discard = True, ignore_expires = True)

handler = HTTPCookieProcessor(cookie)
opener  = build_opener(handler)

try:
    # Only the cookies collected by the handler matter, so the response is closed right away
    opener.open("https://www.baidu.com").close()
    for item in cookie:
        # %-formatting also copes with a cookie whose value is None
        print("%s = %s" % (item.name, item.value))
except URLError as e:
    print(e.reason)

# To save the cookies to a file, uncomment the next line
# cookie.save(ignore_discard = True, ignore_expires = True)


In [ ]:

# >>>>>>>>>> Exception handling <<<<<<<<<<<<
from urllib import request, error

try:
    response = request.urlopen('https://talkchip.com/demo.html')
except error.HTTPError as err:
    # HTTPError is a subclass of URLError, so it has to be caught first
    print(err.reason, err.code, err.headers, sep = '\n')
except error.URLError as err:
    print(err.reason)
except Exception:
    # Anything unexpected: show the full traceback instead of hiding it
    print(traceback.format_exc())
else:
    print('Request Successfully!')
    # The body is not needed here; release the connection
    response.close()

In [ ]:
# Using urllib to parse URLs ------------------------------------------------------------
# The code below walks through the various URL parsing helpers

import urllib
from urllib.request import urlopen
from urllib.parse import urlparse
from urllib.parse import urlunparse
from urllib.parse import urlsplit
from urllib.parse import urlunsplit
from urllib.parse import urljoin

# A complete URL has the form scheme://netloc/path;params?query#fragment
url = 'http://www.baidu.com/index.html;user?id=5#comment'

def print_parse_result(result):
    """Print each URL component of a parse result on its own line.

    Args:
        result: A ``urllib.parse.ParseResult`` (returned by ``urlparse``) or a
            ``urllib.parse.SplitResult`` (returned by ``urlsplit``). Any other
            object is ignored and nothing is printed.
    """
    if not isinstance(result, (urllib.parse.ParseResult, urllib.parse.SplitResult)):
        return

    print("\nresult.scheme: ", result.scheme)
    print("result.netloc: ", result.netloc)
    print("result.path: ", result.path)
    # Only ParseResult has a separate ``params`` field; SplitResult does not
    if isinstance(result, urllib.parse.ParseResult):
        print("result.params: ", result.params)
    print("result.query: ", result.query)
    print("result.fragment: ", result.fragment, "\n")

# >>>>>> Parsing / splitting a URL <<<<<<<<
# Three variants, in this order: plain urlparse, urlparse with fragment
# recognition switched off, and urlsplit (which differs slightly from urlparse).
for result in (
    urlparse(url),
    urlparse(url, allow_fragments = False),
    urlsplit(url),
):
    print(type(result), result, sep = "\n")
    print_parse_result(result)

# >>>>>> Joining components back into a URL; the number of components is strictly enforced <<<<<<
# urlunparse takes six components ...
data = ['http', 'www.baidu.com', "index.html", "user", "a=6", "comment"]
print(urlunparse(data))

# ... while urlunsplit takes five
data = ['http', 'www.baidu.com', 'index.html', 'a=6', 'comment']
print(urlunsplit(data))

# urljoin reads the scheme, netloc and path of its first argument and uses them
# to fill in whatever the second argument lacks: missing parts are supplied,
# parts that are already present are not replaced.
print(urljoin('http://www.baidu.com', 'FAQ.html'))

In [ ]:
from urllib.parse import urlencode
from urllib.parse import parse_qs
from urllib.parse import parse_qsl

# >>>>>>> Define the parameters in a dict, then append them to the URL as GET query arguments <<<<<<
params = dict(name='talkchip', age='2')
base_url = 'http://www.baidu.com?'
new_url  = ''.join((base_url, urlencode(params)))
print(new_url)

# >>>>>> The reverse: parse a query string into a dict and into a list of tuples <<<<<<<
query = 'name=talkchip&age=2'
print('parse_qs result >>> ', parse_qs(query))
print('parse_qsl result >>> ', parse_qsl(query))


In [ ]:
from urllib.parse import quote
from urllib.parse import unquote

# >>>>>> Percent-encode text for use in a URL, and decode it back again <<<<<<<
keyword = '铁观音'
url_url = 'https://www.baidu.com/s?wd=' + quote(keyword)
zh_url  = unquote(url_url)
# Show the encoded URL next to its decoded form
print(url_url, zh_url, sep = '\n')

In [ ]:
# Working with robots.txt: parse the file to decide which pages may be crawled
from urllib.robotparser import RobotFileParser
from urllib import request, error

# Method 1: let the parser download robots.txt itself
rp = RobotFileParser('http://www.jianshu.com/robots.txt')
rp.read() # Fetch the file and parse it
print("Method 1 >>> ", rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print("Method 1 >>> ", rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))

# Method 2: download robots.txt ourselves and hand its lines to parse()
rp = RobotFileParser()
# rp.set_url('http://www.jianshu.com/robots.txt')
try:
    # The rules must come from the same site as the URLs checked below;
    # `with` closes the connection once the file has been read.
    with request.urlopen('http://www.jianshu.com/robots.txt') as response:
        rp.parse(response.read().decode('utf-8').split('\n'))
    print("Method 2 >>> ", rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
    print("Method 2 >>> ", rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
except error.HTTPError as err:
    # HTTPError is a subclass of URLError, so it has to be caught first
    print(err.reason, err.code, err.headers, sep = '\n')
except error.URLError as err:
    print(err.reason)
except Exception:
    print(traceback.format_exc())
else:
    print('Request Successfully!')