## Python Web Crawler 5 - Requests

## Quickstart

In [12]:
import requests as R

# Plain GET: httpbin echoes the request back as JSON.
# A timeout is passed on every call so a stalled connection cannot hang the cell.
print(R.get('http://httpbin.org/get', timeout=5).text)
# {
#   "args": {},
#   "headers": {
#     "Accept": "*/*",
#     "Accept-Encoding": "gzip, deflate",
#     "Connection": "close",
#     "Host": "httpbin.org",
#     "User-Agent": "python-requests/2.18.4"
#   },
#   "origin": "115.196.156.112",
#   "url": "http://httpbin.org/get"
# }
# Plain POST with no body: "data", "form" and "json" come back empty.
print(R.post('http://httpbin.org/post', timeout=5).text)
# {
#   "args": {},
#   "data": "",
#   "files": {},
#   "form": {},
#   "headers": {
#     "Accept": "*/*",
#     "Accept-Encoding": "gzip, deflate",
#     "Connection": "close",
#     "Content-Length": "0",
#     "Host": "httpbin.org",
#     "User-Agent": "python-requests/2.18.4"
#   },
#   "json": null,
#   "origin": "115.196.156.112",
#   "url": "http://httpbin.org/post"
# }

# Inspect the most commonly used attributes of a Response object.
req = R.get('https://www.bing.com/search?q=bing', timeout=5)
print(req.status_code)
# 200
# Encoding to bytes avoids console errors on terminals that cannot print
# every character of the decoded page.
print(req.text.encode("utf-8"))
#  b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 ...
print(req.headers['content-type'])
# text/html; charset=utf-8
print(req.url)
# https://www.bing.com/search?q=bing
# history lists the redirect responses followed to reach the final URL (none here).
print(req.history)
# []
# req.cookies is a RequestsCookieJar; items() yields (name, value) pairs.
for key, value in req.cookies.items():
    print(key + '=' + value)
# MUID=05319260E985628908A499CFE832631F
# SRCHD=AF=NOFORM
# ...

{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.18.4"
  }, 
  "origin": "115.196.158.72", 
  "url": "http://httpbin.org/get"
}

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Content-Length": "0", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.18.4"
  }, 
  "json": null, 
  "origin": "115.196.158.72", 
  "url": "http://httpbin.org/post"
}

200
text/html; charset=utf-8
https://www.bing.com/search?q=bing
[]
MUID=1FFD443EF62A63021DBB4F8FF79D62E2
SRCHD=AF=NOFORM
SRCHUID=V=2&GUID=6CBA27EC7B824AD28534C124C997BABD&dmnchg=1
SRCHUSR=DOB=20180313
_EDGE_S=F=1&SID=3D9DA2631EF0671E2191A9D21F476611
_EDGE_V=1
_SS=SID=3D9DA2631EF0671E2191A9D21F476611
MUIDB=1FFD443EF62A63021DBB4F8FF79D62E2


In [None]:
# Key/value pairs reused below both as query-string parameters and as form data.
song = {
    'song1': 'Into the Well' ,
    'song2':'Lift Me Up'
}
# Printing the Response object itself only shows its status.
print(R.get("http://httpbin.org/get", params=song))
# <Response [200]>
# params= is encoded into the query string (see "args" and "url" in the echo).
print(R.get("http://httpbin.org/get", params=song).text) # str
# {
#   "args": {
#     "song1": "Into the Well",
#     "song2": "Lift Me Up"
#   },
#   "headers": {
#     "Accept": "*/*",
#     "Accept-Encoding": "gzip, deflate",
#     "Connection": "close",
#     "Host": "httpbin.org",
#     "User-Agent": "python-requests/2.18.4"
#   },
#   "origin": "115.196.156.112",
#   "url": "http://httpbin.org/get?song1=Into+the+Well&song2=Lift+Me+Up"
# }
# data= is sent as a form-encoded body ("form" in the echo), independently of
# params=, which still ends up in the URL ("args").
print(R.post("http://httpbin.org/post", params=song,data=song).text) # str
# {
#   "args": {
#     "song1": "Into the Well",
#     "song2": "Lift Me Up"
#   },
#   "data": "",
#   "files": {},
#   "form": {
#     "song1": "Into the Well",
#     "song2": "Lift Me Up"
#   },
#   "headers": {
#     "Accept": "*/*",
#     "Accept-Encoding": "gzip, deflate",
#     "Connection": "close",
#     "Content-Length": "36",
#     "Content-Type": "application/x-www-form-urlencoded",
#     "Host": "httpbin.org",
#     "User-Agent": "python-requests/2.18.4"
#   },
#   "json": null,
#   "origin": "115.196.156.112",
#   "url": "http://httpbin.org/post?song1=Into+the+Well&song2=Lift+Me+Up"
# }
# The query string can also be written directly into the URL; .json() decodes
# the JSON body into a Python dict.
print(R.get('http://httpbin.org/get?song1=Into+the+Well&song2=Lift+Me+Up2').json())  # dict
# {'args': {'song1': 'Into the Well', 'song2': 'Lift Me Up2'}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate',\
# 'Connection': 'close', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.18.4'},\
# 'origin': '115.196.156.112', 'url': 'http://httpbin.org/get?song1=Into+the+Well&song2=Lift+Me+Up2'}

## Grab data

### Cookies

In [2]:
import re
 
# Browser-like request headers for www.zhihu.com.
# NOTE(review): the Cookie value appears to carry a logged-in session; hard-coding
# it in a shared notebook leaks credentials. Prefer loading it from an
# environment variable or a prompt before publishing.
headers = {
    'Cookie': '_zap=400b2450-a8d3-4e2c-9157-3961f50262c3; d_c0="AFACfRAuQgyPTsGYfK3nqLA1fGlZo6FhSDA=|1503379690"; q_c1=3b17d61410ee44acaf64653951a689c5|1508817203000|1503407076000; __utma=51854390.305172480.1512467100.1512467100.1512467100.1; __utmz=51854390.1512467100.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=51854390.100--|2=registration_date=20170420=1^3=entry_date=20170420=1; z_c0="2|1:0|10:1516533477|4:z_c0|92:Mi4xOVNXX0JBQUFBQUFBVUFKOUVDNUNEQ1lBQUFCZ0FsVk41Y1JSV3dCVTMtVEVTdXRzNVNqeXBTM1F2T3BTeXJWYl9R|5bb9207c1f2f31b36435d16cdd9ed3d98a085ffe20ce1cdae142bc2f576e0317"; q_c1=3b17d61410ee44acaf64653951a689c5|1520309043000|1503407076000; aliyungf_tc=AQAAAAa7ynFbXAUAcJzEcwalmjCy9Dhx; _xsrf=d4b95018-fa55-4f74-9f04-da741c27e36b',
    'Host': 'www.zhihu.com',
    # Header value only: the original repeated the header name ("User-Agent:")
    # inside the value, so the server received "User-Agent: User-Agent:Mozilla/...".
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 YaBrowser/18.2.0.284Yowser/2.5 Safari/537.36'
}
# Fetch the Explore page using the headers dict built earlier in this cell.
req = R.get("https://www.zhihu.com/explore", headers=headers)

# Carry on only when the server answered 200 OK; otherwise leave the interpreter.
if req.status_code == R.codes.ok:
    print('Request Successfully')
else:
    exit()

# re.S lets '.' match newlines, so one pattern can span a multi-line <a> element.
# The capture group is the text of each question link.
pattern = re.compile('<a.*?question_link.*?>(.*?)</a>', flags=re.S)
titles = pattern.findall(req.text)
print(titles)

# Second pattern: the <span> text inside the top-navigation user-info link.
pattern2 = re.compile('<a.*?zu-top-nav-userinfo.*?<span.*?>(.*?)</span>.*?Avatar', flags=re.S)
username = pattern2.findall(req.text)
print(username)

Request Successfully
['\n你青春期是怎么长高的？有哪些好的方法？都经历了哪些过程？\n', '\n如何评价 2018 年普利兹克建筑奖颁给巴克里希纳·多西？\n', '\n如何看待美国心理学会拿出资金支持高中心理老师区域网络的建立、维护？\n', '\n你听过管理者最无耻的一句话是什么？\n', '\n如何看待美团开源的 mpvue ?\n', '\nDota2有哪些观赏性特别高的比赛？\n', '\n有哪些惊艳到你的句子？\n', '\n动漫里有哪些令人敬佩的女性角色，可以用视频分享吗？\n', '\n有哪些眼前一亮的暗中称妙的成语？\n', '\n为了高考你有多拼命？\n', '\n中国有哪些值得拍成影视剧的历史或人物？\n', '\n乌克兰是不是真的很穷？\n', '\n在p社游戏过程中，有哪些让你对现实产生思考的时候？\n', '\n当当网是如何走到被收购这一步的？\n']
['Lyole']


In [110]:
# Send the same cookies through a RequestsCookieJar (cookies=) instead of a
# hand-written Cookie header. `headers` and `pattern2` come from the previous cell.
cookies = headers['Cookie']
jar = R.cookies.RequestsCookieJar()
headers2 = {
    'Host': 'www.zhihu.com',
    'User-Agent': headers["User-Agent"]
}
for cookie in cookies.split(';'):
    # Pairs are separated by "; ", so strip the padding; without this every
    # cookie after the first is stored under a name with a leading space.
    cookie = cookie.strip()
    if not cookie:
        continue  # tolerate a trailing or doubled ';'
    # Split on the first '=' only: values such as d_c0 contain '=' themselves.
    # partition() also copes with a fragment that has no '=' at all.
    key, _, value = cookie.partition('=')
    jar.set(key, value)
req = R.get("https://www.zhihu.com/explore", cookies=jar, headers=headers2)
username = re.findall(pattern2, req.text)
print(username)

['Lyole']


### Keep-Alive

In [None]:
from requests import Request, Session
# Two independent top-level calls: the cookie set by the first request does not
# show up in the second one (see the expected output right below).
R.get('http://httpbin.org/cookies/set/number/1995')
req = R.get('http://httpbin.org/cookies')
print(req.text)
# {
#   "cookies": {}
# }
# Same two requests issued through one Session object: this time the cookie is
# still present on the second request (see the expected output below).
s = Session()
s.get('http://httpbin.org/cookies/set/number/1995')
req= s.get('http://httpbin.org/cookies')
print(req.text)
# {
#   "cookies": {
#     "number": "1995"
#   }
# }

In [None]:
# Build a request by hand, let a Session prepare it, then send it.
url = 'http://httpbin.org/post'
# Form fields for the POST body (echoed back under "form").
data = {
    'Saycet': '15'
} 
# Reuses the User-Agent from `headers`, which is not defined in this cell and
# must already exist in the kernel (it is assigned in the Cookies section).
headers3 = {
    'User-Agent': headers["User-Agent"]
}  
s = Session()
# `req` is first the outgoing Request object ...
req = Request('POST', url, data=data, headers=headers3)
prepped = s.prepare_request(req)
# ... and is then rebound to the value returned by send(), whose .text is printed.
req = s.send(prepped)
print(req.text)
# {
#   "args": {}, 
#   "data": "", 
#   "files": {}, 
#   "form": {
#     "Saycet": "15"
#   }, 
#   "headers": {
#     "Accept": "*/*", 
#     "Accept-Encoding": "gzip, deflate", 
#     "Connection": "close", 
#     "Content-Length": "9", 
#     "Content-Type": "application/x-www-form-urlencoded", 
#     "Host": "httpbin.org", 
#     "User-Agent": "User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 YaBrowser/18.2.0.284Yowser/2.5 Safari/537.36"
#   }, 
#   "json": null, 
#   "origin": "183.246.20.118", 
#   "url": "http://httpbin.org/post"
# }

### SSL

In [None]:
# With certificate verification on (the default) this request is expected to
# fail with SSLError. Catch and display it so the demonstration is visible and
# the rest of the cell still runs.
try:
    response = R.get('https://www.12306.cn')
    print(response.status_code)
except R.exceptions.SSLError as err:
    print('SSLError:', err)
# SSLError: HTTPSConnectionPool
# verify=False skips certificate validation: the request goes through, at the
# price of the warning quoted below.
response = R.get('https://www.12306.cn', verify=False)
print(response.status_code)
# 200
# Warning:  Adding certificate verification is strongly advised.

In [None]:
import logging
# Redirect warnings issued via the `warnings` module to the logging system,
# presumably so the verify=False warning from the previous cell no longer
# appears in the cell output.
logging.captureWarnings(True)
response = R.get('https://www.12306.cn', verify=False)
# Variant passing a certificate/key pair (requests is imported as R in this notebook):
# response = R.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))
print(response.status_code)
# 200

### Files

In [None]:
# Download a PNG. `content` is the raw response body as bytes, hence the
# binary-mode ('wb') file below.
req = R.get("http://httpbin.org/image/png")
# print(req.text)
# print(req.content)
with open('pig.png', 'wb') as f:
    f.write(req.content)

# upload data
# Keep the file open only for the duration of the POST; the original left the
# handle returned by open() unclosed.
with open('pig.png', 'rb') as png:
    files = {'file': png}
    req = R.post("http://httpbin.org/post", files=files)
# print(req.text)

### Auth

In [None]:
from requests_oauthlib import OAuth1

# HTTP Basic auth: pass a (username, password) tuple as auth=.
# 'url', 'username' and 'password' are placeholders to be replaced by real values.
req = R.get('url', auth=('username', 'password'))
# The response is bound to `req`; the original read the undefined name `r`.
print(req.status_code)

# OAuth 1 via requests_oauthlib: the four strings are placeholders for the
# application key/secret and the user token/secret.
url = 'https://api.twitter.com/1.1/account/verify_credentials.json'
auth = OAuth1('YOUR_APP_KEY', 'YOUR_APP_SECRET',
              'USER_OAUTH_TOKEN', 'USER_OAUTH_TOKEN_SECRET')
# requests is imported as `R` in this notebook; the bare name `requests` is not defined.
R.get(url, auth=auth)

## Status Code

In [None]:
# Mapping of HTTP status code -> tuple of symbolic aliases for that code.
# The first cell of this notebook compares against one of these aliases
# (R.codes.ok); the table appears to be copied from the status_codes.py module
# linked in the references.
# Note: the alias 'precondition' is listed under both 412 and 428.
_codes = {

    # Informational.
    100: ('continue',),
    101: ('switching_protocols',),
    102: ('processing',),
    103: ('checkpoint',),
    122: ('uri_too_long', 'request_uri_too_long'),

    # Success.
    200: ('ok', 'okay', 'all_ok', 'all_okay', 'all_good', '\\o/', '✓'),
    201: ('created',),
    202: ('accepted',),
    203: ('non_authoritative_info', 'non_authoritative_information'),
    204: ('no_content',),
    205: ('reset_content', 'reset'),
    206: ('partial_content', 'partial'),
    207: ('multi_status', 'multiple_status', 'multi_stati', 'multiple_stati'),
    208: ('already_reported',),
    226: ('im_used',),

    # Redirection.
    300: ('multiple_choices',),
    301: ('moved_permanently', 'moved', '\\o-'),
    302: ('found',),
    303: ('see_other', 'other'),
    304: ('not_modified',),
    305: ('use_proxy',),
    306: ('switch_proxy',),
    307: ('temporary_redirect', 'temporary_moved', 'temporary'),
    308: ('permanent_redirect',
          'resume_incomplete', 'resume',),  # These 2 to be removed in 3.0

    # Client Error.
    400: ('bad_request', 'bad'),
    401: ('unauthorized',),
    402: ('payment_required', 'payment'),
    403: ('forbidden',),
    404: ('not_found', '-o-'),
    405: ('method_not_allowed', 'not_allowed'),
    406: ('not_acceptable',),
    407: ('proxy_authentication_required', 'proxy_auth', 'proxy_authentication'),
    408: ('request_timeout', 'timeout'),
    409: ('conflict',),
    410: ('gone',),
    411: ('length_required',),
    412: ('precondition_failed', 'precondition'),
    413: ('request_entity_too_large',),
    414: ('request_uri_too_large',),
    415: ('unsupported_media_type', 'unsupported_media', 'media_type'),
    416: ('requested_range_not_satisfiable', 'requested_range', 'range_not_satisfiable'),
    417: ('expectation_failed',),
    418: ('im_a_teapot', 'teapot', 'i_am_a_teapot'),
    421: ('misdirected_request',),
    422: ('unprocessable_entity', 'unprocessable'),
    423: ('locked',),
    424: ('failed_dependency', 'dependency'),
    425: ('unordered_collection', 'unordered'),
    426: ('upgrade_required', 'upgrade'),
    428: ('precondition_required', 'precondition'),
    429: ('too_many_requests', 'too_many'),
    431: ('header_fields_too_large', 'fields_too_large'),
    444: ('no_response', 'none'),
    449: ('retry_with', 'retry'),
    450: ('blocked_by_windows_parental_controls', 'parental_controls'),
    451: ('unavailable_for_legal_reasons', 'legal_reasons'),
    499: ('client_closed_request',),

    # Server Error.
    500: ('internal_server_error', 'server_error', '/o\\', '✗'),
    501: ('not_implemented',),
    502: ('bad_gateway',),
    503: ('service_unavailable', 'unavailable'),
    504: ('gateway_timeout',),
    505: ('http_version_not_supported', 'http_version'),
    506: ('variant_also_negotiates',),
    507: ('insufficient_storage',),
    509: ('bandwidth_limit_exceeded', 'bandwidth'),
    510: ('not_extended',),
    511: ('network_authentication_required', 'network_auth', 'network_authentication'),
}


## PyQuery

OK, let's try downloading some data from https://www.zhihu.com/explore

![zhihu explore](http://oxswstz0r.bkt.clouddn.com/Zhihu.PNG)

In [19]:
from pyquery import PyQuery as pq
 
# Browser-like request headers for www.zhihu.com, redefined here so this cell
# does not depend on the Cookies section having been run.
# NOTE(review): the Cookie value appears to carry a logged-in session; hard-coding
# it in a shared notebook leaks credentials. Prefer loading it from an
# environment variable or a prompt before publishing.
headers = {
    'Cookie': '_zap=400b2450-a8d3-4e2c-9157-3961f50262c3; d_c0="AFACfRAuQgyPTsGYfK3nqLA1fGlZo6FhSDA=|1503379690"; q_c1=3b17d61410ee44acaf64653951a689c5|1508817203000|1503407076000; __utma=51854390.305172480.1512467100.1512467100.1512467100.1; __utmz=51854390.1512467100.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=51854390.100--|2=registration_date=20170420=1^3=entry_date=20170420=1; z_c0="2|1:0|10:1516533477|4:z_c0|92:Mi4xOVNXX0JBQUFBQUFBVUFKOUVDNUNEQ1lBQUFCZ0FsVk41Y1JSV3dCVTMtVEVTdXRzNVNqeXBTM1F2T3BTeXJWYl9R|5bb9207c1f2f31b36435d16cdd9ed3d98a085ffe20ce1cdae142bc2f576e0317"; q_c1=3b17d61410ee44acaf64653951a689c5|1520309043000|1503407076000; aliyungf_tc=AQAAAAa7ynFbXAUAcJzEcwalmjCy9Dhx; _xsrf=d4b95018-fa55-4f74-9f04-da741c27e36b',
    'Host': 'www.zhihu.com',
    # Header value only: the original repeated the header name ("User-Agent:")
    # inside the value.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 YaBrowser/18.2.0.284Yowser/2.5 Safari/537.36'
}
# Download the Explore page; despite its name, `req` holds the decoded HTML text.
req = R.get("https://www.zhihu.com/explore", headers=headers).text
doc = pq(req)
# One element per feed entry under the explore tab.
items = doc('.explore-tab .feed-item').items()
# Open the output file once, in append mode. `with` guarantees the handle is
# closed even if parsing one of the items raises; the original reopened the
# file on every iteration and closed it by hand.
with open('Zhihu.txt', 'a', encoding='utf-8') as file:
    for item in items:
        question = item.find('h2').text()
        print(question)
        author = item.find('.author-link-line').text()
        # The <textarea> content is itself markup: re-parse it and keep only the text.
        answer = pq(item.find('textarea').html()).text()
        file.write('\n'.join([question, author, answer]))
        # A rule of 100 dashes separates consecutive records.
        file.write('\n' + '-' * 100 + '\n')

女生讨厌或不欣赏女生哪些行为？
作为医生看见面前一个暂时活蹦乱跳的喝了百草枯的病人是怎么的一种感觉？
如何看待外媒称日本以研修生名义,骗越南人到福岛清理核垃圾?
为什么大家肯定话剧演员的演技，但是越来越少的人去看话剧？85后为什么不愿意去剧院看话剧？
能不能推荐一些适合做壁纸的名画？
有哪些眼前一亮的暗中称妙的成语？
为了高考你有多拼命？
中国有哪些值得拍成影视剧的历史或人物？
乌克兰是不是真的很穷？
在p社游戏过程中，有哪些让你对现实产生思考的时候？


## REFERENCES
- http://www.python-requests.org/en/master/
- http://docs.python-requests.org/zh_CN/latest
- https://github.com/requests/requests/blob/master/requests/status_codes.py