# Python 爬虫基础

In [1]:
import requests

# httpbin endpoint that echoes back whatever is POSTed to it.
urlbin = "http://httpbin.org/post"

# Browser-like User-Agent header. The value is one string; adjacent literals
# are concatenated by Python, which keeps each source line short.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
    ),
}

## 简单post

In [3]:
# Form-encoded POST of two fields. The call stays the cell's final expression
# so the notebook displays the returned Response.
requests.post(urlbin, data=dict(name="jim", age="22"))

<Response [200]>

## post 文件

In [4]:
# Upload a file as multipart/form-data under the form field name "file".
# The `with` block closes the file handle once the request has been sent;
# the original left it open for the lifetime of the kernel.
with open("im.py", "rb") as upload:
    files = {"file": upload}
    upload_response = requests.post(urlbin, files=files)

# Final expression of the cell, so the notebook still displays the Response.
upload_response

<Response [200]>

## 获取cookies

In [5]:
# Fetch a page and print every cookie the server set, one "name=value" per line.
r = requests.get("https://baidu.com")
for cookie_name, cookie_value in r.cookies.items():
    print("=".join((cookie_name, cookie_value)))

BDORZ=27315


## Session 会话维持

In [6]:
# A Session carries cookies from one request to the next. Using it as a
# context manager closes it when the block ends, instead of leaving it open.
with requests.Session() as s:
    # First request asks httpbin to set the cookie "number"...
    s.get("http://httpbin.org/cookies/set/number/123345")
    # ...the second one asks httpbin to echo back the cookies it received.
    r = s.get("http://httpbin.org/cookies")
print(r.text)

{
  "cookies": {
    "number": "123345"
  }
}



## 忽略SSL证书验证

In [4]:
# verify=False switches off TLS certificate verification for this request.
response = requests.get(
    "https://www.12306.cn",
    verify=False,
)
print(response.status_code)

200


### 忽略警告

In [3]:
import logging

# Redirect messages issued through the `warnings` module into the logging
# system, so the warning caused by verify=False is not shown as a warning.
logging.captureWarnings(True)

response = requests.get(
    "https://www.12306.cn",
    verify=False,
)
print(response.status_code)

200


### 设置本地证书

In [None]:
# Client-side certificate passed as a (certificate file, key file) pair.
# Both paths are placeholders.
client_cert = ("/path/server.crt", "/path/key")
response = requests.get("https://www.12306.cn", cert=client_cert)
print(response.status_code)

## 设置代理访问

In [None]:
# Map the scheme of the *target* URL ("http" / "https") to the proxy to use.
# A dict literal keeps only the last value of a repeated key, so the original
# two "http" entries collapsed into one, and a "socks5" key is never looked
# up for an http:// or https:// URL. Use one entry per scheme instead.
proxies = {
    "http": "http://118.163.120.181:58837",
    "https": "http://118.163.120.181:58837",
}
# Other proxy URL forms that can be used as the values above:
#   with credentials: "http://user:password@host:port"
#   SOCKS proxy:      "socks5://user:passwd@host:port"  (pip install requests[socks])

rs = requests.get(urlbin, proxies=proxies)
print(rs)

## 超时设置

In [None]:
# Three ways of passing `timeout` to requests.get.
ra = requests.get("https://www.taobao.com", timeout=1) # wait at most 1 s
rb = requests.get("https://www.taobao.com", timeout=(5, 11)) # (connect, read) tuple: connect 5 s, read 11 s
rc = requests.get("https://www.taobao.com", timeout=None) # no timeout: wait indefinitely

print(ra.status_code)
print(rb.status_code)
print(rc.status_code)

## 身份认证（弹出式界面）

In [23]:
from requests.auth import HTTPBasicAuth

# HTTP Basic authentication. Passing a plain (user, password) tuple as `auth`
# is shorthand for building this HTTPBasicAuth object.
credentials = HTTPBasicAuth("root", "password")
r = requests.get("http://192.168.5.1", auth=credentials)
print(r.status_code)

403


### OAuth1 认证

In [None]:
# pip install requests_oauthlib

from requests_oauthlib import OAuth1

# Endpoint used to check the credentials; all four OAuth1 values below are
# placeholders to be replaced with real keys and tokens.
url = "https://api.twitter.com/1.1/account/verify_credentials.json"
auth = OAuth1(
    "YOUR_APP_KEY",
    "YOUR_APP_SECRET",
    "USER_OAUTH_TOKEN",
    "USER_OAUTH_TOKEN_SECRET",
)

# Final expression of the cell, so the notebook displays the Response.
requests.get(url, auth=auth)

# Request 对象

In [None]:
from requests import Request, Session

# Build a request by hand, let the Session prepare it, then send it.
data = {"name": "germey"}
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
}

s = Session()
# urlbin already ends in "/post"; appending "post" again produced
# "http://httpbin.org/postpost", which is not the echo endpoint.
req = Request("POST", urlbin, data=data, headers=headers)
prepped = s.prepare_request(req)
r = s.send(prepped)

print(r.text)

# 正则表达式

## [正则表达式规则](https://www.runoob.com/regexp/regexp-syntax.html)

In [4]:
import re

## match
> 检测正则表达式是否匹配输入字符串

In [6]:
content = "Hello 123 4567 World_This is a Regex Demo"

print(len(content))
# Raw string: \s, \d and \w reach the regex engine unchanged instead of being
# treated as (invalid) string escape sequences by the Python parser.
result = re.match(r"^Hello\s(\d+)\s(\d+)\s\w+", content)
print(result)
print(result.group(0))  # the whole matched text
print(result.group(1))  # first capture group
print(result.span())  # (start, end) offsets of the match

41
<re.Match object; span=(0, 25), match='Hello 123 4567 World_This'>
Hello 123 4567 World_This
123
(0, 25)


### 修饰符
> 通过修饰符来控制匹配模式

修饰符|描述
--|--
re.I|忽略大小写
re.L|做本地化识别（locale-aware）匹配
re.M|多行匹配，影响 ^ 和 $
re.S|使 . 匹配包括换行在内的所有字符
re.U|根据Unicode字符集解析字符。这个标志影响 \w, \W, \b, \B.
re.X|该标志通过给予你更灵活的格式以便你将正则表达式写得更易于理解。

In [9]:
content = """Hello 1234567
World_This is a Regex Demo
"""

# re.S lets "." match newlines too, so the pattern can run from "He" on the
# first line to "Demo" on the second. The raw string keeps \d a regex escape
# rather than an invalid string escape.
result = re.match(r"^He.*?(\d+).*?Demo", content, re.S)
print(result.group(1))

1234567


## search
> 扫描整个字符串并返回第一个成功的匹配

In [10]:
content = "Extra string Hello 1234567 World_This is a Regex Demo"

# Written once as a raw string so \d is a regex escape, not a string escape.
demo_pattern = r"Hello.*?\d+.*?Demo"

# re.match only tries at position 0, where the text starts with "Extra",
# so it finds nothing and returns None.
result = re.match(demo_pattern, content)
print(result)

# re.search scans the whole string and returns the first match it finds.
result = re.search(demo_pattern, content)
print(result)

None
<re.Match object; span=(13, 53), match='Hello 1234567 World_This is a Regex Demo'>


In [40]:
# Download the page source as text.
r = requests.get("https://hitokoto.cn/").text

# Capture the text between <div class="word" ...> and the next </div>.
result = re.search('<div class="word".*?>(.*?)</div>', r)
# re.search returns None when the pattern is not found; check before calling
# .group() so a changed page layout does not raise AttributeError.
if result is not None:
    print(result.group(1))
else:
    print("No match found")

我怕音乐，它总是让我想起一些不该想的事。他总是会让旧伤复发。


# 一言爬取

In [1]:
import re
import time

import requests

# Matches the whole <main ...>...</main> element; re.S lets "." span newlines.
# Compiled once, outside the loop.
main_pattern = re.compile('(<main.*>.*</main>)', re.S)

# Open the output file a single time for the whole crawl instead of
# reopening it on every iteration; "a" appends to any existing content.
with open("hitokoto.html", "a", encoding="UTF-8") as f:
    for i in range(10):
        print(f"{i+1}", end="\r")  # progress counter, overwritten in place
        r = requests.get("https://hitokoto.cn/").text
        result = main_pattern.search(r)
        # search() returns None when the page has no <main> element;
        # skip that page rather than crash on result.group().
        if result is not None:
            f.write(result.group(1) + "\n")
        time.sleep(1)  # pause between requests

9

## findall
> 返回一个列表，其中包含了所有成功的匹配

In [23]:
# Download the page source as text.
r = requests.get("https://hitokoto.cn/").text

# Collect the inner text of every <a ...>...</a> pair. Without re.S the "."
# does not cross newlines, so only anchors on a single line are matched.
anchor_text = re.compile('<a.*?>(.*?)</a>')
result = anchor_text.findall(r)
print(result)

['接口使用', '状态统计', '登录', '戳我添加⁄(⁄ ⁄•⁄ω⁄•⁄ ⁄)⁄', '友情链接', '关于一言...', '接口使用', '状态统计', '登录', '想要添加一言？戳我戳我 ⁄(⁄ ⁄•⁄ω⁄•⁄ ⁄)⁄', '友情链接', '赞助我们', '关于一言...', '激活播放器', 'DogeCloud', '沪ICP备16031287号-1', '交流讨论']


## sub
> 修改字符串中的匹配项

In [21]:
content = "54aKS4yrsoiRS4ixSL2g"
# Remove every run of digits. Raw string so \d is a regex escape rather than
# an invalid string escape sequence.
content = re.sub(r"\d+", "", content)
print(content)

aKSyrsoiRSixSLg


## compile
> 将正则字符串编译成Pattern对象

In [25]:
content1 = "2016-12-12 12:12:12"
content2 = "2018-12-12 12:12:12"
content3 = "2020-12-12 12:12:12"

# Compile the HH:MM:SS pattern once and reuse it. Raw string so \d is a
# regex escape rather than an invalid string escape sequence.
pattern = re.compile(r"\d{2}:\d{2}:\d{2}")

# Call sub() on the compiled pattern directly; each call strips the time
# part and leaves the date plus the trailing space.
result1 = pattern.sub("", content1)
result2 = pattern.sub("", content2)
result3 = pattern.sub("", content3)
print(result1, result2, result3)

2016-12-12  2018-12-12  2020-12-12 


## 抓取豆瓣电影排行

In [None]:
dou_url = "https://movie.douban.com/chart"
response = requests.get(dou_url, headers=headers)
if response.status_code == 200:
    print(response.text)

pattern = re.compile('<div class="p12">', re.S)