In [2]:
# import packages
import requests
import random
import json
import time
import re
import pandas as pd
import os
from datetime import datetime
from seleniumwire import webdriver
from seleniumwire.utils import decode
from selenium.webdriver.edge.options import Options

In [3]:
# Build the Edge (not Chrome) options object shared by every WebDriver below.
edge_options = Options()

# Run the browser headless and enable verbose browser logging.
for browser_flag in ("--headless", "--verbose"):
    edge_options.add_argument(browser_flag)

In [4]:
# Scrape the list of all fund companies.
# Each results page is loaded in a fresh headless Edge session. A selenium-wire
# request interceptor rewrites the `page` query parameter of the page's own
# AJAX call (recognised by 'rand=' in its URL) so the server returns the page
# we ask for.
org_pages = []
for num in range(8):  # the organisation list has 8 pages (indices 0-7)
    # Reset per page: otherwise a page that produced no matching response would
    # silently reuse the payload captured for the previous page.
    content = None
    driver = webdriver.Edge(options=edge_options)
    try:
        def request_interceptor(request):
            """Overwrite the `page` query parameter of the list AJAX request."""
            if 'rand=' in request.url:
                params = request.params
                params['page'] = str(num)
                request.params = params

        # Register the interceptor before navigating so the first call is caught.
        driver.request_interceptor = request_interceptor

        driver.get('https://gs.amac.org.cn/amac-infodisc/res/pof/person/personOrgList.html')
        time.sleep(5)  # let the AJAX call finish and keep short-term server load low

        # Keep the body of the last captured list response.
        for request in driver.requests:
            if request.response and 'rand=' in request.url:
                # The proxy hands back the raw body; undo any gzip/deflate/br
                # compression before decoding it as text.
                raw_body = decode(
                    request.response.body,
                    request.response.headers.get('Content-Encoding', 'identity'),
                )
                content = raw_body.decode('utf-8')
    finally:
        # Always close the browser, even if navigation or parsing failed.
        driver.quit()

    if content is None:
        raise RuntimeError(f'No organisation-list response captured for page {num}')

    org_pages.append(pd.DataFrame(json.loads(content)['content']))

# Concatenate once at the end instead of growing a frame inside the loop.
personOrgList = pd.concat(org_pages, axis=0)

In [12]:

# The concatenated frame carries repeated per-page row labels; replace them
# with a clean 0..n-1 index.
personOrgList = personOrgList.reset_index(drop=True)

In [None]:
personOrgList.shape  # sanity check: is the number of fund companies correct?

In [None]:
# Scrape the fund-manager list of every fund company.
# For each company, every results page is loaded in a fresh headless Edge
# session; a selenium-wire interceptor rewrites the `page` query parameter of
# the page's AJAX call (recognised by 'rand=' in its URL).
url_head = 'https://gs.amac.org.cn/amac-infodisc/res/pof/person/fundManagerList.html?userId='
PAGE_SIZE = 20          # rows per page, as assumed by the page-count arithmetic
RESPONSE_TIMEOUT = 15   # seconds to wait for the list response after navigation

manager_pages = []
for i in range(personOrgList.shape[0]):
    company = personOrgList.loc[i, 'userId']
    name = personOrgList.loc[i, 'orgName']
    manager_num = int(personOrgList.loc[i, 'fundManagerNum'])
    # Ceiling division: `n // 20 + 1` requested one redundant (empty) page
    # whenever the manager count was an exact multiple of the page size.
    page_num = -(-manager_num // PAGE_SIZE)
    url = url_head + str(company)
    for num in range(page_num):
        # Reset per page so a missing response can never be replaced by the
        # payload of a previous page (or of a previous cell).
        content = None
        driver = webdriver.Edge(options=edge_options)
        try:
            def request_interceptor(request):
                """Overwrite the `page` query parameter of the list AJAX request."""
                if 'rand=' in request.url:
                    params = request.params
                    params['page'] = str(num)
                    request.params = params

            # Register the interceptor before navigating.
            driver.request_interceptor = request_interceptor
            driver.get(url)

            # driver.get() can return before the AJAX response has arrived, so
            # poll the captured requests until one carries a response.
            deadline = time.monotonic() + RESPONSE_TIMEOUT
            while content is None and time.monotonic() < deadline:
                for request in driver.requests:
                    if request.response and 'rand=' in request.url:
                        # Undo any transport compression before text decoding.
                        raw_body = decode(
                            request.response.body,
                            request.response.headers.get('Content-Encoding', 'identity'),
                        )
                        content = raw_body.decode('utf-8')
                if content is None:
                    time.sleep(0.5)
        finally:
            # Always close the browser, even on failure.
            driver.quit()

        if content is None:
            # Report and skip rather than abort a multi-hour crawl.
            print(f'第{i+1}家基金公司{name}第{num}页未获取到响应，已跳过')
        else:
            manager_pages.append(pd.DataFrame(json.loads(content)['content']))

        # Short random pause between pages.
        time.sleep(random.uniform(0.5, 1.5))

    print(f'第{i+1}家基金公司{name}爬取完毕，开始数据拼接')

    # Random pause between companies, 10 s longer after every 10th company.
    # The original test `i + 1 % 10 == 0` parsed as `i + (1 % 10) == 0` and
    # was therefore never true.
    pause = random.uniform(3, 5)
    if (i + 1) % 10 == 0:
        pause += 10
    time.sleep(pause)

# Concatenate once at the end instead of growing frames inside the loops.
ManagerList = pd.concat(manager_pages, axis=0) if manager_pages else pd.DataFrame()

In [18]:
# Wrap each row's raw certification history in its own DataFrame, then replace
# the row labels left over from concatenation with a clean 0..n-1 index.
history_frames = []
for raw_history in ManagerList['personCertHistoryList']:
    history_frames.append(pd.DataFrame(raw_history))
ManagerList['personCertHistoryList'] = history_frames
ManagerList = ManagerList.reset_index(drop=True)

In [46]:
# Normalise the identifier columns to strings.
for id_column in ('accountId', 'userId', 'bizId'):
    ManagerList[id_column] = ManagerList[id_column].astype(str)

In [47]:
# Persist the manager list; the row index is written as the first column.
ManagerList.to_excel(excel_writer='./Manager_list.xlsx', index=True)

In [9]:
# Reload the saved manager list.
# read_excel re-infers column types, so the identifier columns would come back
# as numbers (losing any leading zeros); pin them to str to match what was
# written. The unnamed first column is the saved row index and is dropped.
ManagerList = pd.read_excel(
    './Manager_list.xlsx',
    dtype={'accountId': str, 'userId': str, 'bizId': str},
).drop(columns='Unnamed: 0')

In [None]:
# Fetch the detail record of every fund manager from the AMAC JSON API.
headers = {
    "Accept-Language": "zh-CN,zh;q=0.9",
    'Content-Type': 'application/json',
    'Origin': 'http://gs.amac.org.cn',
    'Referer': 'https://gs.amac.org.cn/amac-infodisc/res/pof/fund/index.html',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Mobile Safari/537.36'
}
host = 'https://gs.amac.org.cn'
api = '/amac-infodisc/api/pof/person/'
REQUEST_TIMEOUT = 10  # seconds; without a timeout one stalled request hangs the crawl

total_managers = ManagerList.shape[0]
info_frames = []
for i in range(total_managers):
    accountId = ManagerList.loc[i, 'accountId']
    # A random `rand` query value is appended to every request URL.
    rand = random.random()
    url = host + api + str(accountId) + '?rand=' + str(rand)

    # Take a longer break every 100 managers. The message reports the real
    # total instead of a hard-coded count.
    if (i + 1) % 100 == 0:
        print(f"已爬取{total_managers}个基金经理中{i+1}个基金经理的相关信息，不如休息10s吧！")
        time.sleep(random.uniform(8, 10))

    # A network failure or an unparsable body is treated like a non-200
    # response: report the manager and carry on.
    record = None
    try:
        response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            record = json.loads(response.text)
    except (requests.RequestException, ValueError):
        record = None

    if record is not None:
        info_frames.append(pd.DataFrame(record))
    else:
        name = ManagerList.loc[i, 'userName']
        print(f"{name}信息爬取出错")

# Concatenate once at the end instead of growing a frame inside the loop.
Manager_info = pd.concat(info_frames, axis=0) if info_frames else pd.DataFrame()

In [16]:
# Replace the row labels left over from concatenation with a clean 0..n-1 index.
Manager_info = Manager_info.reset_index(drop=True)

In [48]:
# Flatten each row's certification-history record into plain columns, then
# drop the raw record column.
# The original called drop() without keeping its result, so the column was
# never removed. The guard makes the cell safe to re-run after the drop.
if 'personCertHistoryList' in Manager_info.columns:
    history_records = list(Manager_info['personCertHistoryList'])
    Manager_info['History_Company'] = [record['orgName'] for record in history_records]
    # creationDate is divided by 1000 before conversion (treated as epoch
    # milliseconds). NOTE(review): fromtimestamp() converts to the machine's
    # local time zone, so the date can differ between hosts — confirm intent.
    Manager_info['History_Date'] = [
        datetime.fromtimestamp(record['creationDate'] / 1000).strftime("%Y-%m-%d")
        for record in history_records
    ]
    Manager_info['History_Status'] = [record['statusName'] for record in history_records]
    Manager_info = Manager_info.drop(columns='personCertHistoryList')

Manager_info.head()

In [51]:
# Persist the per-manager information; the row index is written as well.
Manager_info.to_excel(excel_writer='./Manager_infos.xlsx', index=True)

In [54]:
# Extract the photo-related columns, one row per distinct combination.
# drop_duplicates() keeps the surviving rows' original labels, which leaves
# gaps in the index; reset it so label lookups over range(len(frame)) hit
# every row exactly once.
Manager_photo = (
    Manager_info[['accountId', 'userName', 'orgName', 'personPhotoBase64']]
    .drop_duplicates()
    .reset_index(drop=True)
)
Manager_photo.to_excel('./Manager_photo.xlsx')

In [68]:
# Browser-like request headers that name human.amac.org.cn as the Host.
# This rebinds the notebook-level name `headers` used earlier for the JSON API.
# NOTE(review): the photo download loop calls requests.get(url) without
# passing headers=..., so this dict appears to be unused — confirm.
headers = {
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    'Content-Type': 'image/png',
    'Host': 'human.amac.org.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
}


In [None]:
# Download every manager's ID photo into `folder` as '<orgName>_<userName>.jpg'.
# NOTE(review): `folder` is an absolute, machine-specific path; consider making
# it configurable. Two managers with the same org and name overwrite each other.
folder = r'C:\Users\lzc19\Desktop\VSCode\Python\Jupyter notebook\爬虫\AMAC\AMAC证件照'
PHOTO_TIMEOUT = 30  # seconds; without a timeout one stalled download hangs the loop
os.makedirs(folder, exist_ok=True)

# Iterate positionally: Manager_photo comes from drop_duplicates(), so its row
# labels may have gaps and `.loc[i]` over range(n) would miss or mis-hit rows.
photo_rows = zip(
    Manager_photo['userName'],
    Manager_photo['orgName'],
    Manager_photo['personPhotoBase64'],
)
for userName, orgName, url in photo_rows:
    pic_name = f'{orgName}_{userName}.jpg'

    # A missing value (NaN/None/empty) cannot be requested; report it as a
    # missing photo instead of letting requests raise.
    if not isinstance(url, str) or not url:
        print(f'{orgName}的基金经理{userName}照片缺失')
        continue

    print(url)
    try:
        response = requests.get(url, timeout=PHOTO_TIMEOUT)
    except requests.RequestException as error:
        print(f'请求错误：{error}')
        continue

    if response.status_code == 200:
        # `response.content` is bytes, so the original `!= 0` comparison was
        # always true; test for an empty body instead.
        if response.content:
            with open(os.path.join(folder, pic_name), 'wb') as f:
                f.write(response.content)
        else:
            print(f'{orgName}的基金经理{userName}照片缺失')
    else:
        print(f'请求错误，状态码为{response.status_code}')

    # The original computed this delay but never slept on it.
    time.sleep(random.uniform(0.075, 0.125))