In [45]:
# setting.py is the project's configuration file.

# API service configuration

HOST = "0.0.0.0"               # IP address for the API service
PORT = 5000                    # listening port
# NOTE(review): get_proxy()/delete_proxy() in this notebook call port 5010,
# not 5000 — confirm which port the service actually listens on.


# Database configuration

# Redis connection URL; "pwd" looks like a placeholder password — TODO confirm
# and avoid committing a real credential here.
DB_CONN = 'redis://:pwd@127.0.0.1:8888/0'


# ProxyFetcher configuration

PROXY_FETCHER = [
    "freeProxy01",      # names of the enabled proxy fetch methods; all fetch methods live in fetcher/proxyFetcher.py
    "freeProxy02",
    "freeProxy03",
    "freeProxy04",
    "freeProxy05",
    "freeProxy06",
    "freeProxy07",
    "freeProxy08",
    "freeProxy09",
    "freeProxy10",
    "freeProxy11"
    # ...
]

In [40]:
# -*- coding:utf-8 -*-
import hashlib
import requests
import json
import pandas as pd
from fake_useragent import UserAgent
import time
import random
import csv



# Load the "ip" column of Free_Proxy_List.csv into a list of strings.
# newline="" is what the csv module asks for when it is handed a file object,
# so that newlines embedded in quoted fields are interpreted correctly.
with open("Free_Proxy_List.csv", "r", newline="") as file:
    csv_reader = csv.reader(file)
    headers = next(csv_reader)          # first row holds the column names
    ip_index = headers.index("ip")      # raises ValueError if there is no "ip" column
    # Blank lines come back from csv.reader as empty lists; skip any row too
    # short to contain the "ip" column instead of failing with IndexError.
    ip_addresses = [row[ip_index] for row in csv_reader if len(row) > ip_index]

In [69]:

# 获得ip
def get_proxy():
    """Ask the local proxy-pool service for a proxy and return its decoded JSON reply."""
    pool_reply = requests.get("http://127.0.0.1:5010/get/")
    return pool_reply.json()


def delete_proxy(proxy):
    """Tell the local proxy-pool service to drop ``proxy``; the reply is ignored."""
    delete_url = "http://127.0.0.1:5010/delete/?proxy={}".format(proxy)
    requests.get(delete_url)


def myhash(data, flag="md5"):
    """
    Return the hexadecimal digest of ``data`` under the named hash algorithm.

    This is a one-way hash, not encryption.

    :param data: text to hash; it is encoded as UTF-8 before hashing
    :param flag: algorithm name accepted by ``hashlib.new`` (md5, sha1, sha256, ...)
    :return: the hex digest string, or None when ``flag`` does not name a
        usable fixed-length hash algorithm
    """
    try:
        # hashlib.new() only accepts real algorithm names. A plain
        # hasattr(hashlib, flag) check is also true for helpers such as
        # "new" or "pbkdf2_hmac", which then blow up when called with no args.
        hasher = hashlib.new(flag)
    except ValueError:
        return None
    hasher.update(data.encode("utf8"))
    try:
        return hasher.hexdigest()
    except TypeError:
        # Variable-length digests (shake_*) need an explicit length argument,
        # which this helper has no way to supply; treat them as unsupported.
        return None


# 提取参数
def extract_batch_data(school_id):
    """
    Download a school's ``specialplan.json`` dictionary and flatten its batch table.

    The payload's ``data.newsdata.batch`` mapping has keys of the form
    ``"<province>_<year>_<data_type>"``, each mapped to a list of batch values.
    Every (key, batch value) pair becomes one output row.

    :param school_id: school identifier interpolated into the request URL
    :return: DataFrame with columns ``province``, ``year``, ``data_type``,
        ``batch`` and a fresh 0..n-1 index, or None when the server does not
        answer with HTTP 200
    :raises KeyError: if the payload lacks ``data``/``newsdata``/``batch``
    :raises ValueError: if a batch key does not split into exactly three parts
    """
    url = f'https://static-data.gaokao.cn/www/2.0/school/{school_id}/dic/specialplan.json'

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=10)

    if response.status_code != 200:
        print(f"Failed to retrieve data for school {school_id}")
        return None

    batch_data = response.json()['data']['newsdata']['batch']

    # Collect plain tuples and build the frame once; concatenating onto an
    # empty DataFrame inside the loop is quadratic and warns on recent pandas.
    rows = []
    for key, batches in batch_data.items():
        province, year, data_type = key.split('_')
        rows.extend((province, year, data_type, batch) for batch in batches)

    return pd.DataFrame(rows, columns=['province', 'year', 'data_type', 'batch'])


def _proxy_address(proxy_record):
    """Return the "host:port" string of a proxy-pool record, or None if it has none."""
    if isinstance(proxy_record, dict):
        return proxy_record.get("proxy") or None
    return proxy_record or None


def _plan_record(item, local_id):
    """Map one API item to a flat output row and stamp it with a content hash."""
    info = {
        '省份': local_id,
        '学校': item['name'],
        '学校代码': item['school_id'],
        '年份': item['year'],
        '科目': item['local_type_name'],
        '类型': item['local_batch_name'],
        '专业名称': item['spname'],
        '计划招生': item['num'],
        '学制': item['length'],
        '学费': item['tuition'],
    }
    # The hash covers every field above, so identical rows get identical ids.
    info["_id"] = myhash(json.dumps(info))
    return info


def get_data(school_id, time_str, local_id, batch_id, type_id, page):
    """
    Fetch one page of enrollment-plan rows for a school from api.zjzw.cn.

    Makes at most three attempts. When the API answers with a ``code`` other
    than ``'0000'``, the current proxy is removed from the pool, a new one is
    fetched and the user agent is rotated. Any exception during an attempt is
    reported and the attempt is retried. If the attempts run out while the
    current proxy is failing, that proxy is removed from the pool.

    :param school_id: school identifier
    :param time_str: plan year; must be convertible with ``int()``
    :param local_id: province id; also copied into each row as '省份'
    :param batch_id: admission batch id
    :param type_id: subject-type id
    :param page: 1-based page number (the page size is fixed at 10)
    :return: list of row dicts (possibly empty) on success, None when every
        attempt failed
    """
    url = "https://api.zjzw.cn/web/api/"
    # Signature copied from a logged-in browser session; replace with your own.
    signsafe = "e0f93d59bf0b7ce73880e88fdb1fe67c"
    max_attempts = 3
    request_timeout = 10  # seconds; keeps a dead connection from hanging forever

    headers = {
        "authority": "api.zjzw.cn",
        "accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
        "cache-control": "no-cache",
        "content-type": "application/json",
        "origin": "https://www.gaokao.cn",
        "pragma": "no-cache",
        "referer": "https://www.gaokao.cn/",
        "sec-ch-ua": "\"Google Chrome\";v=\"119\", \"Chromium\";v=\"119\", \"Not?A_Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "cross-site",
        "user-agent": UserAgent().random,
    }
    params = {
        "local_batch_id": f"{batch_id}",
        "local_province_id": f"{local_id}",
        "local_type_id": f"{type_id}",
        "page": f"{page}",
        "school_id": f"{school_id}",
        "size": "10",
        "special_group": "",
        "uri": "apidata/api/gkv3/plan/school",
        "year": f"{time_str}",
        "signsafe": signsafe,
    }
    body = json.dumps({
        "local_batch_id": batch_id,
        "local_province_id": local_id,
        "local_type_id": type_id,
        "page": page,
        "school_id": school_id,
        "signsafe": signsafe,
        "size": 10,
        "special_group": "",
        "uri": "apidata/api/gkv3/plan/school",
        "year": int(time_str)
    }, separators=(',', ':'))

    proxy = get_proxy()
    current_proxy_failed = False
    for _ in range(max_attempts):
        # get_proxy() returns the pool's whole record; only its "proxy" field
        # is the host:port that belongs in a proxy URL or a delete request.
        proxy_addr = _proxy_address(proxy)
        # NOTE(review): only the "http" scheme is mapped while `url` is https,
        # so requests will not route this call through the proxy — confirm
        # whether an "https" entry is intended.
        proxies = {"http": "http://{}".format(proxy_addr)} if proxy_addr else None
        try:
            payload = requests.post(url, headers=headers, params=params, data=body,
                                    proxies=proxies, timeout=request_timeout).json()

            # Any non-'0000' code (e.g. "too many requests") triggers a switch
            # to a fresh proxy and user agent. This still consumes an attempt,
            # so a persistently failing API cannot loop forever.
            if not isinstance(payload, dict) or payload.get('code') != '0000':
                print(payload)
                print("deleted", proxy_addr)
                if proxy_addr:
                    delete_proxy(proxy_addr)
                proxy = get_proxy()
                current_proxy_failed = False
                print("new", proxy)
                print("change new id")
                headers["user-agent"] = UserAgent().random
                headers["Proxy-Tunnel"] = str(random.randint(1, 10000))
                continue

            print(proxy)
            data_section = payload.get('data')
            items = data_section.get('item', []) if isinstance(data_section, dict) else []
            return [_plan_record(item, local_id) for item in items or []]
        except Exception as exc:
            # Best-effort retry, but report the failure instead of hiding it.
            current_proxy_failed = True
            print(f"request attempt failed: {exc!r}")

    # Out of attempts: drop the proxy that kept failing from the pool.
    if current_proxy_failed:
        failed_addr = _proxy_address(proxy)
        if failed_addr:
            delete_proxy(failed_addr)
    return None



In [66]:

# Fetch the (province, year, data_type, batch) parameter table for one school
# and save it to disk.
start_time = time.time()  # wall-clock start; elapsed time is printed after the crawl
school_id = 2691
df = extract_batch_data(school_id)
if df is None:
    # extract_batch_data returns None on a non-200 response; stop with a clear
    # message instead of an AttributeError on None further down.
    raise RuntimeError(f"No batch data retrieved for school {school_id}")
print(df)
df.to_csv('参数.csv', index=False)

   province  year data_type batch
0        46  2023         3    10
1        46  2022         3    10
2        46  2021         3    10
3        46  2020         3    10
4        46  2019         1    10
..      ...   ...       ...   ...
72       62  2020         2    10
73       62  2019         2    10
74       62  2018         2    10
75       13  2018         1    10
76       13  2018         2    10

[77 rows x 4 columns]


In [73]:
# Crawl the enrollment plan for every (province, year, data_type, batch)
# combination in df and write all rows to one CSV file.
RESULT_COLUMNS = ['省份', '学校', '学校代码', '年份', '科目', '类型', '专业名称', '计划招生', '学制', '学费', '_id']
MAX_PAGES = 5  # pages requested per combination (10 rows each); anything beyond is not fetched
n = 1  # not used in this cell; kept in case a later cell reads it

collected_rows = []
for index, row in df.iterrows():
    province = row['province']
    year = row['year']
    data_type = row['data_type']
    batch = row['batch']
    print(f"Province: {province}, Year: {year}, Type: {data_type}, Batch: {batch}")
    for page in range(1, MAX_PAGES + 1):
        result = get_data(school_id, year, province, batch, data_type, page)
        print(result)
        # An empty list or None means there are no more pages (or the request
        # failed); move on to the next combination.
        if not result:
            break
        collected_rows.extend(result)
        # Random pause between pages to avoid hammering the API.
        time.sleep(random.uniform(1,2))

# Build the frame once from the collected rows rather than concatenating onto
# an empty DataFrame on every page, which is quadratic and warns on recent pandas.
result_df = pd.DataFrame(collected_rows, columns=RESULT_COLUMNS)
result_df.to_csv('school_id.csv', index=False)

end_time = time.time()
minutes, seconds = divmod(end_time - start_time, 60)
print(f"运行时间：{round(minutes)} 分钟 {round(seconds)} 秒")


Province: 46, Year: 2023, Type: 3, Batch: 10
{'anonymous': '', 'check_count': 22, 'fail_count': 0, 'https': False, 'last_status': True, 'last_time': '2023-12-14 19:31:29', 'proxy': '111.40.62.199:9091', 'region': '中国 黑龙江 大庆 移动', 'source': 'freeProxy03'}
[{'省份': '46', '学校': '重庆交通职业学院', '学校代码': '2691', '年份': '2023', '科目': '综合', '类型': '专科批', '专业名称': '道路与桥梁工程技术', '计划招生': '20', '学制': '3', '学费': '11800', '_id': '6dcc057b52a89fece8eca118e0e7d189'}]
{'anonymous': '', 'check_count': 10, 'fail_count': 0, 'https': False, 'last_status': True, 'last_time': '2023-12-14 19:31:47', 'proxy': '47.96.143.117:80', 'region': '中国 浙江 杭州 阿里云/电信/联通/移动/铁通/教育网', 'source': 'freeProxy10'}
[]
Province: 46, Year: 2022, Type: 3, Batch: 10
{'anonymous': '', 'check_count': 22, 'fail_count': 0, 'https': False, 'last_status': True, 'last_time': '2023-12-14 19:31:29', 'proxy': '111.40.62.199:9091', 'region': '中国 黑龙江 大庆 移动', 'source': 'freeProxy03'}
[{'省份': '46', '学校': '重庆交通职业学院', '学校代码': '2691', '年份': '2022', '科目': '综合', 