In [1]:
import json
import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from tqdm import tqdm  # progress bar

In [2]:
# HTTP request headers sent with every page fetch: a desktop Edge/Chrome
# User-Agent string and a Chinese-first Accept-Language preference list.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
}

In [3]:

# Load the product catalogue produced by an earlier crawl step.
# The file is opened explicitly as UTF-8: JSON text is UTF-8 by specification,
# and relying on the platform's default encoding can fail or mis-decode
# non-ASCII product names on machines whose locale is not UTF-8.
# NOTE(review): downstream cells treat `data` as a mapping of
# product_id -> {'url', 'name', 'price'}; confirm against the file's schema.
jsonDir = "/home/ubuntu/crawler/products_info.json"
with open(jsonDir, 'r', encoding='utf-8') as file:
    data = json.load(file)  # parsed JSON document


In [4]:
# Shared state for the worker threads: `count` tallies how many products have
# been written, and `lock` serialises the increments so they are not lost.
lock = threading.Lock()
count = 0

In [5]:
def _extract_second_div_text(html):
    """Return the tag-stripped text of the second ``<div class="F_yfF">`` in *html*, or None if there are fewer than two."""
    divs = re.findall(r'<div class="F_yfF">(.*?)</div>', html, re.DOTALL)
    if len(divs) < 2:
        return None
    # Markup is removed outright (replaced by the empty string, not a space).
    return re.sub(r'<.*?>', '', divs[1]).strip()


def fetch_and_process_url(url, timeout=10):
    """Download a product page and extract the text of its second ``F_yfF`` div.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block its worker thread indefinitely.

    Returns:
        The stripped, tag-free text of the second ``<div class="F_yfF">``,
        or ``None`` when the request fails (network error, timeout, or a
        non-200 status) or the page has fewer than two such divs.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Treat transport failures like HTTP failures below: no result,
        # rather than an exception that aborts the whole batch.
        return None

    if response.status_code != 200:
        return None

    return _extract_second_div_text(response.text)

In [6]:
def write_info(product_id, product_info, output_root='/home/ubuntu/crawler/info'):
    """Scrape one product's brand text and write its details to ``info.txt``.

    Creates ``<output_root>/<product_id>/info.txt`` containing three lines
    (``Name``, ``Price``, ``Brand``) and then increments the module-level
    ``count`` while holding ``lock``.

    Args:
        product_id: Identifier used as the name of the per-product directory.
        product_info: Mapping with the keys ``'url'``, ``'name'`` and
            ``'price'``.
        output_root: Directory under which the per-product directories are
            created. Defaults to the crawler's original output location.

    Raises:
        KeyError: If ``product_info`` lacks one of the required keys.
    """
    global count
    page_url = product_info['url']
    name = product_info['name']
    price = product_info['price']

    # The scraped value is the brand text, or None when the fetch/parse
    # yields nothing, in which case the literal "None" is written below.
    brand = fetch_and_process_url(page_url)

    output_dir = os.path.join(output_root, str(product_id))
    os.makedirs(output_dir, exist_ok=True)
    # Explicit UTF-8 so non-ASCII names/brands are written correctly
    # regardless of the machine's locale.
    with open(os.path.join(output_dir, 'info.txt'), 'w', encoding='utf-8') as f:
        f.write(f'Name: {name}\n')
        f.write(f'Price: {price}\n')
        f.write(f'Brand: {brand}\n')

    # Called from several worker threads; guard the shared counter.
    with lock:
        count += 1

In [7]:
# Fan the products out over a thread pool; each task scrapes one page and
# writes that product's info.txt.
with ThreadPoolExecutor() as executor:
    with tqdm(total=len(data), desc="Processing URLs") as pbar:
        # Map each future back to its product id.
        futures = {
            executor.submit(write_info, product_id, product_info): product_id
            for product_id, product_info in data.items()
        }

        # Consume futures in completion order so the progress bar advances as
        # work actually finishes instead of stalling behind a slow early task.
        for future in as_completed(futures):
            future.result()  # re-raises any exception from the worker
            pbar.update(1)

Processing URLs: 100%|████████████████████████| 998/998 [01:44<00:00,  9.58it/s]
