In [1]:
import os
import json
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

In [3]:
PROGRESS_FILE = 'progress.json'  # JSON file in which crawl progress is persisted
THREAD_COUNT = 10  # Number of worker threads that process records in parallel
# Guards writes to PROGRESS_FILE. It is re-entrant so that code which already
# holds it can call save_progress (which acquires it again) without
# deadlocking, as a plain threading.Lock would.
lock = threading.RLock()


def load_progress():
    """Return the progress state stored in PROGRESS_FILE.

    Returns:
        The decoded JSON content of PROGRESS_FILE when the file exists;
        otherwise a fresh state with an empty 'completed_files' list and an
        empty 'last_processed_record' mapping.
    """
    if not os.path.exists(PROGRESS_FILE):
        # Nothing saved yet: start from an empty state.
        return {'completed_files': [], 'last_processed_record': {}}

    with open(PROGRESS_FILE, 'r', encoding='utf-8') as progress_fh:
        saved_state = json.load(progress_fh)
    return saved_state


def save_progress(progress):
    """Persist the progress state to PROGRESS_FILE as JSON.

    The state is written to a sibling temporary file first and then moved over
    PROGRESS_FILE with os.replace. Opening PROGRESS_FILE itself in 'w' mode
    would truncate it immediately, so a concurrent reader, or a run interrupted
    mid-write, could be left with an empty or partial file that no longer
    parses as JSON.

    Args:
        progress: JSON-serializable progress state to store.
    """
    tmp_path = PROGRESS_FILE + '.tmp'
    with lock:  # one writer at a time; all writers share the same temp file
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(progress, file, ensure_ascii=False, indent=4)
        # NOTE(review): the rename is atomic on POSIX; on Windows it can fail
        # while another thread has PROGRESS_FILE open for reading.
        os.replace(tmp_path, PROGRESS_FILE)


def extract_product_name(url):
    """Download a page and return the stripped text of its first <h1>.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text of the first <h1> element with surrounding whitespace
        removed, or None when the status code is not 200, the page has no
        <h1>, or any exception is raised while fetching or parsing. Failures
        are printed, never raised.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch URL: {url}, Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        # Assumes the product name lives in the first <h1>; change the
        # selector here if the page layout differs.
        heading = soup.find('h1')
        if heading is None:
            # find() returns None when nothing matches. Report that directly
            # instead of letting `.text` raise AttributeError and be logged
            # below as a fetch error.
            print(f"No <h1> found at URL: {url}")
            return None
        return heading.text.strip()
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort the whole crawl.
        print(f"Error fetching URL: {url}, Error: {e}")
        return None


# Serializes the whole load -> modify -> save cycle in process_record. The
# module-level `lock` only covers the write inside save_progress, so on its
# own it cannot stop two workers from loading the same state and then
# overwriting each other's update.
_progress_update_lock = threading.Lock()


def process_record(record, file_name, index):
    """Add the crawled product name to one record and update the progress file.

    Args:
        record: Dict for a single record; mutated in place. When it has a
            non-empty 'current_url' and a name can be extracted from that
            page, the name is stored under 'product_name'.
        file_name: Key under which this file's progress is stored.
        index: Position of the record within its file; index + 1 is saved as
            the file's entry in 'last_processed_record'.

    Note:
        This function is submitted to a thread pool, so calls finish in
        arbitrary order. The saved value is therefore the index after the most
        recently finished record, not a contiguous "everything before this is
        done" watermark.
    """
    url = record.get('current_url', '')
    if url:
        product_name = extract_product_name(url)
        if product_name:
            record['product_name'] = product_name

    # Update the persisted progress as one atomic step across worker threads.
    with _progress_update_lock:
        progress = load_progress()
        progress.setdefault('last_processed_record', {})[file_name] = index + 1
        save_progress(progress)


def process_json_file(file_path, file_name, start_index):
    """Process the records of one JSON file in parallel and write the file back.

    Records from start_index onward are handed to process_record on a pool of
    THREAD_COUNT threads. The (possibly partially) updated data is written
    back even when processing is interrupted or a worker raises, because
    process_record has already marked the finished records as done in the
    progress file; dropping their results would make a later resume skip them.

    Args:
        file_path: Path of the JSON file to read and rewrite.
        file_name: Name passed to process_record as the progress key.
        start_index: Index of the first record to process.

    Raises:
        Whatever a worker raised, or KeyboardInterrupt, after the data has
        been written back.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    try:
        with ThreadPoolExecutor(max_workers=THREAD_COUNT) as executor:
            futures = [
                executor.submit(process_record, record, file_name, i)
                for i, record in enumerate(data[start_index:], start=start_index)
            ]
            try:
                for future in as_completed(futures):
                    future.result()  # re-raises any exception from the worker
            except BaseException:
                # Interrupted or failed: cancel tasks that have not started so
                # leaving the executor does not wait for the whole backlog.
                for pending in futures:
                    pending.cancel()
                raise
    finally:
        # Write to a temp file and swap it in, so an interruption during the
        # write cannot leave a truncated data file behind.
        tmp_path = file_path + '.tmp'
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        os.replace(tmp_path, file_path)


def process_json_files(folder_path):
    """Process every not-yet-completed JSON file in a folder.

    Files listed under 'completed_files' in the saved progress are skipped.
    Each remaining '.json' file is processed starting from the index stored
    for it under 'last_processed_record' (0 when there is none), then marked
    as completed in the progress file.

    Args:
        folder_path: Directory whose '.json' files are processed.
    """
    progress = load_progress()
    completed_files = progress.get('completed_files', [])
    last_processed_record = progress.get('last_processed_record', {})

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.json') or file_name in completed_files:
            continue

        file_path = os.path.join(folder_path, file_name)
        print(f"Processing file: {file_name}")

        start_index = last_processed_record.get(file_name, 0)
        process_json_file(file_path, file_name, start_index)

        # Mark the file as done. No lock is taken here: the worker threads
        # have already finished when process_json_file returns, and
        # save_progress acquires `lock` itself. Holding it around this call
        # would block forever if `lock` is a non-reentrant threading.Lock.
        completed_files.append(file_name)
        last_processed_record.pop(file_name, None)
        save_progress({
            'completed_files': completed_files,
            'last_processed_record': last_processed_record
        })
        print(f"Completed file: {file_name}")


# Usage: crawl product names for every JSON file in the data folder.
# Set the GLAMIRA_DATA_DIR environment variable to point at the folder that
# holds the JSON files; when it is unset, the original local path is used.
folder_path = os.environ.get(
    'GLAMIRA_DATA_DIR',
    '/Users/giakhanh/Desktop/Data Engineer/glamira_project/project/IP_Glamira_data',
)
process_json_files(folder_path)

Processing file: 79_50_119_237.json


KeyboardInterrupt: 