In [None]:
import concurrent
import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests


def save_data_to_json(data, filename):
    """Write ``data`` to ``filename`` as indented, UTF-8 encoded JSON.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to create or overwrite.
    """
    # ensure_ascii=False keeps non-ASCII characters readable instead of
    # emitting \uXXXX escapes.
    with open(filename, mode='w', encoding='utf-8') as json_file:
        json.dump(data, fp=json_file, indent=4, ensure_ascii=False)


# Serializes updates to the shared progress table and its CSV file, because
# download_and_save_page is submitted to a thread pool and several workers
# would otherwise rewrite 'progress.csv' at the same time.
_PROGRESS_LOCK = threading.Lock()


def download_and_save_page(page_number, progress_df, timeout=30):
    """Download one search-result page, store it as JSON and record the outcome.

    The page is written to ``data/data_page_<page_number>.json``. Whether the
    download succeeded is stored in the ``Completed`` column of ``progress_df``
    (for the row whose ``Page`` equals ``page_number``) and the whole table is
    then persisted to ``progress.csv``.

    Args:
        page_number: 1-based number of the page to request.
        progress_df: DataFrame with ``Page`` and ``Completed`` columns; it is
            modified in place.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block a worker indefinitely.

    Returns:
        True if the page was downloaded and saved, False otherwise.
    """
    url = f'https://www.fontshop.com/search_data.json?page={page_number}&size=200&fields=typeface_data,opentype_features'
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        filename = os.path.join('data', f'data_page_{page_number}.json')
        save_data_to_json(data, filename)
        print(f"Page {page_number} saved.")
        completed = True
    except (requests.exceptions.RequestException, ValueError) as err:
        # RequestException covers HTTP errors as well as connection failures
        # and timeouts; ValueError covers a response body that is not valid
        # JSON. Either way the page is recorded as not completed so a later
        # run retries it.
        print(f"Error downloading page {page_number}: {err}")
        completed = False

    with _PROGRESS_LOCK:
        # Match on the Page column rather than assuming the row index equals
        # page_number - 1.
        progress_df.loc[progress_df['Page'] == page_number, 'Completed'] = completed
        progress_df.to_csv('progress.csv', index=False)
    return completed


def main():
    """Download every search-result page that is not yet marked as completed.

    Progress is tracked in ``progress.csv`` (columns ``Page`` and
    ``Completed``) so an interrupted run can be resumed. Pages are fetched
    concurrently by ``download_and_save_page`` and stored under ``data/``.
    """
    total_hit = 57438
    page_size = 200  # must match the `size=200` parameter of the request URL
    # Ceiling division: the final, partially filled page has to be fetched too.
    # round() would drop it here (57438 / 200 = 287.19 -> 287 pages).
    total_page = -(-total_hit // page_size)

    os.makedirs('data', exist_ok=True)

    all_pages = range(1, total_page + 1)
    if os.path.exists('progress.csv'):
        progress_df = pd.read_csv('progress.csv')
        # A progress file from an earlier run may not list every page yet;
        # append the missing ones as not completed.
        known_pages = set(progress_df['Page'])
        new_pages = [page for page in all_pages if page not in known_pages]
        if new_pages:
            progress_df = pd.concat(
                [progress_df, pd.DataFrame({'Page': new_pages, 'Completed': False})],
                ignore_index=True,
            )
    else:
        progress_df = pd.DataFrame({'Page': list(all_pages), 'Completed': False})

    pages_to_download = progress_df.loc[~progress_df['Completed'].astype(bool), 'Page'].tolist()

    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_page = {
            executor.submit(download_and_save_page, page, progress_df): page
            for page in pages_to_download
        }

        for future in concurrent.futures.as_completed(future_to_page):
            page = future_to_page[future]
            try:
                # result() re-raises anything the worker did not handle itself;
                # without this call such errors would vanish silently.
                future.result()
            except Exception as err:
                print(f"Page {page} failed unexpectedly: {err}")

    # The workers update progress_df in place, so it reflects the final state.
    remaining = int((~progress_df['Completed'].astype(bool)).sum())
    if remaining:
        print(f"{remaining} page(s) still incomplete; run again to retry them.")
    else:
        print("All pages saved.")


# Start the download only when this code is executed directly (not imported).
if __name__ == "__main__":
    main()


In [7]:
import os
import json

# Directory that holds the downloaded JSON page files.
folder_path = 'data'
json_data = []

# Names of all entries in that directory that end in '.json'.
list_json_files = [entry for entry in os.listdir(folder_path) if entry.endswith('.json')]


In [12]:
import os
import json
import concurrent.futures

def process_json_file(file_path):
    """Extract selected fields from one downloaded page and save them as JSON.

    Reads ``data["families"]["hits"]["hits"]`` from ``file_path`` and, for each
    hit, keeps the ``typeface_data``, ``name``, ``images`` and ``designers``
    entries of its ``_source``. The resulting list is written to
    ``result/<basename of file_path>.result.json``.

    Args:
        file_path: Path to a JSON file with the structure described above.

    Returns:
        Path of the result file that was written.

    Raises:
        KeyError: If the file, or any hit in it, lacks one of the expected keys.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # The input is fully parsed at this point, so it does not need to stay
    # open while the output is produced.

    kept_fields = ("typeface_data", "name", "images", "designers")
    results = [
        {field: hit["_source"][field] for field in kept_fields}
        for hit in data["families"]["hits"]["hits"]
    ]

    # Create the output directory here as well, so the function also works
    # when the caller has not prepared it.
    os.makedirs('result', exist_ok=True)

    # Save the extracted data to the result file.
    result_file_path = os.path.join('result', f'{os.path.basename(file_path)}.result.json')
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        json.dump(results, result_file, ensure_ascii=False, indent=4)
    return result_file_path

def main():
    """Run ``process_json_file`` on every ``.json`` file in the ``data`` folder.

    Files are processed concurrently in a thread pool. A file that fails is
    reported on stdout and does not stop the remaining files.

    Note: when run in the same kernel as the download cell, this definition
    replaces the earlier function that is also called ``main``.
    """
    folder_path = 'data'  # Directory containing the JSON files
    # Sorted so the submission order does not depend on the file system.
    json_paths = [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.endswith('.json')
    ]

    os.makedirs('result', exist_ok=True)  # Create the 'result' directory if it does not exist

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_path = {executor.submit(process_json_file, path): path for path in json_paths}
        for future in concurrent.futures.as_completed(future_to_path):
            try:
                # result() re-raises worker exceptions; an unconsumed
                # executor.map() would have dropped them silently.
                future.result()
            except Exception as err:
                print(f"Failed to process {future_to_path[future]}: {err}")

# Start the processing only when this code is executed directly (not imported).
if __name__ == "__main__":
    main()


57400
