In [1]:
import pandas as pd
import httpx
import asyncio
import time
from pathlib import Path
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from tqdm.notebook import tqdm

import pyarrow as pa
import pyarrow.parquet as pq
import math

In [2]:
# Settings: location of the metadata CSV and of the downloaded HTML pages.
METADATA_PATH = Path('../data/metadata/arxiv_NLP_2025_metadata.csv')

# The HTML folder is created up front so later cells can glob/write into it.
DATA_PATH = Path('../data/raw/html/')
DATA_PATH.mkdir(exist_ok=True, parents=True)

In [3]:
# Load the article metadata (only the columns used by this notebook).
df = pd.read_csv(METADATA_PATH, usecols=['arxiv_id', 'title', 'html_url'])
print(f'Данные загружены. Всего {df.shape[0]} статей.')

# Unique article ids present in the metadata.
ids = set(df['arxiv_id'].to_list())

# Ids already on disk: one "<arxiv_id>.html" file per downloaded article.
downloaded_ids = {html_file.stem for html_file in DATA_PATH.glob('*.html')}

# Keep only the metadata rows whose HTML file is not on disk yet.
already_on_disk = df['arxiv_id'].isin(downloaded_ids)
ids_to_download = df[~already_on_disk]
print(f'Скачанных статей: {len(downloaded_ids)}. Осталось статей: {ids_to_download.shape[0]}')

Данные загружены. Всего 17081 статей.
Скачанных статей: 14594. Осталось статей: 2487


In [4]:
# Upper bound on the number of downloads that may be in flight at once.
SEMAPHORE = asyncio.Semaphore(10)


def _save_html(path, text):
    """Write ``text`` to ``path`` through a temporary ``.part`` sibling.

    The final ``*.html`` name only appears once the content is fully written,
    so an interrupted run cannot leave a truncated file that would later be
    mistaken for a finished download.
    """
    tmp_path = path.with_name(path.name + '.part')
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(text)
        tmp_path.replace(path)
    except BaseException:
        # Do not leave the temporary file behind when the write fails.
        if tmp_path.exists():
            tmp_path.unlink()
        raise


async def download_article(client, row, pbar, dest_dir=None):
    """Fetch one article's HTML page and store it as ``<arxiv_id>.html``.

    Args:
        client: object with an awaitable ``get(url)`` returning a response
            that exposes ``status_code`` and ``text``.
        row: record exposing ``arxiv_id`` and ``html_url`` attributes.
        pbar: progress bar; ``pbar.update(1)`` is called exactly once,
            whatever the outcome.
        dest_dir: target folder; defaults to the notebook-level ``DATA_PATH``.

    Returns:
        ``True`` when the page was saved, ``False`` otherwise (non-200
        response or any error, which is reported and swallowed so that one
        failing article does not abort the whole batch).
    """
    target_dir = DATA_PATH if dest_dir is None else Path(dest_dir)
    paper_id = None

    async with SEMAPHORE:
        try:
            paper_id = row.arxiv_id
            response = await client.get(row.html_url)

            if response.status_code != 200:
                return False

            _save_html(target_dir / f'{paper_id}.html', response.text)
            return True

        except Exception as e:
            # repr() keeps the exception type visible even when its message is empty.
            print(f'Ошибка при скачивании {paper_id}: {e!r}')
            return False
        finally:
            pbar.update(1)

In [5]:
async def main():
    """Download every article listed in ``ids_to_download`` concurrently.

    One ``download_article`` coroutine is created per row and all of them are
    awaited together; they share a single HTTP client and one progress bar.
    """
    limits = httpx.Limits(max_keepalive_connections=10, max_connections=20)

    async with httpx.AsyncClient(limits=limits, timeout=20.0, follow_redirects=True) as client:
        # Using the bar as a context manager closes it even when the gather
        # is cancelled or interrupted, instead of only on the success path.
        with tqdm(total=len(ids_to_download)) as pbar:
            tasks = [
                download_article(client, row, pbar)
                for row in ids_to_download.itertuples()
            ]
            await asyncio.gather(*tasks)

await main()

downloaded_ids = set()
for f in DATA_PATH.glob('*.html'):
    downloaded_ids.add(f.stem)

ids_to_download = df[~df['arxiv_id'].isin(downloaded_ids)]
len_all = len(ids)
len_dwd = len(downloaded_ids)
not_dwd_pct = ((len_all - len_dwd) / len_all) * 100
print(f'Скачано {len_dwd} из {len_all} статей. Процент нескачанных статей: {not_dwd_pct:.2f}%')

  0%|          | 0/2487 [00:00<?, ?it/s]

Скачано 14680 из 17081 статей. Процент нескачанных статей: 14.06%


In [6]:
# Downloaded pages smaller than this many bytes (20 KiB) are removed.
SIZE_THRESHOLD = 20 * 1024

for file in DATA_PATH.glob('*.html'):
    if not file.is_file():
        continue

    # Read the size before unlinking so it can still be reported afterwards.
    file_size = file.stat().st_size
    if file_size >= SIZE_THRESHOLD:
        continue

    try:
        file.unlink()
        print(f"Удален: {file.name} ({file_size} bytes)")
    except Exception as e:
        print(f"Ошибка при удалении {file.name}: {e}")

Удален: 2501.01069v1.html (8774 bytes)
Удален: 2501.07300v1.html (11644 bytes)
Удален: 2501.07875v1.html (14213 bytes)
Удален: 2501.08613v3.html (9565 bytes)
Удален: 2501.11478v3.html (7220 bytes)
Удален: 2501.11712v1.html (3042 bytes)
Удален: 2501.14002v3.html (11805 bytes)
Удален: 2501.14144v1.html (2808 bytes)
Удален: 2501.14844v2.html (16691 bytes)
Удален: 2501.15021v1.html (2808 bytes)
Удален: 2501.18633v1.html (3754 bytes)
Удален: 2501.18837v1.html (2388 bytes)
Удален: 2502.07677v3.html (18341 bytes)
Удален: 2502.08923v2.html (7220 bytes)
Удален: 2502.15666v2.html (7220 bytes)
Удален: 2502.17308v2.html (14990 bytes)
Удален: 2502.19735v3.html (7220 bytes)
Удален: 2503.00203v3.html (20326 bytes)
Удален: 2503.01742v2.html (3831 bytes)
Удален: 2503.03601v1.html (8627 bytes)
Удален: 2503.03705v2.html (2691 bytes)
Удален: 2503.08919v1.html (2782 bytes)
Удален: 2503.10460v4.html (7220 bytes)
Удален: 2503.18085v2.html (2691 bytes)
Удален: 2503.20062v1.html (2965 bytes)
Удален: 2504.00187

# Собираем Parquet для хранения в S3

In [7]:
# Parquet export settings: source HTML folder, destination folder, shard size.
RAW_HTML_DIR = Path('../data/raw/html').resolve()
RAW_PARQUET_DIR = Path("../data/raw/parquet").resolve()
RAW_PARQUET_DIR.mkdir(parents=True, exist_ok=True)
SHARD_SIZE = 1000  # maximum number of HTML files per shard

# glob() yields entries in arbitrary, filesystem-dependent order; sorting makes
# the file -> shard assignment reproducible between runs and machines.
html_files = sorted(RAW_HTML_DIR.glob("*.html"))
total_shards = math.ceil(len(html_files) / SHARD_SIZE)

In [8]:
# Column layout of every Parquet shard: all three columns are strings.
schema = pa.schema([
    pa.field('doc_id', pa.string()),
    pa.field('html', pa.string()),
    pa.field('source_path', pa.string()),
])

In [9]:
def process_shards(files, shard_size, dest_dir):
    """Pack HTML files into Parquet shards of at most ``shard_size`` documents.

    Args:
        files: sequence of paths to the HTML files to pack.
        shard_size: number of consecutive files that go into one shard.
        dest_dir: folder receiving the ``shard_NNNN.parquet`` files.

    A file that cannot be read is reported and left out of its shard; a shard
    in which no file could be read is not written at all. The table is built
    with the notebook-level ``schema``.
    """
    shard_starts = range(0, len(files), shard_size)

    for shard_no, start in enumerate(tqdm(shard_starts, desc="Processing Shards")):
        records = []

        for f in files[start : start + shard_size]:
            try:
                html_text = f.read_text(encoding="utf-8")
                record = {
                    "doc_id": f.stem,
                    "html": html_text,
                    "source_path": str(f)
                }
                records.append(record)
            except Exception as e:
                print(f"Ошибка в файле {f.name}: {e}")

        # Nothing readable in this slice: skip writing an empty shard.
        if not records:
            continue

        shard_name = f"shard_{shard_no:04d}.parquet"
        shard_table = pa.Table.from_pylist(records, schema=schema)
        pq.write_table(shard_table, dest_dir / shard_name)

In [10]:
# Build the Parquet shards from the collected HTML files.
process_shards(files=html_files, shard_size=SHARD_SIZE, dest_dir=RAW_PARQUET_DIR)

Processing Shards:   0%|          | 0/15 [00:00<?, ?it/s]

Ошибка в файле 2503.19134v1.html: [Errno 22] Invalid argument: 'D:\\GitHub\\ArXiv_Info_System\\data\\raw\\html\\2503.19134v1.html'
Ошибка в файле 2510.24932v1.html: [Errno 2] No such file or directory: 'D:\\GitHub\\ArXiv_Info_System\\data\\raw\\html\\2510.24932v1.html'
