# <center> **Milestone 1 — Module 1: The Downloader (arXiv TeX)** </center>
**Student (Representative):** Vu Tuan Hung 

**Student ID:** 22127137  

**Range:** 2022-11-13747 → 2022-12-11475


**Optimizations:**
- Parallel download with ThreadPoolExecutor (tuned for MacBook Pro M4 Pro, 24GB RAM).
- Resume-safe: skip parts already complete (size matches S3); re-download only mismatched/zero-byte files.
- Extract-once with `.done` marker.
- Skip processing for papers already produced (`metadata.json` & `references.json`).
- Live counters: parts left per month and in total.

## **Download and Import libraries**

In [532]:
# 1. Install dependencies
!pip -q install boto3 arxiv requests tqdm python-dateutil sickle filetype


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [533]:
# 2. Imports & constants
from __future__ import annotations
import os, re, tarfile, shutil, json, time, subprocess, threading
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple, Iterable

from tqdm import tqdm
from dateutil import parser as dtparser
from concurrent.futures import ThreadPoolExecutor, as_completed

try:
    import boto3  # kh√¥ng b·∫Øt bu·ªôc
    from botocore.config import Config as BotoConfig
except Exception:
    boto3 = None

import requests, arxiv, gzip, filetype, io, struct, hashlib, unicodedata
import xml.etree.ElementTree as ET
from sickle import Sickle

## **Config and constants**

In [534]:
# Location of the arXiv bulk source archives on S3.
ARXIV_S3_BUCKET = "arxiv"
ARXIV_S3_PREFIX = "src"   # s3://arxiv/src/arXiv_src_YYMM_XXX.tar
# Shared lock object created for serialising console output across threads.
PRINT_LOCK = threading.Lock()

# 3. Config
# Crawl range: (month as "YYYY-MM", numeric paper id within that month).
START_MONTH, START_ID = "2022-11", 13747
# Alternative end of range, currently disabled in favour of the short range below.
#END_MONTH,   END_ID   = "2022-12", 11475
END_MONTH,   END_ID   = "2022-11", 13755

# Recommended values for your network (downlink ~350 Mb/s):
MAX_WORKERS = 4       # try 12; if unstable / intermittent errors appear, lower to 10
CHUNK_MBPS_EST = 11.0  # MB/s used only to estimate ETA (does not affect the logic)

# Output / scratch directories (relative to the current working directory).
OUT_DIR  = Path("output_data")
WORK_DIR = Path("workdir")
MAX_REFS = 100

## **Crawl metadata**

### **Utility Functions**

In [535]:
def ensure_dir(p: Path):
    """Create directory *p* (and any missing parents); do nothing if it exists."""
    p.mkdir(exist_ok=True, parents=True)

In [536]:
def month_to_yymm(month_str: str):
    """Return the two-digit-year/two-digit-month string for a ``"YYYY-MM"`` month.

    The month is parsed as its first day with ``dateutil`` and re-formatted
    with ``%y%m`` (e.g. ``"2022-11"`` -> ``"2211"``).
    """
    first_day = dtparser.parse(f"{month_str}-01")
    return first_day.strftime("%y%m")

In [537]:
def base_id_to_tuple(base_id: str):
    """Split a version-less arXiv id ``"YYMM.NNNNN"`` into ``(yymm, number)`` ints.

    Raises:
        ValueError: if *base_id* is not four digits, a dot, then 4-5 digits.
    """
    parsed = re.fullmatch(r"(\d{4})\.(\d{4,5})", base_id)
    if parsed is None:
        raise ValueError(f"Bad arXiv base id format: {base_id}")
    yymm_text, number_text = parsed.groups()
    return int(yymm_text), int(number_text)

In [538]:
def within_range(yyyymm: int, iid: int, start_yyyymm: int, start_id: int, end_yyyymm: int, end_id: int):
    """Tell whether paper ``(yyyymm, iid)`` lies in the inclusive crawl range.

    The range runs from ``(start_yyyymm, start_id)`` to ``(end_yyyymm, end_id)``
    and is ordered by month first, then by id within the month.
    """
    lower_bound = (start_yyyymm, start_id)
    upper_bound = (end_yyyymm, end_id)
    # Lexicographic tuple comparison: month decides, id breaks ties.
    return lower_bound <= (yyyymm, iid) <= upper_bound

In [539]:
def _atomic_write_json(path: Path, data: Any, indent: int = 2):
    """Ghi JSON an to√†n: ghi v√†o file t·∫°m r·ªìi replace -> tr√°nh file b·ªã h·ªèng n·∫øu ƒëang ghi m√† crash."""
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    with tmp.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=indent)
    tmp.replace(path)

In [540]:
def save_state(path: Path, state: Dict[str, Any]):
    """Persist the *state* mapping as JSON at *path* using the atomic writer."""
    _atomic_write_json(path, data=state)

In [541]:
# Working sub-directories under WORK_DIR: per-paper metadata, downloaded
# sources, and extracted archives.
metadatas_dir, papers_dir, extracted = (
    WORK_DIR / child for child in ("metadata", "paper", "extracted")
)

# Make sure all three exist before any cell writes into them.
ensure_dir(extracted)
ensure_dir(papers_dir)
ensure_dir(metadatas_dir)

## **Crawling metadata**

In [542]:
def fetch_metadata_worker(result: arxiv.Result,
                          client: arxiv.Client,
                          checkpoint_folder: Path,
                          query_delay: float = 3.0):
    """Fetch and store the metadata of a single paper across all its versions.

    ``metadata.json`` is written directly inside *checkpoint_folder* (no nested
    per-id directory is created here). If the file already exists the paper is
    skipped.

    Args:
        result: search result for the paper; its short id must end in ``vN``,
            where N is used as the number of versions to query.
        client: arXiv client used to query every version ``v1..vN``.
        checkpoint_folder: directory that receives ``metadata.json``.
        query_delay: seconds to sleep before issuing the per-version query.

    Returns:
        1 if the metadata exists or was written, 0 if anything failed
        (the error is printed, not raised).
    """

    try:
        # Split the short id into the base id and the latest version number.
        arxiv_id = result.get_short_id()
        v_pos = arxiv_id.rfind('v')
        version_counts = int(arxiv_id[v_pos + 1:])
        base_id = arxiv_id[:v_pos]  # base id, without the version suffix

        # metadata.json lives directly in the given folder.
        meta_path = checkpoint_folder / "metadata.json"

        # Already produced -> nothing to do.
        if meta_path.exists():
            print(f"Metadata already exists for {base_id}")
            return 1

        print(f"Creating metadata for {base_id}...")

        # Base record, filled from the result we already have.
        metadata = {
            "id": base_id,
            "title": result.title,
            "authors": [author.name for author in result.authors],
            "abstract": result.summary,
            "publication_venue": [],
            "categories": result.categories,
            "submission_date": result.published.strftime("%Y-%m-%d"),
            "revised_dates": [],
            "dois": [],
            "comments": [],
            "pdf_urls": [],
        }

        # Query every version of the paper in one request.
        sub_search = arxiv.Search(
            id_list=[f"{base_id}v{ver}" for ver in range(1, version_counts + 1)]
        )
        time.sleep(query_delay)

        for sub_result in client.results(sub_search):
            # Merge per-version information into the record.
            metadata["authors"].extend(a.name for a in sub_result.authors)
            if sub_result.journal_ref:
                metadata["publication_venue"].append(sub_result.journal_ref)
            metadata["revised_dates"].append(sub_result.updated.strftime("%Y-%m-%d"))
            if sub_result.doi:
                metadata["dois"].append(sub_result.doi)
            if sub_result.comment:
                metadata["comments"].append(sub_result.comment)
            metadata["pdf_urls"].append(sub_result.pdf_url)

        # De-duplicate while keeping first-seen order: a set() would shuffle
        # the author list (whose order is meaningful) differently on each run.
        for key in ("authors", "publication_venue", "dois"):
            metadata[key] = list(dict.fromkeys(metadata[key]))

        # Write straight into checkpoint_folder; no extra id directory.
        save_state(meta_path, metadata)
        print(f"Fetched metadata for arXiv ID {base_id}")
        return 1

    except Exception as e:
        # Best-effort worker: report and let the caller count the failure.
        print(f"[WARN] Failed to fetch metadata for {result.get_short_id()}: {e}")
        return 0


In [543]:
def load_checkpoint_state(checkpoint_folder: Optional[Path], end_yymm: int, end_id: int):
    """Locate the newest crawl checkpoint and work out where to resume.

    Returns a 5-tuple ``(start_yymm, id_num, checkpoint_id, total_time, last_yymm)``:

    * no folder / no ``metadata_checkpoint_*.json`` file: ``(None, 1, 1, 0, None)``;
    * the newest checkpoint already reached ``(end_yymm, end_id)``: five ``None``s;
    * otherwise: resume at the id after the checkpoint's ``last_id``, with the
      next checkpoint number and the accumulated ``total_time``.
    """
    fresh_start = (None, 1, 1, 0, None)

    if not checkpoint_folder or not checkpoint_folder.exists():
        return fresh_start

    found = list(checkpoint_folder.glob("metadata_checkpoint_*.json"))
    if not found:
        return fresh_start

    def _sequence_number(candidate: Path) -> int:
        # The checkpoint number is the last "_"-separated piece of the stem.
        return int(candidate.stem.split('_')[-1].split('.')[0])

    newest = max(found, key=_sequence_number)
    print(f"Resuming from checkpoint: {newest}")
    next_checkpoint_id = int(newest.stem.split('_')[-1]) + 1

    with newest.open("r") as handle:
        saved = json.load(handle)
    last_crawled_id = saved['last_id']
    elapsed = saved['total_time']
    last_yymm, last_id_num = base_id_to_tuple(last_crawled_id)

    if last_yymm == end_yymm and last_id_num >= end_id:
        print("Crawling already completed up to the end ID.")
        return None, None, None, None, None

    # Continue with the id right after the last one that was checkpointed.
    return last_yymm, last_id_num + 1, next_checkpoint_id, elapsed, last_yymm


In [544]:
def build_id_list(start_yymm: int, start_id: int, end_yymm: int, end_id: int,
                  max_id_per_month: int = 30000):
    """Build the list of arXiv base ids to crawl between two endpoints (inclusive).

    Args:
        start_yymm: first month as an integer ``YYMM`` (e.g. ``2211``).
        start_id: first numeric id within the first month.
        end_yymm: last month as an integer ``YYMM``.
        end_id: last numeric id within the last month.
        max_id_per_month: highest id generated for every month except the last
            one; months are padded up to this value because the real number of
            submissions is not known in advance.

    Returns:
        Ids formatted as ``"YYMM.NNNNN"`` in ascending order; empty when the
        start lies after the end.
    """
    id_list = []
    yymm = start_yymm

    while yymm <= end_yymm:
        first = start_id if yymm == start_yymm else 1
        last = end_id if yymm == end_yymm else max_id_per_month
        id_list.extend(f"{yymm}.{i:05d}" for i in range(first, last + 1))

        # Advance one month; December rolls over to January of the next year
        # (e.g. 2212 -> 2301).
        yymm += 89 if yymm % 100 == 12 else 1

    return id_list


In [545]:
def save_checkpoint(checkpoint_folder: Path,
                    checkpoint_id: int,
                    last_id: str,
                    section_time: float,
                    total_time: float):
    """Atomically write the checkpoint for one completed batch.

    The file is named ``metadata_checkpoint_<checkpoint_id>.json``. The
    ``.json`` suffix is required: ``load_checkpoint_state`` only looks for
    ``metadata_checkpoint_*.json``, so a suffix-less file would never be found
    on resume.

    Args:
        checkpoint_folder: directory that receives the checkpoint file.
        checkpoint_id: sequence number used in the file name.
        last_id: last arXiv base id covered by the batch.
        section_time: seconds spent on this batch.
        total_time: accumulated seconds over all batches so far.
    """
    new_state = {
        'last_id': last_id,
        'section_time': section_time,
        'total_time': total_time
    }
    save_state(checkpoint_folder / f'metadata_checkpoint_{checkpoint_id}.json', new_state)
    print(f"Saved checkpoint {checkpoint_id}.")


In [546]:
def crawl_arXiv_metadata(start_month: str, start_id: int,
                         end_month: str, end_id: int,
                         batch_size: int = 100,
                         delay: float = 0.125,
                         query_delay: float = 3,
                         retries: int = 3,
                         folder: Optional[Path] = None):
    """Crawl arXiv metadata by id for all versions (one ``metadata.json`` per paper).

    Ids are queried in batches of *batch_size*. Every paper gets its own
    directory ``folder/<YYMM>-<NNNNN>/``. After a batch in which every id is
    accounted for (freshly fetched or already on disk) a checkpoint is written
    into *folder*, which is also where checkpoints are looked up on start. A
    batch that yields nothing at all is taken as the end of the month and the
    crawl jumps to the first id of the next month.

    Args:
        start_month, end_month: months as ``"YYYY-MM"``.
        start_id, end_id: numeric ids bounding the range (inclusive).
        batch_size: ids per API query (also the client page size).
        delay: client delay between requests, in seconds.
        query_delay: sleep before each per-paper version query, in seconds.
        retries: client retry count.
        folder: output directory for metadata and checkpoints (required).

    Raises:
        ValueError: if *folder* is not given.
    """
    if folder is None:
        # Fail before any network traffic instead of with a TypeError mid-batch.
        raise ValueError("folder is required: metadata and checkpoints are written under it")

    start_yymm = int(month_to_yymm(start_month))
    end_yymm = int(month_to_yymm(end_month))

    # Load checkpoint
    resume = load_checkpoint_state(folder, end_yymm, end_id)
    if resume == (None, None, None, None, None):
        return
    if resume[0] is not None:
        start_yymm, id_num, checkpoint_id, total_time, _ = resume
    else:
        id_num, checkpoint_id, total_time = start_id, 1, 0

    # Build the list of ids to crawl
    id_list = build_id_list(start_yymm, id_num, end_yymm, end_id)
    client = arxiv.Client(page_size=batch_size, delay_seconds=delay, num_retries=retries)

    curr_batch_start = 0
    yymm = start_yymm

    while curr_batch_start < len(id_list):
        batch_ids = id_list[curr_batch_start: curr_batch_start + batch_size]
        curr_batch_start += batch_size

        search = arxiv.Search(id_list=batch_ids)
        successful_crawls = 0
        time_start = time.time()

        # Fetch the metadata of each paper in parallel
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            futures = []
            for result in client.results(search):
                # One directory per base id, not per version
                base_id = result.get_short_id().split('v')[0]
                meta_folder = folder / base_id.replace(".", "-")
                meta_folder.mkdir(parents=True, exist_ok=True)

                # Exactly one metadata.json per paper
                meta_path = meta_folder / "metadata.json"

                # Already on disk: skip the fetch (never overwrite), but count
                # it as done so a re-run can still checkpoint this batch and
                # does not mistake it for an empty month.
                if meta_path.exists():
                    print(f"Metadata exists for {base_id}, skipping.")
                    successful_crawls += 1
                    continue

                # Hand the paper to a worker
                futures.append(
                    ex.submit(fetch_metadata_worker, result, client, meta_folder, query_delay)
                )

            for fut in futures:
                try:
                    successful_crawls += fut.result()
                except Exception as e:
                    print(f"Metadata fetch failed: {e}")

        time_spent = time.time() - time_start

        # Checkpoint only when the whole batch is accounted for
        if successful_crawls == len(batch_ids):
            total_time += time_spent
            # Same directory load_checkpoint_state() reads from.
            save_checkpoint(folder, checkpoint_id, batch_ids[-1], time_spent, total_time)
            checkpoint_id += 1

        elif successful_crawls == 0 and yymm < end_yymm:
            # Nothing exists in this id window: the month is exhausted.
            print("Metadata list is empty. Moving to next month...")
            curr_batch_start = 0
            if yymm % 100 == 12:
                yymm += 88  # roll over to the next year
            yymm += 1
            id_list = id_list[id_list.index(f"{yymm}.00001"):]


In [547]:
def get_version_list_from_metadata(metadata_dir: Path):
    """Collect every arXiv version id described by the stored metadata files.

    Each immediate sub-directory of *metadata_dir* that contains a
    ``metadata.json`` with a non-empty ``"id"`` contributes ``<id>v1 .. <id>vN``,
    where N is the number of entries in ``"revised_dates"``. When that list is
    missing or empty the paper still contributes ``<id>v1`` so it is not
    silently dropped. Sub-directories are visited in sorted order so the result
    is deterministic.

    Returns:
        The list of version ids (also prints how many there are).
    """
    version_list = []

    for subdir in sorted(metadata_dir.iterdir()):
        if not subdir.is_dir():
            continue
        meta_file = subdir / "metadata.json"
        if not meta_file.exists():
            continue

        with open(meta_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        arxiv_id = data.get("id", "")
        if not arxiv_id:
            continue

        # One entry per recorded version; at least v1 always exists.
        n_ver = max(len(data.get("revised_dates") or []), 1)
        version_list.extend(f"{arxiv_id}v{v}" for v in range(1, n_ver + 1))

    print(f"Total versions to download: {len(version_list)}")
    return version_list


## **Crawl paper**

In [548]:
def is_gzip_valid(path: Path):
    """Check whether *path* is a complete, readable gzip file.

    The whole stream is decompressed (and discarded) so truncation and
    corruption anywhere in the file are detected.

    Returns:
        ``True`` if the file exists, is non-empty and decompresses cleanly;
        ``False`` otherwise.
    """
    # Local import: zlib is not among this file's top-level imports.
    import zlib

    if not path.exists() or path.stat().st_size == 0:
        return False

    try:
        # Read to the end in 1 MiB chunks; any damage raises along the way.
        with gzip.open(str(path), "rb") as gz:
            while gz.read(1024 * 1024):
                pass
    except (OSError, EOFError, zlib.error):
        # OSError covers gzip.BadGzipFile (bad header / CRC), EOFError a
        # truncated stream, zlib.error a corrupted deflate payload.
        return False
    return True

In [549]:
def is_tar_gz_valid(path: Path):
    """Check whether *path* is a valid ``.tar.gz``.

    The file must exist, be non-empty, be recognised by ``tarfile`` and pass
    the full gzip integrity check.
    """
    if not path.exists():
        return False
    if path.stat().st_size == 0:
        return False
    # Cheap tar probe first; the full gzip read only runs when it succeeds.
    return tarfile.is_tarfile(str(path)) and is_gzip_valid(path)

In [550]:
def detect_with_filetype(path: Path):
    """Guess the type of *path* with the ``filetype`` package.

    Returns:
        ``(extension, "<mime> (by filetype)")`` when a type is recognised,
        otherwise ``(None, "unknown")``.
    """
    guess = filetype.guess(path)
    if not guess:
        return None, "unknown"
    return guess.extension, f"{guess.mime} (by filetype)"

In [551]:
def fetch_paper_source_worker(result: arxiv.Result,
                              client: arxiv.Client,
                              output_folder: Path,
                              papers_dir: Path,
                              query_delay: float,
                              failed_list: List[str]):
    """Download the source of one paper version into *output_folder*.

    The download is skipped when a valid ``<id>.tar.gz`` or any other
    ``<id>.*`` file is already present; a corrupted ``<id>.tar.gz`` is deleted
    and fetched again. After downloading, the file is renamed to match what it
    really is: the extension detected by ``filetype`` when that is not gzip, or
    ``.gz`` when it does not validate as a tar.gz.

    Args:
        result: search result for the version to download.
        client: unused here; kept for the caller's signature.
        output_folder: directory the source is downloaded into and renamed in.
        papers_dir: kept for backward compatibility. The downloaded file is
            always handled where it was written (*output_folder*); previously
            it was looked up under *papers_dir*, which failed whenever the two
            directories differed.
        query_delay: seconds to sleep after a successful download.
        failed_list: shared list that receives the id when the download fails.
    """
    curr_id = result.get_short_id()
    tar_path = output_folder / f"{curr_id}.tar.gz"

    if tar_path.exists():
        # Already downloaded and intact -> done.
        if is_tar_gz_valid(tar_path):
            print(f"{curr_id}.tar.gz is already downloaded.")
            return
        # Present but broken -> drop it and download again.
        os.remove(tar_path)
    else:
        # A previous run may have renamed the download to another extension.
        existing = next(iter(output_folder.glob(f"{curr_id}.*")), None)
        if existing is not None:
            print(f"{existing.name} is already downloaded.")
            return

    try:
        print(f"Start downloading source for arXiv ID {curr_id}.")
        result.download_source(str(output_folder), filename=f"{curr_id}.tar.gz")

        # Determine what was really downloaded and fix the extension.
        suitable_ext, _ = detect_with_filetype(tar_path)

        if suitable_ext and suitable_ext != "gz":
            os.rename(tar_path, output_folder / f"{curr_id}.{suitable_ext}")

        elif not is_tar_gz_valid(tar_path):
            os.rename(tar_path, output_folder / f"{curr_id}.gz")

        print(f"Finished downloading source for arXiv ID {curr_id}.")
        time.sleep(query_delay)

    except Exception as e:
        # Best-effort: record the failure so the caller can retry later.
        print(f"Failed to download {curr_id} (.tar.gz). Error: {e}")
        if curr_id not in failed_list:
            failed_list.append(curr_id)

In [552]:
def crawl_paper_source(version_list: List[str],
                       output_folder: Path,
                       batch_size: int = 100,
                       delay: float = 0.125,
                       query_delay: float = 3,
                       retries: int = 3):
    """Download the sources of all *version_list* ids into *output_folder*.

    Ids are queried in batches of *batch_size*; within a batch every result is
    downloaded by a worker thread (up to ``MAX_WORKERS`` at once).

    Returns:
        The list of ids whose download failed.
    """
    print(f"Total versions to download after filtering existing valid files: {len(version_list)}")

    failed_list = []
    client = arxiv.Client(page_size=batch_size, delay_seconds=delay, num_retries=retries)
    papers_dir = output_folder  # or Path("../bulk") to keep the downloads elsewhere

    cursor = 0
    total = len(version_list)
    while cursor < total:
        batch_ids = version_list[cursor: cursor + batch_size]
        cursor += batch_size
        print(f"Downloading batch starting at {batch_ids[0]}...")

        search = arxiv.Search(id_list=batch_ids)

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            pending = []
            for result in client.results(search):
                pending.append(
                    ex.submit(
                        fetch_paper_source_worker,
                        result,
                        client,
                        output_folder,
                        papers_dir,
                        query_delay,
                        failed_list,
                    )
                )

            # Wait for the whole batch before moving on.
            for job in pending:
                job.result()

    return failed_list

In [553]:
#crawl_paper_source(version_list, papers_dir)

In [554]:

# Classify every expected version as successfully downloaded or failed by
# inspecting the files present in papers_dir.
successful_downloads = []
failed_downloads = []

for ver in version_list:
    try:
        # All files on disk that belong to this version id
        candidates = list(papers_dir.glob(f"{ver}.*"))
        if len(candidates) == 0:
            print(f"No file found for {ver}")
            failed_downloads.append(ver)
            continue

        found_success = False
        for f in candidates:
            name = f.name.lower()

            if name.endswith(".tar.gz"):
                # Case 1: tar.gz must pass the tar + gzip checks
                if not is_tar_gz_valid(f):
                    print(f"Corrupted tar.gz: {f.name}")
                    continue
                print(f"Valid tar.gz: {f.name}")
            elif name.endswith(".gz"):
                # Case 2: plain gzip must decompress cleanly
                if not is_gzip_valid(f):
                    print(f"Corrupted gzip: {f.name}")
                    continue
                print(f"Valid gzip: {f.name}")
            elif name.endswith(".pdf"):
                # Case 3: a PDF is accepted as-is
                print(f"Found PDF: {f.name}")
            else:
                # Any other extension is ignored
                continue

            # The first acceptable file settles this version.
            successful_downloads.append(ver)
            found_success = True
            break

        if found_success is False:
            print(f"‚ùå All files for {ver} invalid or unreadable.")
            failed_downloads.append(ver)

    except Exception as e:
        print(f"‚ö†Ô∏è Error while checking {ver}: {e}")
        failed_downloads.append(ver)

print("\nSummary:")
print(f"{len(successful_downloads)} successful")
print(f"{len(failed_downloads)} failed")



No file found for 2211.13747v1
No file found for 2211.13748v1
No file found for 2211.13749v1
No file found for 2211.13750v1
No file found for 2211.13750v2
No file found for 2211.13751v1
No file found for 2211.13752v1
No file found for 2211.13753v1
No file found for 2211.13754v1
No file found for 2211.13755v1
No file found for 2211.13755v2

Summary:
0 successful
11 failed


In [555]:
# Persist the failed version ids, one per line, so they can be retried later.
# (No loop variable named `id`: that would shadow the builtin for every later cell.)
with open('failed_downloaded_ids.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{failed_id}\n" for failed_id in failed_downloads)

In [556]:
len(successful_downloads)

0

In [557]:
successful_downloads[:5]

[]

In [558]:
#crawl_paper_source(failed_downloads, ".." / papers_dir)

In [559]:
# Base ids of all papers, taken from their first version. Matching on the
# *suffix* "v1" avoids also picking up v10..v19, which merely contain "v1".
original_ids = [ver[:-len('v1')] for ver in version_list if ver.endswith('v1')]

len(original_ids)

9

In [560]:
# Paths (as strings) of every file in papers_dir whose name contains a dot.
actual_download_list = list(map(str, papers_dir.glob("*.*")))

len(actual_download_list)

0

In [561]:
# Group the downloaded files by kind.
gz_list = [str(file) for file in papers_dir.glob("*.gz")]
pdf_list = [str(file) for file in papers_dir.glob("*.pdf")]

# Decide tar.gz vs. plain gz from the file-name suffix only; searching the
# whole path string for ".tar" would also match directory names.
tar_gz_list = [file for file in gz_list if file.endswith(".tar.gz")]
non_tar_gz_list = [file for file in gz_list if not file.endswith(".tar.gz")]

In [562]:
# Counts of PDFs, tar.gz archives and plain gzip files, one per line.
print(len(pdf_list), len(tar_gz_list), len(non_tar_gz_list), sep="\n")

0
0
0


In [563]:
# Version id of each downloaded file = first two dot-separated pieces of the
# file *name* ("2211.13747v1.tar.gz" -> "2211.13747v1"). Path(...).name works
# with both "/" and "\" separators, unlike splitting the string on "/".
actual_download_ids = ['.'.join(Path(file).name.split('.')[:2]) for file in actual_download_list]

# Set for O(1) membership tests instead of scanning the list per id.
_actual_download_id_set = set(actual_download_ids)
failed_download_ids = [ver_id for ver_id in version_list if ver_id not in _actual_download_id_set]

len(failed_download_ids)

11

In [564]:
#crawl_paper_source(failed_download_ids, papers_dir)

In [565]:
def generate_metadata_status(
    metadata_dir: Path,
    version_list: list,
    actual_download_list: list,
    papers_dir: Path,
    output_dir: Path):
    """Write ``metadata_status.json``: every metadata record plus a download flag.

    Each ``metadata.json`` found anywhere below *metadata_dir* is loaded;
    records without an ``"id"`` are skipped and unreadable files are reported
    and skipped. A record gets ``"paper_status": "available"`` when its id
    occurs inside any entry of *actual_download_list*, otherwise ``"missing"``.
    The combined list is written to ``output_dir/metadata_status.json``.

    *version_list* and *papers_dir* are accepted but not used.
    """
    records = []

    for meta_file in metadata_dir.rglob("metadata.json"):
        try:
            record = json.loads(meta_file.read_text(encoding="utf-8"))

            paper_id = record.get("id", None)
            if paper_id:
                # Is there any downloaded file whose path mentions this id?
                downloaded = any(paper_id in entry for entry in actual_download_list)
                record["paper_status"] = "available" if downloaded else "missing"
                records.append(record)

        except Exception as e:
            print(f"‚ö†Ô∏è L·ªói khi ƒë·ªçc {meta_file}: {e}")

    # Make sure the destination directory exists
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "metadata_status.json"

    # Write the result file
    with output_path.open("w", encoding="utf-8") as sink:
        json.dump(records, sink, indent=2, ensure_ascii=False)

    print(f"File metadata_status.json ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: {output_path}")



#generate_metadata_status(metadatas_dir, version_list, actual_download_list, papers_dir, WORK_DIR)

## **Extraction**

In [566]:
# NOTE(review): these values match the FTEXT / FHCRC / FCOMMENT bits of the
# gzip header FLG byte (RFC 1952); they are not referenced by the code visible
# in this section -- confirm usage before relying on them.
F_TEXT   = 0x01
F_HCRC   = 0x02
F_COMMENT= 0x10
# Characters replaced inside a single path component during sanitisation.
_FORBIDDEN = set('/\0')  # we treat tar paths as POSIX; '/' splits components
# Inclusive code-point ranges that _is_control() treats as control characters.
_CTRL_RANGES = [(0x00, 0x1F), (0x7F, 0x9F)]
# Inclusive code-point range of surrogates, which are replaced when sanitising.
_SURR_MIN, _SURR_MAX = 0xD800, 0xDFFF
# Maximum length (in characters) of one sanitised path component.
_MAX_COMP_LEN = 200  # keep some headroom under 255

In [567]:
def _is_control(ch: str) -> bool:
    """Return True when the code point of *ch* falls in one of ``_CTRL_RANGES``."""
    code_point = ord(ch)
    for low, high in _CTRL_RANGES:
        if low <= code_point <= high:
            return True
    return False

In [568]:
def _sanitize_component(comp: str) -> str:
    """Turn one path component into a name that is safe to create on disk.

    Steps, in order: NFC-normalise; replace ``/`` and NUL, control characters
    and surrogates with ``_``; strip surrounding whitespace; map empty / ``.``
    / ``..`` to ``_``; replace a leading ``-`` with ``_``; cap the length at
    ``_MAX_COMP_LEN`` characters.

    Returns:
        The sanitised component; never empty and never longer than
        ``_MAX_COMP_LEN``.
    """
    # Normalize to NFC
    comp = unicodedata.normalize('NFC', comp)

    # Remove path separators and NULs early
    comp = ''.join('_' if c in _FORBIDDEN else c for c in comp)

    # Map control characters and unpaired surrogates to a safe underscore
    comp = ''.join(
        '_' if _is_control(c) or _SURR_MIN <= ord(c) <= _SURR_MAX else c
        for c in comp
    ).strip()

    # Collapse empty/dots
    if comp in ('', '.', '..'):
        comp = '_'

    # Avoid leading '-' (we might pass names to tools later)
    if comp.startswith('-'):
        comp = '_' + comp[1:]

    # Trim very long components while trying to keep the extension
    if len(comp) > _MAX_COMP_LEN:
        stem, dot, ext = comp.partition('.')
        room_for_stem = _MAX_COMP_LEN - 1 - len(ext)
        if dot and stem and room_for_stem >= 1:
            comp = f"{stem[:room_for_stem]}.{ext}"
        else:
            # No extension, or the part after the first dot is itself too long
            # to keep: a negative slice would leave the name over the limit,
            # so fall back to a hard cut.
            comp = comp[:_MAX_COMP_LEN]

    return comp

In [569]:
def _sanitize_relative_path(posix_path: str) -> str:
    # Split on POSIX '/', sanitize each piece, then rejoin with os.sep-neutral join below
    parts = [p for p in posix_path.split('/') if p not in ('',)]
    san = [_sanitize_component(p) for p in parts]
    # Prevent accidental absolute/parent after sanitization
    san = ['_' if p in ('', '.', '..') else p for p in san]
    clean_rel = '/'.join(san)
    if clean_rel != posix_path:
        print(f"[sanitize] {posix_path!r} -> {clean_rel!r}")
    return clean_rel

In [570]:
def _dedupe_path(target: Path, original_hint: str) -> Path:
    """If 'target' exists, append a short hash derived from the original name."""
    if not target.exists():
        return target
    stem = target.stem
    suffix = target.suffix
    h = hashlib.sha1(original_hint.encode('utf-8', 'surrogatepass')).hexdigest()[:8]
    cand = target.with_name(f"{stem}__{h}{suffix}")
    i = 1
    while cand.exists():
        cand = target.with_name(f"{stem}__{h}_{i}{suffix}")
        i += 1
    return cand

In [571]:
def _read_gzip_original_name(path: Path) -> Optional[str]:
    """
    Parse the gzip header to recover the original filename (FNAME) if present.
    Returns a sanitized base name (no directories). If absent, returns None.
    """
    with path.open("rb") as f:
        data = f.read(10)  # ID1 ID2 CM FLG MTIME(4) XFL OS
        if len(data) < 10:
            return None
        # GZIP_MAGIC: b"\x1f\x8b"
        if data[0:2] != b"\x1f\x8b":
            return None

        cm = data[2]
        flg = data[3]
        # F_RESERVED: 0xE0
        if cm != 8 or (flg & 0xE0):
            return None

        # After the fixed 10-byte header:
        # Optional sections in order: EXTRA, NAME, COMMENT, HCRC
        def _skip(n: int):
            f.seek(n, io.SEEK_CUR)

        # EXTRA
        # F_EXTRA: 0x04
        if flg & 0x04:
            xtra_len_bytes = f.read(2)
            if len(xtra_len_bytes) != 2:
                return None
            xlen = struct.unpack("<H", xtra_len_bytes)[0]
            _skip(xlen)

        # NAME (zero-terminated)
        original_name = None
        # F_NAME: 0x08
        if flg & 0x08:
            name_bytes = bytearray()
            while True:
                b = f.read(1)
                if not b:
                    # Unexpected EOF
                    break
                if b == b"\x00":
                    break
                name_bytes.extend(b)
            try:
                raw = name_bytes.decode("latin-1", errors="replace")
                # keep only basename and sanitize for the filesystem
                base = os.path.basename(raw)
                original_name = _sanitize_component(base)
            except Exception:
                original_name = None

        # We don't need COMMENT or HCRC for name extraction

        if original_name:
            # Return only the final component to avoid path traversal
            return os.path.basename(original_name)

        return None

In [572]:
def _safe_join(base: Path, *paths: str) -> Path:
    """
    Join paths and ensure the result stays within base.
    Prevents path traversal attacks.
    """
    base = base.resolve()
    final = base
    for p in paths:
        final = final / str(p)
    final = final.resolve()
    if not (str(final) == str(base) or str(final).startswith(str(base) + os.sep)):
        raise ValueError(f"Blocked path traversal attempt: {final}")
    return final

In [573]:
def _safe_extract_tar(tar: tarfile.TarFile, dest: Path, members: Optional[Iterable[tarfile.TarInfo]] = None):
    """Extract *members* of *tar* into *dest* without trusting the archive.

    * Hard links, symlinks, character/block devices and FIFOs are skipped.
    * Member names are sanitised and must resolve inside *dest*.
    * A regular file never overwrites an existing path: on collision it gets a
      de-duplicated name.
    * A directory member re-uses an existing directory; only when the name is
      taken by a non-directory is it de-duplicated.
    * Only the permission bits of the archived mode are applied (no
      setuid/setgid/sticky), and the owner always keeps access so later members
      can still be written. Permission changes are best-effort.

    Args:
        tar: open archive to read from.
        dest: extraction root.
        members: members to extract; all members of *tar* when empty or None.
    """
    for m in (members or tar):
        # Skip unsafe types
        if m.islnk() or m.issym() or m.ischr() or m.isblk() or m.isfifo():
            continue

        # Sanitize relative path (tar uses POSIX '/')
        clean_rel = _sanitize_relative_path(m.name)
        if clean_rel == '':
            continue

        # Join & ensure containment
        target_path = _safe_join(dest, clean_rel)

        # Never honour special bits coming from an untrusted archive.
        permission_bits = m.mode & 0o777

        if m.isdir():
            # An existing directory (e.g. created earlier as a parent of one of
            # its files) is simply re-used; de-duplicating it would leave a
            # stray, empty hashed directory behind.
            if target_path.exists() and not target_path.is_dir():
                target_path = _dedupe_path(target_path, m.name)
            target_path.mkdir(parents=True, exist_ok=True)
            try:
                # Owner rwx stays on so files can still be extracted into it.
                os.chmod(target_path, permission_bits | 0o700)
            except OSError:
                pass
            continue

        # De-duplicate on collision
        target_path = _dedupe_path(target_path, m.name)

        # Ensure parent exists
        target_path.parent.mkdir(parents=True, exist_ok=True)

        # Extract regular file
        src = tar.extractfile(m)
        if src is None:
            continue  # skip specials with no extractable content
        with src, open(target_path, "wb") as out:
            # Stream copy in chunks
            while True:
                chunk = src.read(1024 * 1024)
                if not chunk:
                    break
                out.write(chunk)

        # Preserve permissions (best-effort); owner keeps read/write access.
        try:
            os.chmod(target_path, permission_bits | 0o600)
        except OSError:
            pass

In [574]:
def extract_tar_archive(path: Path, out_dir: Path) -> list[Path]:
    """Extract any tar archive (tar, tar.gz, tar.xz, ...) into *out_dir*.

    Extraction goes through ``_safe_extract_tar`` (sanitised names, no path
    traversal).

    Returns:
        For every archive member whose sanitised path exists below *out_dir*
        afterwards, that resolved path, in archive order.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    with tarfile.open(str(path), mode="r:*") as tar:
        members = tar.getmembers()
        _safe_extract_tar(tar, out_dir, members)

        # Lazily map member name -> sanitised relative path -> resolved target,
        # dropping members whose name sanitises to nothing.
        sanitised = map(_sanitize_relative_path, (member.name for member in members))
        targets = ((out_dir / rel).resolve() for rel in sanitised if rel)
        extracted = [target for target in targets if target.exists()]

    return extracted

In [575]:
def extract_plain_gzip(path: Path, out_dir: Path) -> Path:
    """Decompress a single gzip file into *out_dir* and return the new path.

    The output name is the original name stored in the gzip header when there
    is one; otherwise the input name without its ``.gz`` suffix (so a
    ``.tar.gz`` yields a ``.tar`` for the next round), or the input name plus
    ``.out`` when there is no such suffix to strip. The name is sanitised,
    confined to *out_dir* and de-duplicated against existing files.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    out_name = _read_gzip_original_name(path)
    if not out_name:
        file_name = path.name
        has_gz_suffix = file_name.lower().endswith(".gz") and len(file_name) > 3
        out_name = file_name[:-3] if has_gz_suffix else file_name + ".out"

    out_name = _sanitize_component(os.path.basename(out_name)) or "_"
    out_path = _dedupe_path(_safe_join(out_dir, out_name), out_name)

    # Stream the decompressed data in 1 MiB blocks.
    with gzip.open(path, "rb") as gz, open(out_path, "wb") as out:
        while True:
            block = gz.read(1024 * 1024)
            if block == b"":
                break
            out.write(block)

    return out_path

In [576]:
def extract_until_done(path: Path, out_dir: Path, max_depth: int = 5):
    """
    Unpack a possibly multi-layer archive (tar, tar.gz, tgz, .gz, ...) into
    ``out_dir``, repeating on the freshly produced files until no archive is
    left or ``max_depth`` passes have been made.

    Everything is always written into ``out_dir`` (never next to the source).

    Args:
        path: the archive to start from.
        out_dir: directory receiving every extracted file.
        max_depth: maximum number of unpacking passes.

    Returns:
        A list with every path reported by ``extract_tar_archive`` /
        ``extract_plain_gzip`` over all passes.

    Errors raised while handling one file are printed and that file is
    skipped; they never abort the whole run.

    NOTE(review): a file that is neither a tar archive nor named ``*.gz`` is
    silently skipped. A single-file gzip stored as ``*.tar.gz`` is written by
    ``extract_plain_gzip`` and will likely end up as a ``*.tar`` file that is
    not a tar archive -- confirm that later cleanup does not discard it.
    """
    archive_suffixes = {".gz", ".xz", ".tar", ".tgz"}
    extracted_files = []
    current_files = [path]
    depth = 0

    while current_files and depth < max_depth:
        next_files = []
        for f in current_files:
            try:
                if tarfile.is_tarfile(f):
                    # is_tarfile also recognises compressed tarballs.
                    print(f"[{depth}] Extracting TAR archive: {f}")
                    new_files = extract_tar_archive(f, out_dir)
                elif f.suffix.lower() == ".gz":
                    print(f"[{depth}] Extracting GZIP file: {f}")
                    new_files = [extract_plain_gzip(f, out_dir)]
                else:
                    # Not something we know how to unpack.
                    continue
            except Exception as e:
                print(f"Kh√¥ng th·ªÉ gi·∫£i n√©n {f.name}: {e}")
                continue

            extracted_files.extend(new_files)
            next_files.extend(new_files)

        # Only outputs that still look like archives need another pass.
        current_files = [
            nf for nf in next_files if nf.suffix.lower() in archive_suffixes
        ]
        depth += 1

    # Report "depth limit" only when work was actually left over; the loop may
    # also end exactly at max_depth with nothing remaining.
    if current_files:
        print("D·ª´ng l·∫°i do ƒë·∫°t gi·ªõi h·∫°n ƒë·ªô s√¢u (max_depth).")
    else:
        print("Ho√†n t·∫•t: kh√¥ng c√≤n file n√©n n√†o ƒë·ªÉ gi·∫£i.")

    return extracted_files


## **Delete**

In [577]:
def _percent_starts_comment(line: str, end: int) -> bool:
    """Return True if ``line[:end]`` contains a ``%`` that starts a TeX comment."""
    pos = line.find('%', 0, end)
    while pos != -1:
        # A '%' preceded by an odd number of backslashes is the literal
        # percent sign (backslash-percent), not a comment start.
        backslashes = 0
        while pos - backslashes - 1 >= 0 and line[pos - backslashes - 1] == '\\':
            backslashes += 1
        if backslashes % 2 == 0:
            return True
        pos = line.find('%', pos + 1, end)
    return False


def comment_include_graphics(tex_path: Path):
    """
    Comment out includegraphics commands in a .tex file so that the source
    still compiles after the image files have been removed.

    For every line, a ``%`` is inserted right before the first
    includegraphics command unless that command already sits inside a
    comment. Escaped percent signs and comments that come *after* the command
    no longer prevent the line from being handled. Everything from the
    inserted ``%`` to the end of the line becomes a comment.

    The file is read and written as latin-1 (a lossless byte round trip) with
    newline translation disabled, so existing line endings are preserved. It
    is rewritten only when at least one line changed. Paths that are not
    regular files are ignored.

    Args:
        tex_path: path of the .tex file to edit in place.
    """
    if not tex_path.is_file():
        return

    marker = '\\includegraphics'
    with open(tex_path, 'r', encoding='latin-1', newline='') as f:
        tex_source = f.readlines()

    graphics_count = 0
    for i, line in enumerate(tex_source):
        at = line.find(marker)
        if at == -1 or _percent_starts_comment(line, at):
            continue
        tex_source[i] = line[:at] + '%' + line[at:]
        graphics_count += 1

    if graphics_count == 0:
        # Nothing to change: leave the file untouched.
        return

    print(f"Commented {graphics_count} includegraphics in {tex_path.name}")

    with open(tex_path, 'w', encoding='latin-1', newline='') as f:
        f.writelines(tex_source)

In [578]:
def delete_image_files(root: Path):
    """
    Recursively delete every file under ``root`` whose (case-insensitive)
    extension is in the image list below. Note that ``.pdf`` is treated as an
    image here.

    Deletion is best-effort: a file that cannot be removed is skipped and not
    counted. A summary line is printed only when something was removed.
    """
    image_suffixes = {".png", ".jpg", ".jpeg", ".pdf", ".eps", ".gif", ".tif", ".tiff", ".bmp", ".svg"}
    candidates = (
        entry
        for entry in root.rglob("*")
        if entry.is_file() and entry.suffix.lower() in image_suffixes
    )

    removed = 0
    for entry in candidates:
        try:
            entry.unlink()
        except Exception:
            continue
        removed += 1

    if removed:
        print(f"Removed {removed} image files from {root.name}")

In [579]:
def delete_non_tex_files(root: Path, keep_exts=None):
    """
    Recursively delete every file under ``root`` whose lower-cased extension
    is not in ``keep_exts`` (default: ``.tex`` and ``.bib``).

    If ``root`` does not exist a warning is printed and nothing else happens.
    Deletion is best-effort: files that cannot be removed are skipped and not
    counted. A kept/deleted summary is always printed at the end.
    """
    keep_exts = {".tex", ".bib"} if keep_exts is None else keep_exts

    if not root.exists():
        print(f"‚ö†Ô∏è Th∆∞ m·ª•c {root} kh√¥ng t·ªìn t·∫°i.")
        return

    count_deleted = count_kept = 0
    for entry in root.rglob("*"):
        if not entry.is_file():
            continue
        if entry.suffix.lower() in keep_exts:
            count_kept += 1
            continue
        try:
            entry.unlink()
        except Exception:
            continue
        count_deleted += 1

    print(f"D·ªçn d·∫πp '{root.name}': gi·ªØ {count_kept}, x√≥a {count_deleted} file kh√°c.")

In [580]:
def delete_empty_dirs(root: Path):
    """
    Remove every empty directory below ``root`` (``root`` itself is kept).

    Paths are visited in descending sort order so children come before their
    parents; a directory that becomes empty once its children are removed is
    therefore removed in the same call. Failures to remove a directory are
    ignored. A summary line is printed only when something was removed.
    """
    deepest_first = sorted(root.rglob("*"))
    deepest_first.reverse()

    count = 0
    for entry in deepest_first:
        if not entry.is_dir():
            continue
        if any(entry.iterdir()):
            continue
        try:
            entry.rmdir()
        except Exception:
            continue
        count += 1

    if count:
        print(f"ƒê√£ x√≥a {count} th∆∞ m·ª•c r·ªóng trong '{root.name}'")

In [581]:
def clean_and_flatten_subdirs(paper_dir: Path):
    """
    Flatten a paper directory so that only top-level .tex/.bib files remain.

    For each immediate sub-directory:
    - no .tex/.bib file anywhere inside        -> the sub-directory is deleted;
    - its set of .tex/.bib file NAMES equals the set of names at the top
      level                                    -> the sub-directory is deleted;
    - otherwise every .tex/.bib file inside is moved to the top level, unless
      a file with that name already exists there, and the sub-directory is
      then deleted.

    Does nothing when ``paper_dir`` is missing or not a directory. A summary
    line is printed at the end.

    NOTE(review): files are compared by name only, never by content, so a
    nested file sharing a name with an existing top-level file is dropped
    together with its sub-directory.
    """
    if not (paper_dir.exists() and paper_dir.is_dir()):
        return

    wanted_suffixes = {".tex", ".bib"}

    def _is_wanted(candidate: Path) -> bool:
        return candidate.suffix.lower() in wanted_suffixes

    # Snapshot of the top-level names, taken once before anything is moved.
    top_level_names = {entry.name for entry in paper_dir.glob("*") if _is_wanted(entry)}

    moved = deleted = 0
    children = [child for child in paper_dir.iterdir() if child.is_dir()]
    for child in children:
        nested_names = {item.name for item in child.rglob("*") if _is_wanted(item)}

        # Only a sub-directory holding a *different* set of names has
        # anything worth rescuing before it is removed.
        if nested_names and nested_names != top_level_names:
            for source in child.rglob("*"):
                if not (source.is_file() and _is_wanted(source)):
                    continue
                dest = paper_dir / source.name
                if dest.exists():
                    continue
                shutil.move(str(source), str(dest))
                moved += 1

        shutil.rmtree(child, ignore_errors=True)
        deleted += 1

    print(f"D·ªçn {paper_dir.name}: di chuy·ªÉn {moved} file, x√≥a {deleted} th∆∞ m·ª•c con.")

In [582]:
'''
for file in actual_download_list:
    file_path = Path(file)
    id = '.'.join(file_path.stem.split('.')[:2])
    target_dir = (WORK_DIR / "extracted" / id).resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nDecompressing {file_path.name} ‚Üí {target_dir}")

    try:
        extract_until_done(file_path, target_dir, max_depth=5)
        print(f"Done extracting {file_path.name}")

        # üîπ L√†m ph·∫≥ng & gom file .tex/.bib h·ª£p l·ªá
        clean_and_flatten_subdirs(target_dir)

        # üîπ Comment d√≤ng includegraphics trong .tex
        for tex_file in target_dir.rglob("*.tex"):
            comment_include_graphics(tex_file)

        # üîπ X√≥a ·∫£nh, file th·ª´a, th∆∞ m·ª•c r·ªóng
        delete_image_files(target_dir)
        delete_non_tex_files(target_dir)
        delete_empty_dirs(target_dir)

        print(f"Ho√†n t·∫•t d·ªçn th∆∞ m·ª•c: {target_dir.name}")

    except Exception as e:
        print(f"L·ªói khi gi·∫£i n√©n {file_path.name}: {e}")
        try:
            shutil.copy2(file_path, target_dir)
            print(f"Copied {file_path.name} v√†o {target_dir}")
        except Exception as copy_err:
            print(f"Kh√¥ng th·ªÉ copy {file_path.name}: {copy_err}")
'''

'\nfor file in actual_download_list:\n    file_path = Path(file)\n    id = \'.\'.join(file_path.stem.split(\'.\')[:2])\n    target_dir = (WORK_DIR / "extracted" / id).resolve()\n    target_dir.mkdir(parents=True, exist_ok=True)\n\n    print(f"\nDecompressing {file_path.name} ‚Üí {target_dir}")\n\n    try:\n        extract_until_done(file_path, target_dir, max_depth=5)\n        print(f"Done extracting {file_path.name}")\n\n        # üîπ L√†m ph·∫≥ng & gom file .tex/.bib h·ª£p l·ªá\n        clean_and_flatten_subdirs(target_dir)\n\n        # üîπ Comment d√≤ng includegraphics trong .tex\n        for tex_file in target_dir.rglob("*.tex"):\n            comment_include_graphics(tex_file)\n\n        # üîπ X√≥a ·∫£nh, file th·ª´a, th∆∞ m·ª•c r·ªóng\n        delete_image_files(target_dir)\n        delete_non_tex_files(target_dir)\n        delete_empty_dirs(target_dir)\n\n        print(f"Ho√†n t·∫•t d·ªçn th∆∞ m·ª•c: {target_dir.name}")\n\n    except Exception as e:\n        print(f"L·ªói khi 

## **Multi-thread**

### **4 luồng song song**

In [583]:
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
from threading import Thread
import shutil, time

def download_version(arxiv_id, client, output_dir, query_delay):
    #T·∫£i 1 version paper t·ª´ arXiv v√† tr·∫£ v·ªÅ ƒë∆∞·ªùng d·∫´n file .tar.gz
    try:
        search = arxiv.Search(id_list=[arxiv_id])
        for result in client.results(search):
            out_path = output_dir / f"{arxiv_id}.tar.gz"
            result.download_source(str(output_dir), filename=f"{arxiv_id}.tar.gz")
            time.sleep(query_delay)
            return out_path
    except Exception as e:
        print(f"Failed to download {arxiv_id}: {e}")
        return None


def extract_and_clean(file_path: Path, work_root: Path):
    #Gi·∫£i n√©n, l·ªçc gi·ªØ l·∫°i .tex/.bib, x√≥a r√°c & file n√©n
    try:
        # T·∫°o th∆∞ m·ª•c ƒë√≠ch: papers/<id_version>/
        id_version = file_path.name.replace(".tar.gz", "").replace(".gz", "").replace(".tar", "")
        target_dir = work_root / "paper" / id_version
        target_dir.mkdir(parents=True, exist_ok=True)

        extract_until_done(file_path, target_dir)
        clean_and_flatten_subdirs(target_dir)
        delete_image_files(target_dir)
        delete_non_tex_files(target_dir)
        delete_empty_dirs(target_dir)

        file_path.unlink(missing_ok=True)  # x√≥a file n√©n g·ªëc
        print(f"{id_version}: Done extract + clean")

    except Exception as e:
        print(f"{file_path.name}: {e}")


def worker_pipeline(queue, client, tmp_dir, work_root, query_delay):
    #Thread worker ch√≠nh: t·∫£i + gi·∫£i n√©n + d·ªçn d·∫πp
    while True:
        arxiv_id = queue.get()
        if arxiv_id is None:  # t√≠n hi·ªáu k·∫øt th√∫c
            break
        file_path = download_version(arxiv_id, client, tmp_dir, query_delay)
        if file_path and file_path.exists():
            extract_and_clean(file_path, work_root)
        queue.task_done()


def run_concurrent_pipeline(version_list, work_root, max_workers=4):
    #Kh·ªüi ƒë·ªông pipeline ƒëa lu·ªìng cho to√†n b·ªô version
    tmp_dir = work_root / "tmp_downloads"
    tmp_dir.mkdir(parents=True, exist_ok=True)

    client = arxiv.Client(page_size=1, delay_seconds=0.2, num_retries=3)

    # T·∫°o h√†ng ƒë·ª£i v√† kh·ªüi t·∫°o thread pool
    queue = Queue()
    threads = []
    for _ in range(max_workers):
        t = Thread(target=worker_pipeline, args=(queue, client, tmp_dir, work_root, 1.5))
        t.start()
        threads.append(t)

    # Th√™m t·∫•t c·∫£ version v√†o h√†ng ƒë·ª£i
    for vid in version_list:
        queue.put(vid)

    queue.join()  # ƒë·ª£i to√†n b·ªô task xong
    for _ in range(max_workers):
        queue.put(None)
    for t in threads:
        t.join()

    print("All papers processed successfully!")
"""


'\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\nfrom queue import Queue\nfrom threading import Thread\nimport shutil, time\n\ndef download_version(arxiv_id, client, output_dir, query_delay):\n    #T·∫£i 1 version paper t·ª´ arXiv v√† tr·∫£ v·ªÅ ƒë∆∞·ªùng d·∫´n file .tar.gz\n    try:\n        search = arxiv.Search(id_list=[arxiv_id])\n        for result in client.results(search):\n            out_path = output_dir / f"{arxiv_id}.tar.gz"\n            result.download_source(str(output_dir), filename=f"{arxiv_id}.tar.gz")\n            time.sleep(query_delay)\n            return out_path\n    except Exception as e:\n        print(f"Failed to download {arxiv_id}: {e}")\n        return None\n\n\ndef extract_and_clean(file_path: Path, work_root: Path):\n    #Gi·∫£i n√©n, l·ªçc gi·ªØ l·∫°i .tex/.bib, x√≥a r√°c & file n√©n\n    try:\n        # T·∫°o th∆∞ m·ª•c ƒë√≠ch: papers/<id_version>/\n        id_version = file_path.name.replace(".tar.gz", "").replace(".gz", "").replace("

### **3 luồng 12 tầng song song**

In [584]:
from pathlib import Path
from threading import Thread
from queue import Queue
import shutil, time, arxiv

# ========= Helpers =========

def strip_compression_suffix(filename: str) -> str:
    """
    Return ``filename`` without its trailing archive/compression extension.

    Only the first matching suffix from the list below is removed, and the
    match is case-sensitive. Compound suffixes are listed before their
    single-part tails so ``x.tar.gz`` yields ``x`` rather than ``x.tar``.
    A name with no known suffix is returned unchanged.
    """
    known_suffixes = (".tar.gz", ".tgz", ".tar.xz", ".txz", ".gz", ".xz", ".zip", ".tar")
    matched = next((suffix for suffix in known_suffixes if filename.endswith(suffix)), None)
    if matched is None:
        return filename
    return filename[: len(filename) - len(matched)]

# ========= Stage workers =========

def downloader_worker(q_ids: Queue,
                      q_to_extract: Queue,
                      client: arxiv.Client,
                      tmp_dir: Path,
                      query_delay: float = 1.5):
    """
    Stage-1 worker: turn versioned arXiv IDs into downloaded source archives.

    Each ID taken from ``q_ids`` (e.g. ``'2301.01234v2'``) is looked up through
    ``client``; the source of the first result is saved as
    ``<tmp_dir>/<id>.tar.gz`` and the worker then sleeps ``query_delay``
    seconds. If that file exists afterwards its path is put on
    ``q_to_extract``. Any exception is printed and the ID is dropped.

    The worker stops when it receives ``None``. ``task_done()`` is called
    exactly once per item taken from ``q_ids``, the sentinel included.
    """
    # iter(get, None) keeps pulling IDs until the None stop sentinel arrives.
    for arxiv_id in iter(q_ids.get, None):
        try:
            lookup = arxiv.Search(id_list=[arxiv_id])
            archive = tmp_dir / f"{arxiv_id}.tar.gz"

            # Only the first search result is downloaded.
            first_hit = next(iter(client.results(lookup)), None)
            if first_hit is not None:
                first_hit.download_source(str(tmp_dir), filename=archive.name)
                time.sleep(query_delay)

            if not archive.exists():
                print(f"[DL] {arxiv_id}: no archive written")
            else:
                q_to_extract.put(archive)
                print(f"[DL] {arxiv_id} -> {archive.name}")
        except Exception as err:
            print(f"[DL] {arxiv_id}: {err}")
        finally:
            q_ids.task_done()

    # Acknowledge the sentinel itself so q_ids.join() can return.
    q_ids.task_done()


def extractor_worker(q_from_dl: Queue,
                     q_to_clean: Queue,
                     papers_root: Path):
    """
    Stage-2 worker: unpack downloaded archives.

    Each archive path taken from ``q_from_dl`` is extracted into
    ``papers_root/<id_version>/`` (the archive name with its compression
    suffix stripped). The target directory is then put on ``q_to_clean`` and
    the archive file is deleted to save disk space. Any exception is printed
    and the archive is skipped.

    The worker stops when it receives ``None``. ``task_done()`` is called
    exactly once per item taken from ``q_from_dl``, the sentinel included.
    """
    # iter(get, None) keeps pulling archives until the None sentinel arrives.
    for archive_path in iter(q_from_dl.get, None):
        try:
            id_version = strip_compression_suffix(archive_path.name)
            target_dir = papers_root / id_version
            ensure_dir(target_dir)

            print(f"[EX] Extract {archive_path.name} ‚Üí {target_dir.name}")
            extract_until_done(archive_path, target_dir, max_depth=5)

            # Hand the directory to the cleaning stage before removing the
            # archive; a failed removal must not fail the whole item.
            q_to_clean.put(target_dir)
            try:
                archive_path.unlink(missing_ok=True)
            except Exception as unlink_err:
                print(f"[EX] cannot remove archive {archive_path}: {unlink_err}")

            print(f"[EX] {id_version}")
        except Exception as err:
            print(f"[EX] {archive_path.name}: {err}")
        finally:
            q_from_dl.task_done()

    # Acknowledge the sentinel itself so q_from_dl.join() can return.
    q_from_dl.task_done()


def cleaner_worker(q_from_extract: Queue):
    """
    Stage-3 worker: tidy an extracted paper directory.

    For each directory taken from ``q_from_extract`` it, in order: flattens
    .tex/.bib files out of sub-directories, comments out includegraphics in
    every .tex file, deletes image files, deletes everything that is not
    .tex/.bib, and removes leftover empty directories. Any exception is
    printed and the directory is skipped.

    The worker stops when it receives ``None``. ``task_done()`` is called
    exactly once per item taken from ``q_from_extract``, the sentinel included.
    """
    # iter(get, None) keeps pulling directories until the None sentinel arrives.
    for paper_dir in iter(q_from_extract.get, None):
        try:
            clean_and_flatten_subdirs(paper_dir)

            # Images are about to be deleted, so neutralise their references.
            for tex_file in paper_dir.rglob("*.tex"):
                comment_include_graphics(tex_file)

            delete_image_files(paper_dir)
            delete_non_tex_files(paper_dir)
            delete_empty_dirs(paper_dir)

            print(f"[CL] cleaned {paper_dir.name}")
        except Exception as err:
            print(f"[CL] {paper_dir}: {err}")
        finally:
            q_from_extract.task_done()

    # Acknowledge the sentinel itself so q_from_extract.join() can return.
    q_from_extract.task_done()


# ========= Orchestrator =========

def run_three_stage_pipeline(version_list,
                             work_root: Path,
                             workers_per_stage: int = 4,
                             q_maxsize: int = 64,
                             client_delay: float = 0.2,
                             client_retries: int = 3):
    """
    Run the download -> extract -> clean pipeline over ``version_list``.

    Three stages, each served by ``workers_per_stage`` daemon threads:
      Stage 1 (download): ``downloader_worker``
      Stage 2 (extract):  ``extractor_worker``
      Stage 3 (clean):    ``cleaner_worker``
    The stages are linked by queues bounded to ``q_maxsize`` items, which
    gives back-pressure so temporary archives do not pile up on disk.

    Archives are downloaded to ``work_root/tmp_downloads`` and papers are
    unpacked under ``work_root/paper``. One ``arxiv.Client`` (configured with
    ``client_delay`` and ``client_retries``) is shared by all download threads.

    The stages are shut down in order: wait for the stage's input queue to
    drain, send one ``None`` sentinel per worker, then join the workers.

    NOTE(review): with ``workers_per_stage=0`` nothing consumes the queues, so
    a non-empty ``version_list`` would block forever on ``put``/``join``.
    """
    tmp_dir = work_root / "tmp_downloads"
    papers_dir = work_root / "paper"
    for directory in (tmp_dir, papers_dir):
        ensure_dir(directory)

    # A single client instance is shared across the download threads.
    client = arxiv.Client(page_size=1, delay_seconds=client_delay, num_retries=client_retries)

    # Input IDs, downloader -> extractor, extractor -> cleaner.
    q_ids, q_to_extract, q_to_clean = (Queue(maxsize=q_maxsize) for _ in range(3))

    def _spawn(target, args):
        return [
            Thread(target=target, args=args, daemon=True)
            for _ in range(workers_per_stage)
        ]

    # Each entry pairs a stage's input queue with the threads consuming it.
    stages = [
        (q_ids, _spawn(downloader_worker, (q_ids, q_to_extract, client, tmp_dir))),
        (q_to_extract, _spawn(extractor_worker, (q_to_extract, q_to_clean, papers_dir))),
        (q_to_clean, _spawn(cleaner_worker, (q_to_clean,))),
    ]

    # All consumers must be running before the bounded input queue is fed.
    for _, workers in stages:
        for worker in workers:
            worker.start()

    for vid in version_list:
        q_ids.put(vid)

    # Drain and stop the stages front to back.
    for stage_queue, workers in stages:
        stage_queue.join()
        for _ in workers:
            stage_queue.put(None)
        for worker in workers:
            worker.join()

    print("Three-stage pipeline finished successfully!")


### **Chạy chương trình**

In [585]:
# Driver cell: build the metadata, derive the list of versions, run the pipeline.
# Metadata checkpoints are written to the same folder the version list is read from.
checkpoint_dir = metadatas_dir

# presumably fetches metadata for every ID in the configured range and
# checkpoints it into `folder` -- defined elsewhere in this notebook.
crawl_arXiv_metadata(START_MONTH, START_ID, END_MONTH, END_ID,
                     folder=checkpoint_dir)

version_list = get_version_list_from_metadata(metadatas_dir)
print(f"Total versions to download: {len(version_list)}")

# Earlier single-queue variant, kept disabled for reference:
#run_concurrent_pipeline(
    #version_list=version_list,
    #work_root=WORK_DIR,
    #max_workers=4
#)

run_three_stage_pipeline(
    version_list=version_list,          # list of versioned IDs such as "2301.01234v2"
    work_root=WORK_DIR,                 # root working directory
    workers_per_stage=4,                # 4 threads per stage
    q_maxsize=64,                       # back-pressure: keeps temporary files from piling up
    client_delay=0.2,                   # arxiv.Client delay
    client_retries=3
)

Creating metadata for 2211.13747...
Creating metadata for 2211.13748...
Creating metadata for 2211.13749...
Creating metadata for 2211.13750...
Fetched metadata for arXiv ID 2211.13747
Creating metadata for 2211.13751...
Fetched metadata for arXiv ID 2211.13748Fetched metadata for arXiv ID 2211.13750

Creating metadata for 2211.13752...
Creating metadata for 2211.13753...
Fetched metadata for arXiv ID 2211.13749
Creating metadata for 2211.13754...
Fetched metadata for arXiv ID 2211.13751
Creating metadata for 2211.13755...
Fetched metadata for arXiv ID 2211.13752
Fetched metadata for arXiv ID 2211.13753
Fetched metadata for arXiv ID 2211.13754
Fetched metadata for arXiv ID 2211.13755
Saved checkpoint 1.
Total versions to download: 11
Total versions to download: 11
[DL] 2211.13749v1 -> 2211.13749v1.tar.gz
[EX] Extract 2211.13749v1.tar.gz ‚Üí 2211.13749v1
[0] Extracting GZIP file: workdir\tmp_downloads\2211.13749v1.tar.gz
Ho√†n t·∫•t: kh√¥ng c√≤n file n√©n n√†o ƒë·ªÉ gi·∫£i.
D·ªçn 2211.1