# <center> **Milestone 1 — Module 2: Reference**

## **Import libraries**

In [1]:
import json, time, signal, random, argparse, re, os, threading
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
import requests
import psutil
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Config and constants**

In [2]:
# Version checkpoint
SCHEMA_VERSION = 1

# user agent for request HTTP
USER_AGENT = "arXiv-Refs-Fetcher/1.0 (+github.com/NguyenAn05)"

# regex check format arXiv ID
FOLDER_ID_RE = re.compile(r"^(\d{4})-(\d{5})$")
ARXIV_ID_RE = re.compile(r"^\d{4}\.\d{5}$")
ARXIV_WITH_VER_RE = re.compile(r"^(\d{4}\.\d{5})v\d+$")

REQUIRED_FIELDS = {"paper_title", "authors", "submission_date", "semantic_scholar_id"}

# Size each shard checkpoint
SHARD_SIZE = 100

REFS_ROOT = Path('/content/drive/MyDrive/[Project] Milestone 1/workdir/metadata and reference')
METAFILE = "metadata.json"
# Path folder checkpoint và file head
CHECKPOINTS_DIR = Path("/content/drive/MyDrive/[Project] Milestone 1/workdir/Checkpoints_reference")
HEAD_PATH = CHECKPOINTS_DIR / "checkpoint_head.json"
FAILED_JSONL_PATH = CHECKPOINTS_DIR / "failed.jsonl"

api_key = "RnqGZQ5DRD6BZaiLhMRq33qTEoH5hsPj6r5Q7z54"  # Replace with the actual API key

# Define headers with API key
headers = {"x-api-key": api_key}


## **Measuring memory**

Function `_proc_mem_bytes()` returns the resident memory (RSS) in bytes for the current process, optionally including child processes, falling back to `/proc/self/status` on Linux if `psutil` is unavailable.

In [None]:
# -----------------------
# Memory helpers
# -----------------------

def _proc_mem_bytes(include_children: bool = True) -> int:
    """
    Returns resident memory (RSS) in bytes for current process (+ children if requested).
    Falls back to /proc on Linux if psutil is unavailable.
    """
    if psutil is not None:
        proc = psutil.Process(os.getpid())
        rss = proc.memory_info().rss
        if include_children:
            for ch in proc.children(recursive=True):
                try:
                    rss += ch.memory_info().rss
                except Exception:
                    pass
        return rss

    # Fallback (Linux): read VmRSS from /proc/self/status
    try:
        with open("/proc/self/status") as f:
            for line in f:
                if line.startswith("VmRSS:"):
                    parts = line.split()
                    kb = int(parts[1])
                    return kb * 1024
    except Exception:
        pass
    return 0

Class `MemorySampler()` periodically samples RAM usage, storing `(t_seconds, rss_bytes)` while tracking peak bytes and a time-weighted average of resident memory.

In [None]:
class MemorySampler(threading.Thread):
    """
    Samples RAM usage over time. Stores (t_seconds, rss_bytes).
    Also keeps a precise time-weighted average (handles uneven sampling).
    """
    def __init__(self, interval: float = 0.5, include_children: bool = True):
        super().__init__(daemon=True)
        self.interval = interval
        self.include_children = include_children
        self.samples: List[Tuple[float, int]] = []
        self._stop_event = threading.Event()
        self.peak_bytes = 0

        # For time-weighted average
        self._last_t: Optional[float] = None
        self._last_v: Optional[int] = None
        self._area: float = 0.0  # integral of bytes over seconds

    def run(self):
        t0 = time.time()
        while not self._stop_event.is_set():
            now_abs = time.time()
            rss = _proc_mem_bytes(self.include_children)
            t_rel = now_abs - t0

            # update series & peak
            self.samples.append((t_rel, rss))
            if rss > self.peak_bytes:
                self.peak_bytes = rss

            # update time-weighted integral
            if self._last_t is not None and self._last_v is not None:
                dt = t_rel - self._last_t
                if dt > 0:
                    # rectangle rule with previous value (stable, avoids spikes)
                    self._area += self._last_v * dt

            self._last_t = t_rel
            self._last_v = rss

            time.sleep(self.interval)

        # one last flush to cover the tail between last sample and stop
        now_abs = time.time()
        if self._last_t is not None and self._last_v is not None and len(self.samples) > 0:
            t_rel = now_abs - (now_abs - (self.samples[-1][0]))  # same as last t_rel
            # nothing to add; sampling already accounted until last recorded time

    def stop(self):
        self._stop_event.set()

    def avg_bytes(self) -> int:
        """
        Returns time-weighted average bytes across the sampled duration.
        If no samples, returns 0.
        """
        if not self.samples:
            return 0
        total_time = self.samples[-1][0] - self.samples[0][0]
        if total_time <= 0:
            # fall back to simple arithmetic mean of values
            return int(sum(v for _, v in self.samples) / max(1, len(self.samples)))
        # Add the area from the last sample to the end as zero-length (already included)
        return int(self._area / total_time)

## **Disk helpers**

Function `_dir_size_bytes()` returns the total size in bytes of all non-symlink regular files under the directory `root`, handling missing or concurrently deleted files robustly.

In [None]:
# Disk helpers
def _dir_size_bytes(root: Path) -> int:
    """
    Sum of file sizes under root (non-symlink regular files).
    Robust to concurrent file deletes.
    """
    total = 0
    if not root.exists():
        return 0
    for p in root.rglob("*"):
        try:
            if p.is_file() and not p.is_symlink():
                total += p.stat().st_size
        except FileNotFoundError:
            continue
        except Exception:
            continue
    return total

Function `_sum_dirs_size()` returns the total size in bytes of all files contained in the given list of directories by summing `_dir_size_bytes` for each directory.

In [None]:
def _sum_dirs_size(dirs: List[Path]) -> int:
    return sum(_dir_size_bytes(Path(d)) for d in dirs)

Class `DiskWatcher()` periodically samples total disk usage (in bytes) across given directories, recording time series samples and tracking initial, final, and peak usage.

In [None]:
class DiskWatcher(threading.Thread):
    """
    Samples total bytes across specified directories over time.
    Tracks initial, final, and peak absolute usage.
    """
    def __init__(self, dirs: List[Path], interval: float = 1.0):
        super().__init__(daemon=True)
        self.dirs = [Path(d) for d in dirs]
        self.interval = interval
        self._stop_event = threading.Event()
        self.samples: List[Tuple[float, int]] = []
        self.initial_bytes: Optional[int] = None
        self.final_bytes: Optional[int] = None
        self.peak_bytes: int = 0

    def run(self):
        t0 = time.time()
        self.initial_bytes = _sum_dirs_size(self.dirs)
        cur = self.initial_bytes
        self.peak_bytes = cur
        while not self._stop_event.is_set():
            cur = _sum_dirs_size(self.dirs)
            t_rel = time.time() - t0
            self.samples.append((t_rel, cur))
            if cur > self.peak_bytes:
                self.peak_bytes = cur
            time.sleep(self.interval)

        # final measurement
        self.final_bytes = _sum_dirs_size(self.dirs)
        if self.final_bytes > self.peak_bytes:
            self.peak_bytes = self.final_bytes

    def stop(self):
        self._stop_event.set()

## **Processing directory tree metadata**

Function `folder_to_arxiv_id()` parses `folder_name` with `FOLDER_ID_RE` and returns the matched arXiv ID string, or `None` if it does not match.

In [None]:
# Function to Convert Folder Name to arXiv ID
def folder_to_arxiv_id(folder_name: str) -> Optional[str]:
    m = FOLDER_ID_RE.match(folder_name.strip())
    if not m:
        return None
    return f"{m.group(1)}.{m.group(2)}"

Function `normalize_id()` cleans and canonicalizes a raw arXiv ID string (handling dashes and optional version suffix), returning the normalized base ID or `None` if invalid.

In [None]:
# Function to Normalize an arXiv ID
def normalize_id(raw_id: str) -> Optional[str]:
    cleaned_id = raw_id.strip()
    cleaned_id = cleaned_id.replace("-", ".")

    match_with_version = ARXIV_WITH_VER_RE.match(cleaned_id)
    if match_with_version is not None:
        base_id = match_with_version.group(1)
    else:
        base_id = cleaned_id

    if ARXIV_ID_RE.match(base_id) is not None:
        return base_id
    else:
        return None


Function `read_id_from_metadata()` reads an arXiv ID from a `metadata.json` file at `meta_path`, normalizes it, and returns the ID or `None` if missing or invalid.

In [None]:
# Function to Read arXiv ID from metadata.json file
def read_id_from_metadata(meta_path: Path) -> Optional[str]:
    if not meta_path.exists():
        return None
    try:
        with meta_path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception:
        return None
    if isinstance(data, dict):
        rid = data.get("id")
        if isinstance(rid, str):
            return normalize_id(rid)
        return None
    if isinstance(data, list):
        for item in data:
            rid = (item or {}).get("id")
            if isinstance(rid, str):
                nid = normalize_id(rid)
                if nid:
                    return nid
    return None

Function `collect_ids_from_tree()` walks direct subdirectories of `root`, extracting unique normalized arXiv IDs from folder names or `metadata.json` files and returning them as a list.

In [None]:
# Function to Collect All arXiv IDs from a Directory Tree
def collect_ids_from_tree(root: Path) -> List[str]:
    ids: List[str] = []
    seen: Set[str] = set()
    for d in root.iterdir():
        if not d.is_dir():
            continue
        # Prioritize fetching/extracting from the folder name
        base = folder_to_arxiv_id(d.name)
        # Fallback by reading metadata.json in the directory if the folder name is invalid
        if not base:
            base = read_id_from_metadata(d / METAFILE)
        if base and base not in seen:
            seen.add(base)
            ids.append(base)
    return ids

## **Checkpoint processing**

Function `shard_id_for_index()` returns the shard ID for an item index `idx` by integer-dividing it by `SHARD_SIZE`.

In [None]:
# Function to Get Shard ID from Item Index
def shard_id_for_index(idx: int) -> int:
    return idx // SHARD_SIZE

Function `shard_path()` ensures the checkpoints directory exists and returns the filesystem path for the given `shard_id` as `cp_{shard_id:06d}.json`.

In [None]:

# Function to Get Shard Path from Shard ID
def shard_path(shard_id: int) -> Path:
    CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
    return CHECKPOINTS_DIR / f"cp_{shard_id:06d}.json"

Function `atomic_write_json()` atomically writes JSON data to `path` by dumping to a temporary file and then replacing the target file.

In [6]:
# Atomically Write JSON File / Write JSON File Atomically
def atomic_write_json(path: Path, data: Any, indent: int = 2) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_name(path.name + ".tmp")
    with tmp.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=indent)
    tmp.replace(path)

Function `load_head()` loads the head checkpoint metadata from `HEAD_PATH`, returning a default schema dict if the file does not yet exist.

In [None]:
# Function to Load the Head Checkpoint Shard
def load_head() -> dict:
    if not HEAD_PATH.exists():
        return {
            "schema_version": SCHEMA_VERSION,
            "last_index": -1,
            "current_shard_id": 0,
            "stats": {"ok": 0, "fail": 0},
        }
    with HEAD_PATH.open("r", encoding="utf-8") as f:
        return json.load(f)

Function `save_head()` saves the head checkpoint metadata `head` to `HEAD_PATH` using `atomic_write_json`.


In [None]:
# Function to Save the Head Checkpoint Shard
def save_head(head: dict) -> None:
    atomic_write_json(HEAD_PATH, head)

Function `append_jsonl()` appends a single JSON-encoded object as one line to the file at `path`, creating parent directories if needed.

In [8]:
# Function to Write Objects to the File at Path (target for writing failed IDs)
def append_jsonl(path: Path, obj: dict) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

Function `init_shard()` creates and returns a new shard metadata dict with schema version, shard ID, start index, open end index, and an empty `processed` map.

In [None]:
# Function to Initialize a New Shard
def init_shard(shard_id: int, start_index: int) -> dict:
    return {
        "schema_version": SCHEMA_VERSION,
        "shard_id": shard_id,
        "start_index": start_index,
        "end_index": None,
        "processed": {},  # id -> {ok, count, retries, ts}
    }

Function `load_shard()` loads shard metadata from its JSON file if it exists, or initializes and returns a new shard starting at `start_index_if_new` otherwise.

In [None]:
# Function to Load a Shard from File, or Initialize New if Not Existent
def load_shard(shard_id: int, start_index_if_new: int) -> dict:
    p = shard_path(shard_id)
    if p.exists():
        with p.open("r", encoding="utf-8") as f:
            return json.load(f)
    return init_shard(shard_id, start_index_if_new)

Function `save_shard()` saves shard metadata to its JSON file using `atomic_write_json` and the shard’s `shard_id`.

In [None]:
# Function to Save a Shard to File
def save_shard(shard: dict) -> None:
    atomic_write_json(shard_path(shard["shard_id"]), shard)

## **Validate the JSON file, normalize the arXiv ID to a folder path, and ensure the ID is in the correct format**

Function `is_valid_references_file()` validates a references JSON file’s existence, structure, and required metadata, returning a `(ok, reason)` tuple.

In [None]:
# Function to validate the references file
def is_valid_references_file(path: Path, min_refs: int = 0, required_fields: set[str] = REQUIRED_FIELDS) -> tuple[bool, str]:
    # Check for file existence
    if not path.exists():
        return False, "file_not_found"

    # Read and validate the JSON file content
    try:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        return False, "json_invalid"
    except OSError as e:
        # fix: use __name__ to get the exception class name
        return False, f"io_error:{type(e).__name__}"

    # Check data structure (must be a dict) and minimum reference count
    if not isinstance(data, dict):
        return False, "root_not_dict"
    if len(data) < min_refs:
        return False, "below_min_refs"

    # Check each sub-entry in the data
    for ref_id, meta in data.items():
        # Check if each entry is a dict
        if not isinstance(meta, dict):
            return False, f"entry_not_dict:{ref_id}"

        # Check for required fields
        missing = required_fields - meta.keys()
        if missing:
            return False, f"missing_fields:{ref_id}:{','.join(sorted(missing))}"

        # Check if Authors is a list of strings
        authors = meta.get("authors")
        if not isinstance(authors, list) or not all(isinstance(a, str) for a in authors):
            return False, f"authors_invalid:{ref_id}"
        # Check if submission_date is a string and not empty (if present)
        sd = meta.get("submission_date")
        if sd is not None and not isinstance(sd, str):
            return False, f"submission_date_not_str:{ref_id}"

        # Check if semantic_scholar_id is a non-empty string
        sid = meta.get("semantic_scholar_id")
        if not sid or not isinstance(sid, str):
            return False, f"paper_id_invalid:{ref_id}"
    return True, "ok"


Function `id_to_folder()` converts an arXiv ID into its storage folder path under `REFS_ROOT` by replacing `.` with `-`.

In [None]:
# Function to convert an arXiv ID to a storage folder
def id_to_folder(arxiv_id: str) -> Path:
    return REFS_ROOT / arxiv_id.replace(".", "-")

Function `validate_arxiv_base_id()` returns `True` if `arxiv_id` matches the base arXiv ID regex `ARXIV_ID_RE`, otherwise `False`.

In [None]:
# Function to validate the arXiv ID format
def validate_arxiv_base_id(arxiv_id: str) -> bool:
    return bool(ARXIV_ID_RE.match(arxiv_id))

## **Extract references from Semantic Scholar**

Function `extract_arxiv_references_to_file()` fetches references for an arXiv paper from the Semantic Scholar API with retries/backoff, saves them as JSON to `out_path`, and returns the number of stored references.

In [11]:
# Function to extract references from Semantic Scholar and save to a file
def extract_arxiv_references_to_file(arxiv_id: str, out_path: Path, session: requests.Session, max_attempts: int = 8, base_backoff: float = 5.0) -> int:
    url = f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}"

    params = {"fields": "references,references.externalIds,references.title,references.paperId,references.authors,references.publicationDate"}

    backoff = base_backoff
    for attempt in range(max_attempts):
        try:
            resp = session.get(url, params=params, headers=headers, timeout=30)
        except requests.RequestException as e:
            # Network error: slight backoff and retry
            sleep_s = min(120, backoff) + random.uniform(0, 0.3)
            print(f"[NET] {arxiv_id}: {e!r}. Retry in {sleep_s:.1f}s")
            time.sleep(sleep_s)
            backoff *= 2
            continue

        # Success: process data and write to file
        if resp.status_code == 200:
            body = resp.json()
            refs = body.get("references", []) or []
            out: Dict[str, Dict[str, Any]] = {}

            for ref in refs:
                ext = ref.get("externalIds") or {}
                arx = ext.get("ArXiv")
                if not arx:
                    continue
                rid = str(arx).replace(".", "-")

                authors = []
                seen = set()
                for a in ref.get("authors", []):
                    name = (a or {}).get("name")
                    if name and name not in seen:
                        authors.append(name)
                        seen.add(name)

                venue = None

                out[rid] = {
                    "paper_title": ref.get("title"),
                    "authors": authors,
                    "submission_date": ref.get("publicationDate"),
                    "semantic_scholar_id": ref.get("paperId")
                }

            atomic_write_json(out_path, out)
            return len(out)

        # Retryable errors
        if resp.status_code in (429, 500, 502, 503, 504):
            sleep_s = min(180, backoff) + random.uniform(0, 0.5)
            print(f"[{resp.status_code}] {arxiv_id}: backoff {sleep_s:.1f}s (attempt {attempt+1}/{max_attempts})")
            time.sleep(sleep_s)
            backoff *= 2
            continue

        # Non-retryable error
        print(f"[HTTP {resp.status_code}] {arxiv_id}: {resp.text[:200]}")
        break

    # Failed
    raise RuntimeError(f"Fetch failed after {max_attempts} attempts")


## **Handle program shutdown**

Function `extract_arxiv_references_to_file()` fetches references for an arXiv paper from the Semantic Scholar API with retries/backoff, saves them as JSON to `out_path`, and returns the number of stored references.

In [12]:
# Class for graceful program shutdown handling
class GracefulShutdown:
   # Initialize and register signal handlers
    def __init__(self):
        self.stop = False
        for sig in (signal.SIGINT, signal.SIGTERM):
            signal.signal(sig, self._handler)

    # Signal handler (requires 2 parameters: signum, frame)
    def _handler(self, signum, frame):
        print(f"\n[Signal] Caught {signum}. Will stop after current item…")
        self.stop = True

    # Returns the stop status
    @property
    def should_stop(self) -> bool:
        return self.stop

## **Parse command-line arguments for the reference-fetching script**

Class `GracefulShutdown` installs SIGINT/SIGTERM handlers that set an internal stop flag, allowing the program to shut down gracefully after the current item finishes.

In [13]:
def get_args(argv=None):
    ap = argparse.ArgumentParser(description="Fetch references for arXiv IDs from metadata.json with robust checkpointing")
    ap.add_argument("--metadata", default="metadata.json", help="Path metadata.json")
    ap.add_argument("--refs-root", default=str(REFS_ROOT), help="Root folder to save refs")
    ap.add_argument("--start-index", type=int, default=None, help="Start from specific index (override checkpoint)")
    ap.add_argument("--sleep", type=float, default=1.1, help="Sleep between papers to respect rate limit")
    ap.add_argument("--min-refs", type=int, default=0, help="The minimum number of refs to consider output valid (0 = accept empty)")
    ap.add_argument("--max-attempts", type=int, default=8, help="The maximum number of attempts per ID")
    ap.add_argument("--dry-run", action="store_true", help="Don't write refs file; just log and update checkpoint")
    ap.add_argument("--failed-jsonl", default=str(CHECKPOINTS_DIR / "failed.jsonl" ),help="Path JSON save id fail")

    return ap.parse_args(argv)

## **Main process**

Function `main()` coordinates CLI parsing, arXiv ID discovery, checkpointed reference fetching with rate limiting, resource monitoring, and graceful shutdown for the whole extraction pipeline.

In [None]:
def main(argv=None):
    t0 = time.perf_counter()
    global REFS_ROOT

    # Parse command line arguments
    args = get_args(argv)

    # Set up storage directory
    REFS_ROOT = Path(args.refs_root)
    REFS_ROOT.mkdir(parents=True, exist_ok=True)

    # Start memory sampler and disk monitor
    mem_sampler = MemorySampler(interval=0.5, include_children=True)
    disk_watcher = DiskWatcher([REFS_ROOT, CHECKPOINTS_DIR], interval=1.0)

    mem_sampler.start()
    disk_watcher.start()

    t0_discovery = time.perf_counter()
    # Read metadata and collect arXiv IDs
    ids = collect_ids_from_tree(REFS_ROOT)
    discovery_time = time.perf_counter() - t0_discovery

    if not ids:
        return
    print(f"Collected {len(ids)} IDs from '{REFS_ROOT}'.")

    print(f"Collected {len(ids)} IDs from {args.metadata}")
    if not ids:
        return

    # Load HEAD checkpoint shard
    head = load_head()


    # Determine starting point
    if args.start_index is not None:
        start_idx = args.start_index
    else:
        start_idx = head.get("last_index", -1) + 1

    start_idx = max(0, start_idx)

    if start_idx >= len(ids):
        print("[Info] Đã xử lý hết danh sách theo HEAD checkpoint.")
        return

    # Load the current shard
    curr_shard_id = shard_id_for_index(start_idx)
    shard = load_shard(curr_shard_id, start_index_if_new=curr_shard_id * SHARD_SIZE)

    # Record the current shard ID
    head["current_shard_id"] = curr_shard_id
    save_head(head)

    # Create shutdown flag for signal handling when needed
    shutdown = GracefulShutdown()
    paper_times = []
    with requests.Session() as sess:
        sess.headers.update({"User-Agent": USER_AGENT, "Accept": "application/json"})

        for idx in range(start_idx, len(ids)):
            t_paper = time.perf_counter()
            # Check if the current ID needs a new shard file
            sid = shard_id_for_index(idx)
            if sid != curr_shard_id:
                shard["end_index"] = idx - 1
                save_shard(shard)
                curr_shard_id = sid
                shard = load_shard(curr_shard_id, start_index_if_new=curr_shard_id * SHARD_SIZE)
                head["current_shard_id"] = curr_shard_id
                save_head(head)

            arxiv_id = ids[idx]
            folder = id_to_folder(arxiv_id)
            folder.mkdir(parents=True, exist_ok=True)
            out_path = folder / "references.json"

            # Skip if a valid file already exists
            ok_existing, _ = is_valid_references_file(out_path, min_refs=args.min_refs)
            if ok_existing:
                try:
                    cnt = len(json.load(out_path.open("r", encoding="utf-8")))
                except Exception:
                    cnt = -1

                # Mark ID as having a valid file, update failed status if present
                prev_retries = shard["processed"].get(arxiv_id, {}).get("retries", 0)
                shard["processed"][arxiv_id] = {"ok": True, "count": cnt, "retries": prev_retries}
                head["last_index"] = idx
                save_shard(shard)
                save_head(head)
                print(f"[SKIP] {arxiv_id}: references.json đã tồn tại (count={cnt}).")


                if shutdown.should_stop:
                    break
                if idx < len(ids) - 1: time.sleep(max(0.0, args.sleep))
                continue

            print(f"[DO ] ({idx+1}/{len(ids)}) {arxiv_id} → {out_path}")

            # Fetch references from ID
            try:
                if args.dry_run:
                    cnt = 0
                else:
                    cnt = extract_arxiv_references_to_file(
                        arxiv_id, out_path, session=sess, max_attempts=args.max_attempts
                    )

                # Validate the newly written file
                if not args.dry_run:
                    ok_new, reason = is_valid_references_file(out_path, min_refs=args.min_refs)
                    if not ok_new:
                        raise RuntimeError(f"Output not valid or below min_refs={args.min_refs}; reason={reason}")

                # Update indices for shard file and head
                prev_retries = shard["processed"].get(arxiv_id, {}).get("retries", 0)
                shard["processed"][arxiv_id] = {"ok": True, "count": cnt, "retries": prev_retries}
                head["stats"]["ok"] += 1
                print(f"[ OK] {arxiv_id}: {cnt} ref")

            except Exception as e:
                head["stats"]["fail"] += 1
                print(f"[ERR] {arxiv_id}: {e!r}")

                append_jsonl(FAILED_JSONL_PATH, {"arxiv_id": arxiv_id, "error": repr(e)})

            finally:
                head["last_index"] = idx
                save_shard(shard)
                save_head(head)
                mem_sampler.stop()
                disk_watcher.stop()
                mem_sampler.join()
                disk_watcher.join()

            dt = time.perf_counter() - t_paper
            paper_times.append(dt)

            # Sleep duration to adhere to rate limit
            if idx < len(ids) - 1:
                time.sleep(max(0.0, args.sleep))

            if shutdown.should_stop:
                print("[Stop] Graceful shutdown requested.")
                break


    # Finalization: close the current shard
    shard["end_index"] = max(shard.get("end_index") or -1, head["last_index"])
    save_shard(shard)
    print(f"Done. HEAD last_index={head['last_index']} | shard={head['current_shard_id']:06d} | OK={head['stats']['ok']} FAIL={head['stats']['fail']}")

    wall_time = time.perf_counter() - t0
    print(f"Wall time: {wall_time:.3f}s")
    print(f"Discovery time : {discovery_time:.3f}s")

    if paper_times:
        avg_time_per_paper = sum(paper_times) / len(paper_times)
        print(
            f"Per-paper time : mean={avg_time_per_paper:.3f}s "
            f"min={min(paper_times):.3f}s max={max(paper_times):.3f}s"
        )
    else:
        print("Per-paper time : mean=0.000s min=N/A max=N/A (no papers)")

    print(
        f"[MEM] peak={mem_sampler.peak_bytes / (1024**2):.1f} MiB | "
        f"avg={mem_sampler.avg_bytes() / (1024**2):.1f} MiB (time-weighted)"
    )

    if disk_watcher.initial_bytes is not None:
        print(
            "[DISK] init={:.1f} MiB | final={:.1f} MiB | peak={:.1f} MiB".format(
                disk_watcher.initial_bytes / (1024**2),
                disk_watcher.final_bytes / (1024**2),
                disk_watcher.peak_bytes / (1024**2),
            )
        )

In [15]:
main([])


Collected 1501 IDs from '/content/drive/MyDrive/[Project] Milestone 1/workdir/metadata and reference'.
Collected 1501 IDs from metadata.json
[DO ] (1/1501) 2211.14747 → /content/drive/MyDrive/[Project] Milestone 1/workdir/metadata and reference/2211-14747/references.json
[ OK] 2211.14747: 3 ref
[DO ] (2/1501) 2211.14745 → /content/drive/MyDrive/[Project] Milestone 1/workdir/metadata and reference/2211-14745/references.json
[ OK] 2211.14745: 22 ref
[DO ] (3/1501) 2211.14744 → /content/drive/MyDrive/[Project] Milestone 1/workdir/metadata and reference/2211-14744/references.json
[ OK] 2211.14744: 5 ref
[DO ] (4/1501) 2211.14743 → /content/drive/MyDrive/[Project] Milestone 1/workdir/metadata and reference/2211-14743/references.json
[ OK] 2211.14743: 5 ref
[DO ] (5/1501) 2211.14742 → /content/drive/MyDrive/[Project] Milestone 1/workdir/metadata and reference/2211-14742/references.json
[ OK] 2211.14742: 39 ref
[DO ] (6/1501) 2211.14741 → /content/drive/MyDrive/[Project] Milestone 1/workdir/m