In [None]:
# Install the notebook's dependencies into the kernel's environment (uncomment to run):
# %pip install -q arxiv pandas tqdm python-slugify

In [None]:
from pathlib import Path
import re
import pandas as pd
from tqdm import tqdm
from slugify import slugify
import arxiv
import urllib.request

# Create the output folders if they do not already exist.
BASE_DIR = Path(".")
PDF_DIR = BASE_DIR / "data" / "raw" / "papers"    # downloaded PDFs
META_DIR = BASE_DIR / "data" / "metadata"         # metadata CSV
PDF_DIR.mkdir(parents=True, exist_ok=True)
META_DIR.mkdir(parents=True, exist_ok=True)

# Target corpus size: TARGET_MAX bounds the number of papers collected;
# TARGET_MIN is presumably the smallest acceptable corpus.
TARGET_MIN = 60
TARGET_MAX = 120

# Inclusive publication-year window.
YEAR_FROM = 2020
YEAR_TO = 2026

# Query focused on hallucination & faithfulness.
# Every multi-word term is double-quoted so that it is searched as a phrase
# ("factual consistency" was previously left unquoted, unlike its siblings).
QUERY = """
(
  hallucination OR faithful OR faithfulness OR groundedness OR attribution OR
  "factual consistency" OR "factuality" OR "hallucinated"
)
AND
(
  "large language model" OR LLM OR "language model"
)
"""

# Common arXiv categories for LLM papers, to avoid mathematical/physics papers.
CATEGORIES = ["cs.CL", "cs.AI", "cs.IR", "cs.LG"]

# rewrite filename to be more filesystem-friendly
def safe_filename(year: int, first_author: str, title: str, arxiv_id: str) -> str:
    """Build a short, stable, filesystem-friendly PDF file name for a paper.

    The result has the form ``{year}_{author}_{title}_{arxiv_id}.pdf``.

    Args:
        year: Publication year, used verbatim as the name's prefix.
        first_author: First author's name; slugified and cut to 24 characters.
        title: Paper title; slugified and cut to 60 characters.
        arxiv_id: arXiv identifier, e.g. ``"2401.12345v2"``.

    Returns:
        The file name (no directory component), ending in ``.pdf``.
    """
    # Strip after slicing: a cut that lands on a separator would otherwise
    # leave a dangling "-" right before the "_" field delimiter. Fall back to
    # a placeholder when the slug comes out empty.
    author = slugify(first_author, lowercase=True)[:24].strip("-") or "unknown"
    short_title = slugify(title, lowercase=True)[:60].strip("-") or "untitled"

    # Keep letters as well as digits/dots. Dropping the "v" would fuse the
    # version into the number ("2401.12345v2" -> "2401.123452"). Any other
    # character (e.g. the "/" of an old-style id) becomes a single "-".
    arx = re.sub(r"[^0-9A-Za-z.]+", "-", arxiv_id).strip("-.")
    return f"{year}_{author}_{short_title}_{arx}.pdf"


In [10]:
client = arxiv.Client(page_size=50, delay_seconds=3, num_retries=3)

# Collapse the multi-line QUERY into a single line so the request does not
# carry raw newlines and indentation.
query_terms = " ".join(QUERY.split())
category_filter = " OR ".join(f"cat:{cat}" for cat in CATEGORIES)

search = arxiv.Search(
    query=f"({query_terms}) AND ({category_filter})",
    # max_results is deliberately left at its default: the TARGET_MAX cap is
    # enforced in the loop below, AFTER the year filter, so the filter cannot
    # shrink the result set below the target. Results are fetched lazily, page
    # by page, so breaking out of the loop also stops the paging.

    # get newest papers first
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
)

results = []
for r in client.results(search):

    # filter by year (inclusive window)
    year = r.published.year
    if year > YEAR_TO:
        continue
    if year < YEAR_FROM:
        # Results arrive newest first, so everything from here on is older
        # than the window; stop instead of paging through the rest.
        break

    first_author = r.authors[0].name if r.authors else "unknown"
    arxiv_id = r.get_short_id()  # e.g. 2401.12345v2
    pdf_url = r.pdf_url

    # Metadata extraction. Titles and abstracts may contain hard line wraps;
    # split()/join collapses every whitespace run to a single space.
    results.append({
        "paper_id": arxiv_id,
        "title": " ".join(r.title.split()),
        "year": year,
        "first_author": first_author,
        "authors": ", ".join([a.name for a in r.authors])[:1000],
        "published": r.published.isoformat(),
        "updated": r.updated.isoformat() if r.updated else "",
        "categories": ", ".join(r.categories),
        "summary": " ".join(r.summary.split()),
        "pdf_url": pdf_url,
        "arxiv_url": r.entry_id,
    })

    # Cap the number of papers that passed the filter.
    if len(results) >= TARGET_MAX:
        break

if len(results) < TARGET_MIN:
    print(f"Warning: only {len(results)} papers matched; target minimum is {TARGET_MIN}")

len(results), results[0]["title"] if results else None


(120, 'Reinforced Fast Weights with Next-Sequence Prediction')

In [None]:
rows = []
downloaded = 0

# Download PDFs and prepare metadata rows
for item in tqdm(results, total=len(results)):
    fname = safe_filename(item["year"], item["first_author"], item["title"], item["paper_id"])
    pdf_path = PDF_DIR / fname

    # Build a fresh row instead of mutating the dicts in `results`, so that
    # re-running this cell never inherits a stale "download_error".
    row = {**item, "file_name": fname, "file_path": str(pdf_path)}

    # download only if not exists
    if not pdf_path.exists():
        # Download to a temporary name and rename on success: an interrupted
        # transfer must not leave a truncated file at pdf_path, where it would
        # pass the exists() checks and never be retried.
        tmp_path = pdf_path.with_name(pdf_path.name + ".part")
        try:
            urllib.request.urlretrieve(item["pdf_url"], tmp_path)
            tmp_path.replace(pdf_path)
            downloaded += 1
        except Exception as e:
            # Best effort: record the failure and keep going with the rest.
            tmp_path.unlink(missing_ok=True)
            row["download_error"] = str(e)
    rows.append(row)

df = pd.DataFrame(rows)

# Keep only successfully downloaded PDFs
if df.empty:
    # No rows means no "file_path" column; avoid a KeyError on an empty search.
    df_ok = df.copy()
else:
    df_ok = df[df["file_path"].apply(lambda p: Path(p).exists())].copy()

out_csv = META_DIR / "papers.csv"
df_ok.to_csv(out_csv, index=False)

print("Downloaded:", downloaded)
print("Total available locally:", len(df_ok))
print("Saved metadata:", out_csv)
df_ok.head(3)


100%|██████████| 120/120 [06:39<00:00,  3.33s/it]

Downloaded: 120
Total available locally: 120
Saved metadata: data\metadata\papers.csv





Unnamed: 0,paper_id,title,year,first_author,authors,published,updated,categories,summary,pdf_url,arxiv_url,file_name,file_path
0,2602.16704v1,Reinforced Fast Weights with Next-Sequence Pre...,2026,Hee Seung Hwang,"Hee Seung Hwang, Xindi Wu, Sanghyuk Chun, Olga...",2026-02-18T18:53:18+00:00,2026-02-18T18:53:18+00:00,cs.CL,Fast weight architectures offer a promising al...,https://arxiv.org/pdf/2602.16704v1,http://arxiv.org/abs/2602.16704v1,2026_hee-seung-hwang_reinforced-fast-weights-w...,data\raw\papers\2026_hee-seung-hwang_reinforce...
1,2602.16671v1,SPARC: Scenario Planning and Reasoning for Aut...,2026,Jaid Monwar Chowdhury,"Jaid Monwar Chowdhury, Chi-An Fu, Reyhaneh Jab...",2026-02-18T18:09:03+00:00,2026-02-18T18:09:03+00:00,"cs.SE, cs.AI",Automated unit test generation for C remains a...,https://arxiv.org/pdf/2602.16671v1,http://arxiv.org/abs/2602.16671v1,2026_jaid-monwar-chowdhury_sparc-scenario-plan...,data\raw\papers\2026_jaid-monwar-chowdhury_spa...
2,2602.16660v1,"Align Once, Benefit Multilingually: Enforcing ...",2026,Yuyan Bu,"Yuyan Bu, Xiaohao Liu, ZhaoXing Ren, Yaodong Y...",2026-02-18T18:01:23+00:00,2026-02-18T18:01:23+00:00,"cs.CL, cs.AI, cs.LG",The widespread deployment of large language mo...,https://arxiv.org/pdf/2602.16660v1,http://arxiv.org/abs/2602.16660v1,2026_yuyan-bu_align-once-benefit-multilinguall...,data\raw\papers\2026_yuyan-bu_align-once-benef...


In [None]:
# Count rows whose paper_id repeats an earlier row
dups = df_ok.duplicated(subset="paper_id").sum()
print(f"Duplicate paper_id: {dups}")

# Number of papers per publication year, ordered by year
year_counts = df_ok["year"].value_counts()
print(year_counts.sort_index())

Duplicate paper_id: 0
year
2026    120
Name: count, dtype: int64
