In [1]:
import requests
from xml.etree import ElementTree as ET

def get_quantph_total():
    url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": "cat:quant-ph",
        "start": 0,
        "max_results": 0  # only return metadata, not entries
    }
    resp = requests.get(url, params=params)
    root = ET.fromstring(resp.content)
    ns = {'arxiv': 'http://a9.com/-/spec/opensearch/1.1/'}
    total_results = root.find('arxiv:totalResults', ns)
    return int(total_results.text)

total = get_quantph_total()
print(f"Total quant-ph papers: {total}")


Total quant-ph papers: 163070


In [13]:
import requests
import sqlite3
import time
from typing import Dict, List, Tuple
from tqdm import tqdm
DB_PATH = "arxiv_citation.db"
NUM_PAPERS = 163070

# -------------------------------
# STEP 1: Fetch arXiv IDs
# -------------------------------
def fetch_arxiv_ids(max_results: int) -> List[str]:
    base_url = "http://export.arxiv.org/api/query"
    ids = []
    for start in tqdm(range(0, max_results, 100),desc="Fetching arXiv IDs"):
        params = {
            "search_query": f"cat:quant-ph",
            "start": start,
            "max_results": 100,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        }
        resp = requests.get(base_url, params=params)
        entries = resp.text.split("<entry>")
        for entry in entries[1:]:
            try:
                id_line = next(line for line in entry.split("\n") if "<id>" in line)
                url = id_line.strip().replace("<id>", "").replace("</id>", "")
                arxiv_id = url.split("/")[-1]
                ids.append(arxiv_id)
            except StopIteration:
                continue
        if len(ids) >= max_results:
            break
    return ids

arxiv_ids = fetch_arxiv_ids( NUM_PAPERS)

Fetching arXiv IDs: 100%|██████████| 1631/1631 [17:32<00:00,  1.55it/s]


In [None]:
# -------------------------------
# STEP 2: Fetch Metadata + Citations from Semantic Scholar
# -------------------------------
def fetch_citation_edges(arxiv_ids: List[str]) -> Tuple[Dict[str, dict], List[Tuple[str, str]]]:
    edges = []
    paper_data = {}
    headers = {"User-Agent": "qubit-citation-crawler"}

    for arxiv_id in arxiv_ids:
        s2_url = f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}?fields=title,year,references.paperId"
        try:
            resp = requests.get(s2_url, headers=headers)
            if resp.status_code != 200:
                continue
            data = resp.json()
            paper_data[data['paperId']] = {
                "arxiv_id": arxiv_id,
                "title": data.get("title", ""),
                "year": data.get("year", None)
            }
            refs = data.get("references", [])
            for ref in refs:
                ref_id = ref.get("paperId")
                if ref_id:
                    edges.append((data['paperId'], ref_id))
            time.sleep(1)  # to respect API rate limits
        except Exception as e:
            print(f"Error fetching {arxiv_id}: {e}")
            continue

    return paper_data, edges

print("Fetching metadata and references from Semantic Scholar...")
paper_data, edges = fetch_citation_edges(arxiv_ids)
print(f"Fetched metadata for {len(paper_data)} papers and {len(edges)} citation edges.")

In [18]:
s2_url = f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_ids[3]}?fields=title,year,references.paperId"
resp = requests.get(s2_url, headers={"User-Agent": "qubit-citation-crawler"})


In [19]:
resp, resp.status_code

(<Response [429]>, 429)

In [20]:
data = resp.json()
data

{'message': 'Too Many Requests. Please wait and try again or apply for a key for higher rate limits. https://www.semanticscholar.org/product/api#api-key-form',
 'code': '429'}

In [None]:

paper_data[data['paperId']] = {
    "arxiv_id": arxiv_ids[3],
    "title": data.get("title", ""),
    "year": data.get("year", None)
}
refs = data.get("references", [])
for ref in refs:
    ref_id = ref.get("paperId")
    if ref_id:
        edges.append((data['paperId'], ref_id))


In [None]:
# -------------------------------
# STEP 3: Store in SQLite
# -------------------------------
def create_db(db_path: str = DB_PATH):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute("""
        CREATE TABLE IF NOT EXISTS papers (
            paper_id TEXT PRIMARY KEY,
            arxiv_id TEXT,
            title TEXT,
            year INTEGER
        )
    """)
    c.execute("""
        CREATE TABLE IF NOT EXISTS citations (
            source_id TEXT,
            target_id TEXT,
            PRIMARY KEY (source_id, target_id)
        )
    """)
    conn.commit()
    return conn

def insert_data(conn, paper_data: Dict[str, dict], edges: List[Tuple[str, str]]):
    c = conn.cursor()
    for pid, meta in paper_data.items():
        c.execute("INSERT OR IGNORE INTO papers VALUES (?, ?, ?, ?)",
                  (pid, meta.get("arxiv_id", ""), meta["title"], meta["year"]))
    for src, tgt in edges:
        c.execute("INSERT OR IGNORE INTO citations VALUES (?, ?)", (src, tgt))
    conn.commit()

print("Saving to SQLite database...")
conn = create_db()
insert_data(conn, paper_data, edges)
print("Done.")