In [None]:
from langchain.agents import initialize_agent, AgentType, Tool
from langchain.memory import ConversationBufferMemory

import sys
# sys.path.append("../")
from configs import config
from configs.llm_provider import get_llm
from research_trend_analyzer.tools.paper_fetch_tools import paper_fetch_toolkit
from research_trend_analyzer.tools.paper_summary_tools import paper_analyze_toolkit
from langchain_experimental.plan_and_execute import PlanAndExecute, load_agent_executor, load_chat_planner

# 1. Initialize LLM
config.LLM_PROVIDER = 'gemini'
llm = get_llm(config)
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# 1. Create the planner (LLM decides on multi-step plan)
planner = load_chat_planner(llm)

# 2. Create the executor (agent capable of tool execution)
executor = load_agent_executor(
    llm=llm,
    tools=paper_fetch_toolkit+paper_analyze_toolkit,
    verbose=True
)

# 3. Combine into plan-and-execute agent
master_agent = PlanAndExecute(
    planner=planner,
    executor=executor,
    memory=memory,
    verbose=True
)

# # 2. Create fetch and analyze agents
# paper_fetch_agent = initialize_agent(
#     tools=paper_fetch_toolkit,
#     llm=llm,
#     agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
#     verbose=True
# )

# paper_analyze_agent = initialize_agent(
#     tools=paper_analyze_toolkit,
#     llm=llm,
#     agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
#     verbose=True
# )

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [None]:
from research_trend_analyzer.configs.log_config import configure_logging
configure_logging()  # Make sure logging is set up first


In [3]:
master_agent.run("filter privacy-related papers from NIPS 2023, write a summary for each of the paper")

  master_agent.run("filter privacy-related papers from NIPS 2023, write a summary for each of the paper")




[1m> Entering new PlanAndExecute chain...[0m


ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 250
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 14
}
]

In [24]:
steps=[Step(value='Access the official NeurIPS 2023 proceedings or paper list.'), Step(value='Search the paper titles and abstracts using keywords such as "privacy," "differential privacy," "federated learning," "anonymity," "confidentiality," and "privacy-preserving."'), Step(value='Review the search results to identify papers primarily focused on privacy.'), Step(value='Select 10 distinct and relevant privacy-related papers from the identified list.'), Step(value='Present the titles and, if possible, authors or links for the 10 selected papers.'), Step(value='Given the above steps taken, please respond to the users original question.')]


NameError: name 'Step' is not defined

In [None]:
import pymupdf
from research_trend_analyzer_light.utils.paper_process import download_pdf

In [3]:
resp = download_pdf("https://www.dfki.de/fileadmin/user_upload/import/5224_paper12.pdf")

In [6]:
doc = pymupdf.open(resp['data']['path'])

In [16]:
doc

Document('temp/paper.pdf')

In [8]:
text = ''
for page in doc:
    text += page.get_text()

In [11]:
from pprint import pprint
len(text)

43265

In [12]:
from configs.llm_provider import get_text_splitter

In [13]:
from configs import config

In [14]:
text_splitter = get_text_splitter(config)

In [20]:
from langchain_core.documents import Document
document = [Document(text)]

In [21]:
document

[Document(metadata={}, page_content='Advances in Deep Parsing of Scholarly Paper\nContent\nUlrich Sch¨afer and Bernd Kiefer\nLanguage Technology Lab\nGerman Research Center for Artiﬁcial Intelligence (DFKI)\nCampus D3 1, D-66123 Saarbr¨ucken, Germany\n{ulrich.schaefer,kiefer}@dfki.de\nhttp://www.dfki.de/lt\nAbstract. We report on advances in deep linguistic parsing of the full\ntextual content of 8200 papers from the ACL Anthology, a collection of\nelectronically available scientiﬁc papers in the ﬁelds of Computational\nLinguistics and Language Technology.\nWe describe how – by incorporating new techniques – we increase both\nspeed and robustness of deep analysis, speciﬁcally on long sentences\nwhere deep parsing often failed in former approaches. With the current\nopen source HPSG (Head-driven phrase structure grammar) for English\n(ERG), we obtain deep parses for more than 85% of the sentences in the\n1.5 million sentences corpus, while the former approaches achieved only\napprox. 65

In [22]:
split_chunks = text_splitter.split_documents(document)

In [23]:
split_chunks

[Document(metadata={}, page_content='Advances in Deep Parsing of Scholarly Paper\nContent\nUlrich Sch¨afer and Bernd Kiefer\nLanguage Technology Lab\nGerman Research Center for Artiﬁcial Intelligence (DFKI)\nCampus D3 1, D-66123 Saarbr¨ucken, Germany\n{ulrich.schaefer,kiefer}@dfki.de\nhttp://www.dfki.de/lt\nAbstract. We report on advances in deep linguistic parsing of the full\ntextual content of 8200 papers from the ACL Anthology, a collection of\nelectronically available scientiﬁc papers in the ﬁelds of Computational\nLinguistics and Language Technology.\nWe describe how – by incorporating new techniques – we increase both\nspeed and robustness of deep analysis, speciﬁcally on long sentences'),
 Document(metadata={}, page_content='We describe how – by incorporating new techniques – we increase both\nspeed and robustness of deep analysis, speciﬁcally on long sentences\nwhere deep parsing often failed in former approaches. With the current\nopen source HPSG (Head-driven phrase structur

In [29]:
from research_trend_analyzer_light.utils.paper_process import download_pdf, parse_pdf

download_pdf("https://papers.nips.cc/paper_files/paper/2020/file/021bbc7ee20b71134d53e20206bd6feb-Paper.pdf", paper_path = "temp/pdfs/test.pdf")


{'status': 'success',
 'message': 'Downloaded PDF successfully.',
 'data': {'pdf_url': 'https://papers.nips.cc/paper_files/paper/2020/file/021bbc7ee20b71134d53e20206bd6feb-Paper.pdf',
  'path': 'temp/pdfs/test.pdf',
  'bytes': 13026651}}

In [32]:
import pymupdf
paper_path = "temp/pdfs/test.pdf"
with pymupdf.open(paper_path) as doc:
    for i, page in enumerate(doc):
        print(i, page.search_for("References"))


0 []
1 []
2 []
3 []
4 []
5 []
6 []
7 []
8 []
9 [Rect(108.0, 464.3731384277344, 163.54383850097656, 476.3283386230469)]
10 []
11 []
12 []


In [35]:
import re
from typing import Optional, Tuple, Dict
import fitz  # PyMuPDF


# --- Section title patterns (extend as needed) ---
ACK_TITLES = [r"acknowledg(e)?ment(s)?"]  # acknowledgment / acknowledgement / acknowledgments
REF_TITLES = [r"references?", r"bibliography", r"works\s+cited"]

ACK_RE = re.compile(rf"^\s*({'|'.join(ACK_TITLES)})\s*$", re.I)
REF_RE = re.compile(rf"^\s*({'|'.join(REF_TITLES)})\s*$", re.I)

# Generic "looks like a heading" (not strictly required here, but useful if you extend logic)
GEN_HEADING_RE = re.compile(r"^[A-Z0-9][A-Z0-9\s\-:&]{2,}$")


def _first_anchor_on_page(page: fitz.Page, title_re: re.Pattern, title_variants) -> Optional[Tuple[float, str]]:
    """
    Try to find a heading match on a page; return (y0, matched_text) if found.
    Strategy:
      1) font-aware scan via get_text('dict') to match exact heading lines
      2) fallback to search_for() for string variants (returns Rects)
    """
    info = page.get_text("dict")
    best = None
    best_size = -1.0

    for block in info.get("blocks", []):
        if block.get("type") != 0:
            continue
        for line in block.get("lines", []):
            text = "".join(span.get("text", "") for span in line.get("spans", [])).strip()
            if not text:
                continue
            if title_re.match(text):
                sizes = [span.get("size", 0) for span in line.get("spans", [])]
                avg_size = sum(sizes) / len(sizes) if sizes else 0
                y0 = line["spans"][0]["bbox"][1]
                if avg_size > best_size:
                    best = (y0, text)
                    best_size = avg_size

    if best:
        return best

    # Fallback: literal search for common variants
    for variant in title_variants:
        hits = page.search_for(variant)  # returns list[Rect]
        if hits:
            return (hits[0].y0, variant)

    return None


def _find_earliest_anchor(doc: fitz.Document) -> Optional[Tuple[int, float, str]]:
    """
    Scan pages forward to find the earliest of Acknowledgments or References.
    Returns (page_index, y_start, label) where label is 'acknowledgments' or 'references'.
    """
    for pno in range(doc.page_count):
        page = doc[pno]
        # Check ACK
        a = _first_anchor_on_page(page, ACK_RE, ["Acknowledgments", "Acknowledgements", "Acknowledgment"])
        if a:
            y, _ = a
            return (pno, y, "acknowledgments")
        # Check REF
        r = _first_anchor_on_page(page, REF_RE, ["References", "Bibliography", "Works Cited"])
        if r:
            y, _ = r
            return (pno, y, "references")
    return None


def _page_text_up_to_y(page: fitz.Page, y_limit: float) -> str:
    """
    Extract text from the page strictly ABOVE y_limit, using line-level positions.
    """
    info = page.get_text("dict")
    out_lines = []
    for block in info.get("blocks", []):
        if block.get("type") != 0:
            continue
        for line in block.get("lines", []):
            # take line's first span y0 as line baseline
            if not line.get("spans"):
                continue
            y0 = line["spans"][0]["bbox"][1]
            if y0 < y_limit - 1e-3:
                text = "".join(span.get("text", "") for span in line.get("spans", [])).rstrip()
                if text:
                    out_lines.append(text)
    return "\n".join(out_lines).strip()


def extract_until_ack_or_refs(pdf_path: str, include_anchor_page: bool = True) -> Dict[str, Optional[str]]:
    """
    Extract main text from the start of the PDF until the first occurrence of
    Acknowledgments/References.

    If include_anchor_page=True:
        - include the entire page that contains the detected section
        - then stop (exclude subsequent pages)

    If include_anchor_page=False:
        - stop right before the heading on that page (slice by y)

    Returns:
        {
          "text": <extracted_text or None>,
          "stop_section": "acknowledgments" | "references" | None,
          "stop_page": <int or None>
        }
    """
    with fitz.open(pdf_path) as doc:
        if doc.page_count == 0:
            return {"text": None, "stop_section": None, "stop_page": None}

        anchor = _find_earliest_anchor(doc)

        # No anchor? Return all text
        if anchor is None:
            full = []
            for pno in range(doc.page_count):
                full.append(doc[pno].get_text("text"))
            return {"text": "\n".join(full).strip(), "stop_section": None, "stop_page": None}

        stop_page, y_anchor, label = anchor

        # Collect all pages before stop_page
        parts = [doc[p].get_text("text") for p in range(stop_page)]

        # Handle stop_page
        if include_anchor_page:
            parts.append(doc[stop_page].get_text("text"))
        else:
            parts.append(_page_text_up_to_y(doc[stop_page], y_anchor))

        return {
            "text": "\n".join(parts).strip(),
            "stop_section": label,
            "stop_page": stop_page,
        }


In [None]:
def parse_pdf(paper_path: str):
    """
        Parse text from a local PDF at `paper_path`, and return the text. 

        Args:
            paper_path (str): Absolute or relative path to the PDF file on disk.
    """
    if not isinstance(paper_path, str) or not paper_path.strip():
        return make_response("error", "No paper_path provided.", None)

    try:
        with pymupdf.open(paper_path) as doc:
            text = "".join(page.get_text() for page in doc)
            page_count = doc.page_count
        return make_response("success", f"Parsed {page_count} page(s).",
                                         {"text": text, "page_count": page_count})
    except Exception as e:
        return make_response("error", f"Failed to parse PDF: {e}", None)


In [None]:
import os
import re
from typing import Optional
import fitz  # PyMuPDF
from utils.helper_func import make_response
from research_trend_analyzer_light.utils.paper_process import download_pdf

# Compile once
ACK_PAT = re.compile(r"^\s*acknowledg(e)?ment(s)?\s*$", re.IGNORECASE)
REF_PAT = re.compile(r"^\s*(references?|bibliography|works\s+cited)\s*$", re.IGNORECASE)

def validate_pdf_path(pdf_path: str) -> None:
    if not isinstance(pdf_path, str) or not pdf_path.strip():
        raise ValueError("pdf_path must be a non-empty string.")
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")
    if not os.path.isfile(pdf_path):
        raise ValueError(f"Not a file: {pdf_path}")
    if not pdf_path.lower().endswith(".pdf"):
        raise ValueError("The provided path does not have a .pdf extension.")

def parse_pdf(
    pdf_path: str,
    include_anchor_page: bool = False,  # <- default now EXCLUDES text after the heading
    early_stop: bool = True,
):
    """
    Extract text from the PDF. By default, stops BEFORE the first line that looks like an
    'Acknowledgments' or 'References' heading (case-insensitive, whole-line match) and
    EXCLUDES that heading and everything after it on that page.

    Returns via make_response(status, message, data) where data is the extracted text or None.
    """
    try:
        validate_pdf_path(pdf_path)
    except Exception as e:
        return make_response("error", str(e), None)

    try:
        with fitz.open(pdf_path) as doc:
            page_count = doc.page_count
            if page_count == 0:
                return make_response("warning", "Empty PDF (0 pages).", None)

            # If not stopping early, return the entire document text
            if not early_stop:
                try:
                    text = "".join(doc[pno].get_text("text") for pno in range(page_count))
                except Exception as e:
                    return make_response("error", f"Failed to extract text: {e}", None)
                return make_response("success", f"Parsed {page_count} page(s).", text)

            # early_stop=True: stop at first matching heading
            parts = []
            stop_section: Optional[str] = None
            stop_page: Optional[int] = None

            for pno in range(page_count):
                try:
                    page_text = doc[pno].get_text("text")
                except Exception as e:
                    return make_response("error", f"Failed to read page {pno}: {e}", None)

                lines = page_text.splitlines()

                hit_idx = None
                hit_label = None
                for i, ln in enumerate(lines):
                    if ACK_PAT.match(ln):
                        hit_idx, hit_label = i, "acknowledgments"
                        break
                    if REF_PAT.match(ln):
                        hit_idx, hit_label = i, "references"
                        break

                if hit_idx is None:
                    parts.append(page_text)
                    continue

                # Found a stop section on this page
                stop_section = hit_label
                stop_page = pno

                if include_anchor_page:
                    # Include entire anchor page, then stop
                    parts.append(page_text)
                else:
                    # EXCLUDE the heading and everything after it on this page
                    before = "\n".join(lines[:hit_idx]).rstrip()
                    parts.append(before)

                break  # stop after handling the anchor page

            text_out = "\n".join(parts).strip() if parts else None

            if stop_section is not None and stop_page is not None:
                return make_response(
                    "success",
                    f"Stopped at {stop_section} on page {stop_page}",
                    text_out,
                )
            else:
                # No stop section found — return everything read
                return make_response(
                    "success",
                    "Reached end of document without finding a stop section.",
                    text_out,
                )

    except Exception as e:
        return make_response("error", f"Failed to parse PDF: {e}", None)


In [13]:
paths = [
    "https://papers.nips.cc/paper_files/paper/2020/file/01e00f2f4bfcbb7505cb641066f2859b-Paper.pdf",
    "https://arxiv.org/pdf/2410.12336", 
    "https://petsymposium.org/popets/2025/popets-2025-0003.pdf",
    "https://petsymposium.org/popets/2025/popets-2025-0002.pdf",
         ]

for path in paths:
    download_pdf(path, paper_path = "temp/pdfs/test.pdf")
    res = parse_pdf("temp/pdfs/test.pdf")
    if res['status'] == 'success':
        print(res['message'])
        print(res['data'][-500:])

    print("*"*20)


Stopped at acknowledgments on page 8
ity from data sets about people while offering formal
guarantees of privacy to individuals who contribute data. While these beneﬁts are largely positive,
unintended harms could arise due to misapplication of differential privacy or misconceptions about its
guarantees. Additionally, difﬁcult social choices are faced when deciding how to balance privacy and
utility. Our work addresses a foundational differential privacy task and enables better utility-privacy
tradeoffs within this broader context.
********************
Stopped at references on page 9
ess
permission using smart contracts and efficient trail of data access [1].
While our study observed many gaps between the user’s privacy
goals and the data practices by service providers, our findings call
for interdisciplinary research to complement different approaches at
system and design levels to design privacy inclusive IoT solutions.
We hope this paper helps to guide the directions for future rese

In [7]:
print(res['text'])

NameError: name 'res' is not defined

In [1]:
from utils.helper_func import parse_markdown_summary, load_md_file

md_file = load_md_file("papers/paper_summary/neurips_2020/CH/adversarially_robust_streaming_algorithms_via_differential_privacy.md")


In [2]:
content = md_file['data']

In [5]:
res = parse_markdown_summary(content)

In [6]:
res['data'].keys()

dict_keys(['Paper Info', 'Brief Summary', 'Detailed Summary'])

In [7]:
res['data']['Paper Info'].keys()

dict_keys(['Title', 'Authors', 'Affiliations'])

In [8]:
res['data']['Brief Summary'].keys()

dict_keys(['Highlight', 'Keywords'])

In [None]:
import json

json.dumps()

'["Ryan Mckenna", "Daniel R. Sheldon"]'

In [12]:
a = ['Ryan Mckenna', 'Daniel R. Sheldon']
f"[{', '.join(a)}]"

'[Ryan Mckenna, Daniel R. Sheldon]'

In [28]:
import arxiv
from datetime import datetime, timedelta, timezone


client = arxiv.Client()

In [31]:
query = (
    '(ti:"usable privacy" OR abs:"usable privacy") '
    'AND (cat:cs.CR OR cat:cs.CY OR cat:cs.HC OR cat:cs.LG OR cat:stat.ML)'
)

# 2) Ask arXiv for the newest submissions first.
search = arxiv.Search(
    query=query,
    max_results=200,  # pull a larger pool, we’ll filter for recency below if you want
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
)

# 3) Fetch results (newer arxiv library style).
client = arxiv.Client(page_size=100, delay_seconds=3, num_retries=3)
results = list(client.results(search))

# Optional: keep only papers from, say, the last 12 months.
cutoff = datetime.now(timezone.utc) - timedelta(days=90)
recent = [r for r in results if r.published and r.published >= cutoff]

# Demo print
for r in recent[:20]:
    print(r.published.date(), "-", r.title)

2025-08-22 - SafeSpace: An Integrated Web Application for Digital Safety and Emotional Well-being
2025-07-03 - Rethinking Data Protection in the (Generative) Artificial Intelligence Era


In [26]:
results = client.results(search)

In [27]:
list(results)

[arxiv.Result(entry_id='http://arxiv.org/abs/2509.05382v1', updated=datetime.datetime(2025, 9, 5, 1, 1, 21, tzinfo=datetime.timezone.utc), published=datetime.datetime(2025, 9, 5, 1, 1, 21, tzinfo=datetime.timezone.utc), title="User Privacy and Large Language Models: An Analysis of Frontier Developers' Privacy Policies", authors=[arxiv.Result.Author('Jennifer King'), arxiv.Result.Author('Kevin Klyman'), arxiv.Result.Author('Emily Capstick'), arxiv.Result.Author('Tiffany Saade'), arxiv.Result.Author('Victoria Hsieh')], summary="Hundreds of millions of people now regularly interact with large language\nmodels via chatbots. Model developers are eager to acquire new sources of\nhigh-quality training data as they race to improve model capabilities and win\nmarket share. This paper analyzes the privacy policies of six U.S. frontier AI\ndevelopers to understand how they use their users' chats to train models.\nDrawing primarily on the California Consumer Privacy Act, we develop a novel\nqualit

In [44]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import logging
import re
import sys
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from urllib.parse import urljoin
from tqdm import tqdm

# ------------------------- helpers -------------------------

def make_response(status: str, message: str, data: Any) -> Dict[str, Any]:
    return {"status": status, "message": message, "data": data}

def write_jsonl(path: str, records: List[Dict[str, Any]]) -> None:
    with open(path, "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

# ------------------------- Base -------------------------

class BaseFetcher(ABC):
    SITE: str = "Base"
    HEADERS: Dict = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    def fetch(self, year: int) -> Dict[str, Any]:
        """Public API: validate → scrape → wrap in make_response."""
        try:
            y = self._validate_year(year)
            data = self._scrape(y, self.HEADERS)
            if not data:
                return make_response("success", f"No papers found for {self.SITE} {y}.", [])
            return make_response("success", f"Fetched {len(data)} papers from {self.SITE} {y}.", data)
        except Exception as e:
            return make_response("error", f"{self.SITE} error: {e}", None)

    # -------- hooks for subclasses --------
    @abstractmethod
    def _scrape(self, year: int, headers: Dict[str, str]) -> List[Dict[str, Any]]:
        ...

    # -------- shared utilities --------
    def _validate_year(self, year: int) -> int:
        try:
            y = int(year)
        except Exception:
            raise ValueError("year must be an integer.")
        if not (1900 <= y <= 2100):
            raise ValueError(f"year out of expected range: {y}.")
        return y

    def _fetch_html(self, url: str, headers: Dict[str, str], label: str, timeout: int = 30) -> str:
        try:
            import requests
        except Exception as e:
            raise RuntimeError(f"Missing dependency 'requests': {e}")
        try:
            resp = requests.get(url, headers=headers, allow_redirects=True, timeout=timeout)
        except requests.RequestException as e:
            raise RuntimeError(f"Network error fetching {label}: {e}")
        except Exception as e:
            raise RuntimeError(f"Unexpected error fetching {label}: {e}")
        if resp.status_code != 200:
            raise RuntimeError(f"HTTP {resp.status_code} fetching {url}")
        ctype = (resp.headers.get("Content-Type") or "").lower()
        if "html" not in ctype:
            logging.warning("Expected HTML but got Content-Type=%s for %s", ctype, url)
        return resp.text

    def _soup(self, html: str):
        try:
            from bs4 import BeautifulSoup
        except Exception as e:
            raise RuntimeError(f"Missing dependency 'beautifulsoup4': {e}")
        try:
            return BeautifulSoup(html, "html.parser")
        except Exception as e:
            raise RuntimeError(f"Failed to parse HTML: {e}")

    def _split_authors(self, text: str) -> List[str]:
        parts = re.split(r",|\band\b", text or "", flags=re.IGNORECASE)
        return [re.sub(r"\s+", " ", p).strip() for p in parts if p and p.strip()]

    # Optional override per site
    def _html_to_pdf_link(self, url: str) -> str:
        return url or ""

# ------------------------- USENIX Security -------------------------

class UsenixSecurityFetcher(BaseFetcher):
    SITE = "USENIX Security"
    BASE = "https://www.usenix.org"

    def _sessions_url(self, year: int) -> str:
        yy = f"{year % 100:02d}"
        return f"{self.BASE}/conference/usenixsecurity{yy}/technical-sessions"

    def _scrape(self, year: int, headers: Dict[str, str]) -> List[Dict[str, Any]]:
        # 1) collect presentation page URLs for the year
        sessions_url = self._sessions_url(year)
        html = self._fetch_html(sessions_url, headers, f"{self.SITE} technical sessions {year}")
        soup = self._soup(html)

        yy = f"{year % 100:02d}"
        pres_links = []
        for a in soup.select(f'a[href*="/conference/usenixsecurity{yy}/presentation/"]'):
            href = a.get("href")
            if href:
                pres_links.append(urljoin(sessions_url, href))
        pres_links = sorted(set(pres_links))

        # 2) parse each presentation page for title, authors, pdf
        results: List[Dict[str, Any]] = []
        for url in tqdm(pres_links, f"Fetching USENIX {year} papers"):
            try:
                phtml = self._fetch_html(url, headers, "presentation page")
                psoup = self._soup(phtml)

                # title
                h1 = psoup.find("h1")
                title = (h1.get_text(strip=True) if h1 else "").strip()
                if not title:
                    continue

                # authors (prefer meta tags; fallback to BibTeX)
                authors = [m.get("content").strip() for m in psoup.select('meta[name="citation_author"]') if m.get("content")]
                if not authors:
                    # try BibTeX: author = {A and B and C}
                    m = re.search(r"author\s*=\s*\{([^}]+)\}", psoup.get_text("\n", strip=True), flags=re.IGNORECASE | re.DOTALL)
                    if m:
                        authors = [a.strip() for a in m.group(1).split(" and ") if a.strip()]

                # pdf url (skip slides if both exist)
                pdf_url = ""
                for a in psoup.select('a[href$=".pdf"], a[href*=".pdf?"]'):
                    href = a.get("href") or ""
                    if href:
                        cand = urljoin(url, href)
                        name = cand.lower()
                        if "slides" in name or "talk" in name:
                            continue
                        pdf_url = cand
                        break
                if not pdf_url:
                    continue

                results.append({"title": title, "authors": authors, "paper_url": pdf_url})
            except Exception as e:
                logging.warning("Skipping one presentation (%s): %s", url, e)

        return results



In [45]:
year = 2024
fetcher = UsenixSecurityFetcher()
resp = fetcher.fetch(year)
if resp["status"] == "success":
    out = f"usenix_security_{year}.jsonl"
    write_jsonl(out, resp["data"])
    print(resp["message"])
    print(f"Wrote {len(resp)} JSONL to {out}")
else:
    print(resp["message"])

Fetching USENIX 2024 papers:  50%|█████     | 210/418 [01:25<01:24,  2.47it/s]


KeyboardInterrupt: 

In [43]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import logging
import re
import sys
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from urllib.parse import urljoin
from tqdm import tqdm


def make_response(status: str, message: str, data: Any) -> Dict[str, Any]:
    return {"status": status, "message": message, "data": data}

def write_jsonl(path: str, records: List[Dict[str, Any]]) -> None:
    with open(path, "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

# ------------------------- Base -------------------------

class BaseFetcher(ABC):
    SITE: str = "Base"
    HEADERS: Dict = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "application/pdf,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }

    def fetch(self, year: int) -> Dict[str, Any]:
        """Public API: validate → scrape → wrap in make_response."""
        try:
            y = self._validate_year(year)
            data = self._scrape(y, self.HEADERS)
            if not data:
                return make_response("success", f"No papers found for {self.SITE} {y}.", [])
            return make_response("success", f"Fetched {len(data)} papers from {self.SITE} {y}.", data)
        except Exception as e:
            return make_response("error", f"{self.SITE} error: {e}", None)

    @abstractmethod
    def _scrape(self, year: int, headers: Dict[str, str]) -> List[Dict[str, Any]]:
        ...

    def _validate_year(self, year: int) -> int:
        try:
            y = int(year)
        except Exception:
            raise ValueError("year must be an integer.")
        if not (1900 <= y <= 2100):
            raise ValueError(f"year out of expected range: {y}.")
        return y

    def _fetch_html(self, url: str, headers: Dict[str, str], label: str, timeout: int = 30) -> str:
        try:
            import requests
        except Exception as e:
            raise RuntimeError(f"Missing dependency 'requests': {e}")
        try:
            resp = requests.get(url, headers=headers, allow_redirects=True, timeout=timeout)
        except requests.RequestException as e:
            raise RuntimeError(f"Network error fetching {label}: {e}")
        except Exception as e:
            raise RuntimeError(f"Unexpected error fetching {label}: {e}")
        if resp.status_code != 200:
            raise RuntimeError(f"HTTP {resp.status_code} fetching {url}")
        return resp.text

    def _soup(self, html: str):
        try:
            from bs4 import BeautifulSoup
        except Exception as e:
            raise RuntimeError(f"Missing dependency 'beautifulsoup4': {e}")
        return BeautifulSoup(html, "html.parser")

    def _split_authors(self, text: str) -> List[str]:
        parts = re.split(r",|\band\b", text or "", flags=re.IGNORECASE)
        return [re.sub(r"\s+", " ", p).strip() for p in parts if p and p.strip()]

# ------------------------- DBLP → USENIX Security -------------------------

class DblpUsenixFetcher(BaseFetcher):
    SITE = "DBLP→USENIX Security"
    DBLP_BASE = "https://dblp.org/db/conf/uss"

    def _toc_url(self, year: int) -> str:
        return f"{self.DBLP_BASE}/uss{year}.html"

    def _scrape(self, year: int, headers: Dict[str, str]) -> List[Dict[str, Any]]:
        toc_url = self._toc_url(year)
        toc_html = self._fetch_html(toc_url, headers, f"DBLP TOC {year}")
        soup = self._soup(toc_html)

        # Each paper row is typically a <li class="entry inproceedings"> under ul.publ-list
        items = soup.select("ul.publ-list li.entry.inproceedings")
        if not items:
            # Fallback (DBLP occasionally tweaks classes)
            items = soup.select("li.entry")

        results: List[Dict[str, Any]] = []
        for li in tqdm(items, f"Fetching USENIX {year} papers"):
            try:
                # ---- title
                title_el = li.select_one("span.title")
                title = (title_el.get_text(strip=True) if title_el else "").strip()
                if not title:
                    # Fallback: sometimes title is within <cite>
                    cite = li.find("cite")
                    if cite:
                        title = (cite.get_text(" ", strip=True) or "").strip()
                        # Trim trailing session/venue noise if present
                        title = re.split(r"\.\s{2,}|\s{2,}view\s", title, maxsplit=1)[0].strip()

                if not title:
                    continue

                # ---- authors (schema.org microdata is stable on DBLP)
                author_els = li.select('span[itemprop="author"] span[itemprop="name"]')
                authors = [a.get_text(strip=True) for a in author_els if a.get_text(strip=True)]
                if not authors:
                    # Fallback: collect visible author anchors near the title
                    authors = [a.get_text(strip=True) for a in li.select("a[href*='/pid/']")]
                    if not authors:
                        # Last resort: split the cite text
                        authors = self._split_authors(li.get_text(" ", strip=True))

                # ---- find USENIX landing (electronic edition) then extract PDF
                usenix_url = ""
                for a in li.select("a[href]"):
                    href = a.get("href") or ""
                    if "usenix.org" in href:
                        usenix_url = href
                        break

                if not usenix_url:
                    # Try via DBLP record page
                    rec = next((a.get("href") for a in li.select("a[href^='https://dblp.org/rec/']") if a.get("href")), "")
                    if rec:
                        rec_html = self._fetch_html(rec, headers, "DBLP record")
                        rec_soup = self._soup(rec_html)
                        link = rec_soup.select_one("a[href*='usenix.org']")
                        usenix_url = link.get("href") if link else ""

                if not usenix_url:
                    # If we can’t reach a landing page, skip (DBLP usually has one)
                    continue

                # On the USENIX presentation page, grab a direct PDF link
                pdf_url = ""
                try:
                    page_html = self._fetch_html(usenix_url, headers, "USENIX presentation")
                    page_soup = self._soup(page_html)
                    # Prefer a paper PDF over slides/preprint if both exist
                    pdf_as = page_soup.select("a[href$='.pdf'], a[href*='.pdf?']")
                    # Heuristic: prefer text containing 'Paper'; otherwise first PDF
                    cand = next((a for a in pdf_as if re.search(r"paper", a.get_text("", strip=True), re.I)), None)
                    pdf_url = (cand or (pdf_as[0] if pdf_as else None)).get("href") if (cand or pdf_as) else ""
                    if pdf_url and not pdf_url.startswith("http"):
                        pdf_url = urljoin(usenix_url, pdf_url)
                except Exception:
                    pdf_url = ""

                # Fallback to the USENIX landing if no direct PDF discovered
                paper_url = pdf_url or usenix_url

                results.append({"title": title, "authors": authors, "paper_url": paper_url})
            except Exception as e:
                logging.warning("Skipping an entry: %s", e)

        return results

# ------------------------- CLI -------------------------

if __name__ == "__main__":
    year = 2024
    out_path = sys.argv[2] if len(sys.argv) > 2 else f"usenix_security_{year}.jsonl"
    fetcher = DblpUsenixFetcher()
    resp = fetcher.fetch(year)
    if resp["status"] == "success":
        write_jsonl(out_path, resp["data"])
        print(resp["message"])
        print(f"Wrote JSONL to {out_path}")
    else:
        print(resp["message"])
        sys.exit(1)

Fetching USENIX 2024 papers:  50%|████▉     | 208/418 [01:45<01:46,  1.96it/s]


KeyboardInterrupt: 

In [None]:
class DblpAAAIFetcher(BaseFetcher):
    SITE = "DBLP→AAAI"
    DBLP_BASE = "https://dblp.org/db/conf/aaai"

    def _toc_url(self, year: int) -> str:
        # DBLP yearly TOC, e.g., https://dblp.org/db/conf/aaai/aaai2024.html
        return f"{self.DBLP_BASE}/aaai{year}.html"

    def _scrape(self, year: int, headers: Dict[str, str]) -> List[Dict[str, Any]]:
        toc = self._toc_url(year)
        html = self._fetch_html(toc, headers, f"DBLP AAAI {year} TOC")
        soup = self._soup(html)

        items = soup.select("ul.publ-list li.entry.inproceedings") or soup.select("li.entry")
        results: List[Dict[str, Any]] = []

        for li in tqdm(items, f"Fetching AAAI {year} papers"):
            try:
                # -------- title
                t_el = li.select_one("span.title")
                title = (t_el.get_text(strip=True) if t_el else "").strip()
                if not title:
                    c = li.find("cite")
                    if c:
                        title = (c.get_text(" ", strip=True) or "").strip()
                if not title:
                    continue

                # -------- authors (DBLP microdata)
                a_els = li.select('span[itemprop="author"] span[itemprop="name"]')
                authors = [a.get_text(strip=True) for a in a_els if a.get_text(strip=True)]
                if not authors:
                    # last resort: split nearby text
                    text = li.get_text(" ", strip=True)
                    parts = re.split(r"\s{2,}|\s-\s", text)
                    head = parts[0] if parts else text
                    authors = [re.sub(r"\s+", " ", s).strip() for s in re.split(r",|\band\b", head) if s.strip()]

                # -------- landing link (prefer AAAI OJS)
                hrefs = [a.get("href") or "" for a in li.select("a[href]")]
                landing = next((h for h in hrefs if "ojs.aaai.org/index.php/AAAI/article/view" in h), "")
                if not landing:
                    # open DBLP record to find OJS/DOI if missing on TOC row
                    rec = next((h for h in hrefs if h.startswith("https://dblp.org/rec/")), "")
                    if rec:
                        rec_html = self._fetch_html(rec, headers, "DBLP record")
                        rec_soup = self._soup(rec_html)
                        link = rec_soup.select_one("a[href*='ojs.aaai.org/index.php/AAAI/article/view'], a[href*='doi.org/10.1609/']")
                        landing = link.get("href") if link else ""
                if not landing:
                    # fall back to the first “electronic edition” of any kind
                    landing = next((h for h in hrefs if h.startswith("http")), "")

                # -------- get a direct PDF if available on the landing page
                paper_url = landing
                try:
                    page_html = self._fetch_html(landing, headers, "AAAI landing")
                    page_soup = self._soup(page_html)
                    # OJS typically exposes direct PDFs like .../article/view/<id>/<pdfid>
                    a_pdf = page_soup.select_one("a[href$='.pdf'], a[href*='/article/view/'][href*='/']")
                    if a_pdf:
                        paper_url = urljoin(landing, a_pdf.get("href") or "")
                except Exception:
                    pass  # keep landing URL

                results.append({"title": title, "authors": authors, "paper_url": paper_url})
            except Exception as e:
                logging.warning("Skipping one AAAI entry: %s", e)

        return results

In [None]:
if __name__ == "__main__":
    year = 2023
    out_path = sys.argv[2] if len(sys.argv) > 2 else f"aaai_{year}.jsonl"
    fetcher = DblpAAAIFetcher()
    resp = fetcher.fetch(year)
    if resp["status"] == "success":
        write_jsonl(out_path, resp["data"])
        print(resp["message"])
        print(f"Wrote JSONL to {out_path}")
    else:
        print(resp["message"])
        sys.exit(1)