In [9]:
from langchain_core.tools import ToolException
import requests
from bs4 import BeautifulSoup
import os
from tqdm import tqdm
import json
from typing import Dict, List, Any, Optional


In [None]:
import os
import json
from typing import Optional, List, Dict, Any

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def save_paper_info(paper_info: Dict[str, Any], conference: str, year: int) -> str:
    """
    Append one paper record to the JSONL file of a conference edition.

    The record is written as a single JSON line (non-ASCII characters are kept
    as-is) to ``paper_list/<conference>/<conference><year>.jsonl``, where the
    conference name is lower-cased. Missing directories are created.

    Args:
        paper_info (Dict[str, Any]): JSON-serializable metadata of the paper.
        conference (str): Conference name, e.g. "neurips" (case-insensitive).
        year (int): Conference year, used in the file name.

    Returns:
        str: Relative path of the JSONL file the record was appended to.
    """
    conf_slug = conference.lower()
    filepath = os.path.join("paper_list", conf_slug, f"{conf_slug}{year}.jsonl")
    parent_dir = os.path.dirname(filepath)
    os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(paper_info, ensure_ascii=False)
        sink.write(f"{serialized}\n")
    return filepath


def load_paper_list(filepath: str) -> List[Dict[str, Any]]:
    """
    Load a list of paper metadata from a JSONL file (one JSON object per line).

    Blank or whitespace-only lines are ignored, so a file with a trailing
    empty line or manually edited spacing still loads.

    Args:
        filepath (str): Path to the JSONL file.

    Returns:
        List[Dict[str, Any]]: Parsed records in file order. Empty list if the
        file does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(filepath):
        return []
    with open(filepath, "r", encoding="utf-8") as f:
        # json.loads("") raises, so skip lines that contain only whitespace.
        return [json.loads(line) for line in f if line.strip()]


def paper_matches_topic(paper: Dict[str, Any], topic_keywords: List[str]) -> bool:
    """
    Tell whether any keyword occurs in a paper's title or abstract.

    Matching is a case-insensitive substring test. A missing or ``None``
    abstract is treated as empty text.

    Args:
        paper (Dict[str, Any]): Paper metadata with a non-empty 'title' and an
            optional 'abstract'.
        topic_keywords (List[str]): Keywords or phrases to look for.

    Returns:
        bool: True as soon as one keyword is found, otherwise False.

    Raises:
        ValueError: If 'title' is missing, empty, or only whitespace.
    """
    raw_title = paper.get("title")
    if not (raw_title and raw_title.strip()):
        raise ValueError("Paper dictionary must include a non-empty 'title' field.")
    abstract_text = (paper.get("abstract") or "").lower()
    title_text = raw_title.lower()
    # Keywords are lower-cased lazily, one at a time, and the scan stops at the first hit.
    return any(
        needle in title_text or needle in abstract_text
        for needle in (keyword.lower() for keyword in topic_keywords)
    )


def get_neurips_abstract_links(year: int) -> List[str]:
    """
    Collect the abstract-page URLs listed on the NeurIPS proceedings index of a year.

    Args:
        year (int): The year of the NeurIPS conference (e.g., 2023).

    Returns:
        List[str]: Absolute URLs built by prefixing ``https://papers.nips.cc``
        to every anchor href that ends with ``-Abstract.html``, in page order.

    Raises:
        requests.RequestException: If the request fails or returns an error status.
    """
    index_url = f"https://papers.nips.cc/paper/{year}/"  # trailing slash required
    page = requests.get(index_url, timeout=20)
    page.raise_for_status()
    parsed = BeautifulSoup(page.text, "html.parser")

    abstract_urls: List[str] = []
    for anchor in parsed.find_all("a", href=True):
        href = anchor["href"]
        if href.endswith("-Abstract.html"):
            abstract_urls.append("https://papers.nips.cc" + href)
    return abstract_urls


def parse_neurips_paper_from_html(url: str, html: str) -> Dict[str, Any]:
    """
    Extract paper metadata from the HTML of one NeurIPS abstract page.

    The parser is positional: the title is the first ``<h4>``, the authors are
    the last ``<i>`` and the abstract is the third ``<p>`` of the page.

    Args:
        url (str): The full URL of the abstract page (stored as ``url_web``).
        html (str): The HTML content of that page.

    Returns:
        Dict[str, Any]: Keys ``title``, ``authors``, ``abstract``, ``url_web``
        and ``url_pdf`` (in that order).

    Raises:
        ValueError: If one of the expected elements is missing, or if the page
            does not contain exactly one link ending in ``paper.pdf``.
    """
    page = BeautifulSoup(html, "html.parser")
    try:
        title_text = page.find_all("h4")[0].text
        author_text = page.find_all("i")[-1].text
        abstract_text = page.find_all("p")[2].text
    except Exception as e:
        raise ValueError(f"Error parsing metadata from {url}: {e}")

    # Exactly one PDF link is expected; anything else means the layout is unknown.
    pdf_hrefs: List[str] = []
    for anchor in page.find_all("a", href=True):
        href = anchor["href"]
        if href.lower().endswith("paper.pdf"):
            pdf_hrefs.append(href)
    if len(pdf_hrefs) != 1:
        raise ValueError(f"Found incorrect pdf url for {url}: {pdf_hrefs}")

    return {
        "title": title_text,
        "authors": author_text,
        "abstract": abstract_text,
        "url_web": url,
        "url_pdf": "https://papers.nips.cc" + pdf_hrefs[0],
    }


def fetch_single_paper_sync(
    client: requests.Session,
    url: str,
    keywords: Optional[List[str]]
) -> Optional[Dict[str, Any]]:
    """
    Download one abstract page through an existing session and parse it.

    This is best-effort: any exception raised while fetching, parsing or
    keyword-matching is printed and turned into a ``None`` result.

    Args:
        client (requests.Session): Session used to issue the GET request.
        url (str): The paper abstract URL to fetch.
        keywords (Optional[List[str]]): When non-empty, the paper is returned
            only if its title or abstract matches at least one keyword.

    Returns:
        Optional[Dict[str, Any]]: The parsed metadata, or None when the paper
        is filtered out or an error occurred.
    """
    try:
        reply = client.get(url, timeout=20)
        reply.raise_for_status()
        record = parse_neurips_paper_from_html(url, reply.text)
        wanted = (not keywords) or paper_matches_topic(record, keywords)
    except Exception as err:
        print(f"Error parsing {url}: {err}")
        return None
    return record if wanted else None


def fetch_neurips_sync(
    year: int,
    max_papers: Optional[int] = None,
    keywords: Optional[List[str]] = None
) -> str:
    """
    Fetch NeurIPS papers' metadata for a year synchronously and save new entries to JSONL.

    Steps:
        1. Load existing papers to dedupe.
        2. Scrape the list of abstract URLs (optionally truncated to ``max_papers``).
        3. Sequentially fetch & parse each not-yet-saved paper with a progress bar.
        4. Append only new papers to the JSONL file.

    Args:
        year (int): Year of the NeurIPS conference.
        max_papers (Optional[int]): Maximum number of index entries to consider.
            Considers all if None; 0 means "none".
        keywords (Optional[List[str]]): If provided, only save papers containing these keywords.

    Returns:
        str: File path to the JSONL file with saved paper info.

    Raises:
        requests.RequestException: If the proceedings index cannot be fetched.
            Failures on individual papers are printed and skipped.
    """
    conference = "neurips"
    filepath = os.path.join(
        "paper_list", conference, f"{conference}{year}.jsonl"
    )
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # 1) Load existing papers and dedupe by URL
    existing = load_paper_list(filepath)
    seen_urls = {p["url_web"] for p in existing if "url_web" in p}
    print(f">>> Already have {len(seen_urls)} papers saved.")

    # 2) Get all abstract URLs
    paper_urls = get_neurips_abstract_links(year)
    # Compare against None so that max_papers=0 is honoured instead of meaning "all".
    if max_papers is not None:
        paper_urls = paper_urls[:max_papers]
    print(f">>> Found {len(paper_urls)} candidate URLs.")

    # Skip pages that are already on disk before spending a request on them.
    pending_urls = [url for url in paper_urls if url not in seen_urls]

    # 3) Fetch & filter
    new_papers: List[Dict[str, Any]] = []
    with requests.Session() as client:
        for url in tqdm(pending_urls, desc=f"Parsing NeurIPS {year} abstracts"):
            meta = fetch_single_paper_sync(client, url, keywords)
            # Re-check: the index itself may list the same URL more than once.
            if meta and meta["url_web"] not in seen_urls:
                new_papers.append(meta)
                seen_urls.add(meta["url_web"])

    print(f">>> {len(new_papers)} new papers to save.")

    # 4) Append new ones
    with open(filepath, "a", encoding="utf-8") as f:
        for paper in new_papers:
            f.write(json.dumps(paper, ensure_ascii=False) + "\n")
    print(f">>> Appended {len(new_papers)} papers to {filepath}")

    return filepath


In [None]:
import asyncio
import os
import json
from typing import Optional, List, Dict, Any
import httpx
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm_asyncio

def save_paper_info(paper_info: Dict[str, Any], conference: str, year: int) -> str:
    """
    Append ``paper_info`` as one JSON line to the conference/year JSONL file.

    The target is ``paper_list/<conference>/<conference><year>.jsonl`` with the
    conference name lower-cased; parent directories are created on demand.

    Returns:
        str: The path of the file that was appended to.
    """
    folder_name = conference.lower()
    file_name = f"{conference.lower()}{year}.jsonl"
    filepath = os.path.join("paper_list", folder_name, file_name)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "a", encoding="utf-8") as out:
        out.write("".join((json.dumps(paper_info, ensure_ascii=False), "\n")))
    return filepath

def load_paper_list(filepath: str) -> List[Dict[str, Any]]:
    """
    Read a JSONL file into a list of dictionaries (one JSON object per line).

    Blank or whitespace-only lines are skipped instead of crashing the parser.

    Args:
        filepath (str): Path to the JSONL file.

    Returns:
        List[Dict[str, Any]]: Parsed records in file order; empty list when the
        file does not exist.
    """
    if not os.path.exists(filepath):
        return []
    with open(filepath, "r", encoding="utf-8") as f:
        # json.loads("") raises, so ignore lines that hold only whitespace.
        return [json.loads(line) for line in f if line.strip()]

def paper_matches_topic(paper: Dict[str, Any], topic_keywords: List[str]) -> bool:
    """
    Case-insensitive substring match of keywords against title and abstract.

    Args:
        paper (Dict[str, Any]): Metadata with a non-empty 'title'; 'abstract' is optional.
        topic_keywords (List[str]): Keywords or phrases to search for.

    Returns:
        bool: True if any keyword appears in the title or the abstract.

    Raises:
        ValueError: If 'title' is missing or blank.
    """
    raw_title = paper.get("title")
    if not (raw_title and raw_title.strip()):
        raise ValueError("Paper dictionary must include a non-empty 'title' field.")
    abstract_text = (paper.get("abstract") or "").lower()
    haystacks = (raw_title.lower(), abstract_text)
    # All keywords are lower-cased up front, before any matching starts.
    needles = [keyword.lower() for keyword in topic_keywords]
    for needle in needles:
        for haystack in haystacks:
            if needle in haystack:
                return True
    return False

# Keyword list for privacy-related topics, in the List[str] shape that
# paper_matches_topic accepts as `topic_keywords`.
# NOTE(review): that matching is by substring, so short entries such as
# "secure" or "private" also hit longer words ("insecure", "privately") — confirm
# this breadth is intended.
PRIVACY_TOPICS = [
    "privacy", "private", "confidential", "anonymity", "anonymization",
    "data protection", "secure", "secrecy", "obfuscation", "de-identification"
]

async def get_neurips_abstract_links(year: int) -> List[str]:
    """
    Asynchronously collect the NeurIPS abstract-page URLs listed for a year.

    Args:
        year (int): The year of the NeurIPS conference (e.g., 2021).

    Returns:
        List[str]: Absolute URLs (``https://papers.nips.cc`` + href) of every
        anchor whose href contains ``-Abstract.html``, in page order.

    Raises:
        httpx.HTTPError: If the request fails or returns an error status.
    """
    # The index URL needs its trailing slash. httpx (unlike requests) does not
    # follow redirects unless asked, and raise_for_status() treats an unfollowed
    # 3xx answer as an error, so redirects are enabled explicitly.
    url = f"https://papers.nips.cc/paper/{year}/"
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return [
        "https://papers.nips.cc" + link["href"]
        for link in soup.find_all("a", href=True)
        if "-Abstract.html" in link["href"]
    ]

def parse_neurips_paper_from_html(url: str, html: str) -> Dict[str, Any]:
    """
    Parse title, authors, abstract and PDF link out of a NeurIPS abstract page.

    Elements are picked by position: first ``<h4>`` (title), last ``<i>``
    (authors), third ``<p>`` (abstract).

    Args:
        url (str): URL of the abstract page, stored as ``url_web``.
        html (str): HTML source of that page.

    Returns:
        Dict[str, Any]: ``title``, ``authors``, ``abstract``, ``url_web``, ``url_pdf``.

    Raises:
        ValueError: If an expected element is missing or the number of links
            ending in ``paper.pdf`` is not exactly one.
    """
    document = BeautifulSoup(html, "html.parser")
    try:
        heading = document.find_all("h4")[0].text
        byline = document.find_all("i")[-1].text
        summary = document.find_all("p")[2].text
    except Exception as e:
        raise ValueError(f"Error parsing metadata from {url}: {e}")

    anchors = document.find_all('a', href=True)
    pdf_candidates = [a['href'] for a in anchors if a['href'].lower().endswith('paper.pdf')]
    if len(pdf_candidates) != 1:
        raise ValueError(f"Found incorrect pdf url for {url}: {pdf_candidates}")

    return {
        "title": heading,
        "authors": byline,
        "abstract": summary,
        "url_web": url,
        "url_pdf": "https://papers.nips.cc" + pdf_candidates[0],
    }

async def fetch_single_paper(
    client: httpx.AsyncClient,
    url: str,
    keywords: Optional[List[str]]
) -> Optional[Dict[str, Any]]:
    """
    Download and parse one abstract page with the given AsyncClient.

    Any exception raised while fetching, parsing or keyword-matching is
    printed and results in ``None`` (best-effort behaviour).

    Args:
        client (httpx.AsyncClient): Client used for the GET request.
        url (str): Abstract page URL.
        keywords (Optional[List[str]]): When non-empty, keep the paper only if
            it matches at least one keyword.

    Returns:
        Optional[Dict[str, Any]]: Parsed metadata, or None if filtered out or failed.
    """
    try:
        reply = await client.get(url)
        reply.raise_for_status()
        record = parse_neurips_paper_from_html(url, reply.text)
        keep = (not keywords) or paper_matches_topic(record, keywords)
    except Exception as err:
        print(f"Error parsing {url}: {err}")
        return None
    return record if keep else None

async def fetch_neurips(
    year: int,
    max_papers: Optional[int] = None,
    keywords: Optional[List[str]] = None,
    max_concurrency: int = 16
) -> str:
    """
    Fetch NeurIPS papers' metadata for a year concurrently and append new entries to JSONL.

    Args:
        year (int): Year of the NeurIPS conference.
        max_papers (Optional[int]): Maximum number of index entries to consider.
            Considers all if None; 0 means "none".
        keywords (Optional[List[str]]): If provided, only save papers matching a keyword.
        max_concurrency (int): Upper bound on simultaneously running page
            fetches (values below 1 are treated as 1).

    Returns:
        str: File path to the JSONL file with saved paper info.

    Raises:
        httpx.HTTPError: If the proceedings index cannot be fetched. Failures
            on individual papers are printed and skipped.
    """
    conference = "neurips"
    filepath = os.path.join("paper_list", conference, f"{conference}{year}.jsonl")
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # 1) Load existing papers and build a set of seen URLs
    existing = load_paper_list(filepath)
    seen_urls = {paper.get("url_web") for paper in existing if "url_web" in paper}
    print(f">>> Already have {len(seen_urls)} papers saved.")

    # 2) Scrape all paper URLs
    paper_urls = await get_neurips_abstract_links(year)
    # Compare against None so that max_papers=0 is honoured instead of meaning "all".
    if max_papers is not None:
        paper_urls = paper_urls[:max_papers]
    print(f">>> Found {len(paper_urls)} candidate URLs.")

    # Do not spend a request on pages that are already saved.
    pending_urls = [url for url in paper_urls if url not in seen_urls]

    # Without a limit every page would be requested at once (thousands of
    # coroutines competing for one connection pool), which invites pool
    # timeouts and hammers the server.
    limiter = asyncio.Semaphore(max(1, max_concurrency))

    async def _fetch_limited(client: httpx.AsyncClient, url: str) -> Optional[Dict[str, Any]]:
        """Run fetch_single_paper while holding one concurrency slot."""
        async with limiter:
            return await fetch_single_paper(client, url, keywords)

    # 3) Fetch and filter only new papers
    new_papers = []
    # follow_redirects mirrors what the requests-based sync fetcher does by default.
    async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client:
        tasks = [_fetch_limited(client, url) for url in pending_urls]
        for coro in tqdm_asyncio.as_completed(tasks, total=len(tasks)):
            meta = await coro
            # Re-check: the index itself may list the same URL more than once.
            if meta and meta["url_web"] not in seen_urls:
                new_papers.append(meta)
                seen_urls.add(meta["url_web"])

    print(f">>> {len(new_papers)} new papers to save.")

    # 4) Save only the new ones
    with open(filepath, "a", encoding="utf-8") as f:
        for paper in new_papers:
            f.write(json.dumps(paper, ensure_ascii=False) + "\n")
    print(f">>> Appended {len(new_papers)} papers to {filepath}")

    return filepath


# Example usage:
# nest_asyncio patches the event loop so that nested/re-entrant use of asyncio
# is allowed inside the notebook kernel.
import nest_asyncio
nest_asyncio.apply()

import asyncio
# Top-level await: scrape every NeurIPS 2021 abstract (no keyword filter, no
# paper limit) and append the new ones to the JSONL file.
await fetch_neurips(2021)


>>> Found 2334 URLs for NeurIPS 2021


Parsing abstracts:   0%|          | 0/2334 [00:00<?, ?it/s]

In [None]:

def save_paper_info(paper_info: Dict[str, Any], conference: str, year: int) -> str:
    """
    Persist one paper as a JSON line in the file of its conference and year.

    Args:
        paper_info (Dict[str, Any]): JSON-serializable paper metadata (title, authors, etc.).
        conference (str): Conference name (e.g., "neurips"); lower-cased for the path.
        year (int): Conference year.

    Returns:
        str: Path ``paper_list/<conference>/<conference><year>.jsonl`` that was appended to.
    """
    lowered = conference.lower()
    path_parts = ("paper_list", lowered, f"{lowered}{year}.jsonl")
    filepath = os.path.join(*path_parts)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "a", encoding="utf-8") as jsonl_file:
        json_line = json.dumps(paper_info, ensure_ascii=False)
        jsonl_file.write(json_line + "\n")
    return filepath

def load_paper_list(filepath: str) -> List[Dict[str, Any]]:
    """
    Load a list of paper metadata from a JSONL file. Each non-blank line must be a JSON object.

    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped.

    Args:
        filepath (str): Path to the JSONL file.

    Returns:
        List[Dict[str, Any]]: List of paper metadata dictionaries. Returns empty list if file does not exist.
    """
    if not os.path.exists(filepath):
        return []
    with open(filepath, "r", encoding="utf-8") as f:
        # json.loads("") raises, so ignore lines that hold only whitespace.
        return [json.loads(line) for line in f if line.strip()]
    
# def get_paper_titles(paper_list: List[Dict[str, Any]]) -> List[str]:

    
def paper_matches_topic(paper: Dict[str, Any], topic_keywords: List[str]) -> bool:
    """
    Check a paper against a list of topic keywords (case-insensitive substring match).

    The title is mandatory; the abstract is optional and treated as empty when
    absent or None.

    Args:
        paper (Dict[str, Any]): Paper metadata. Must have 'title'. May have 'abstract'.
        topic_keywords (List[str]): Keywords or phrases for topic matching.

    Returns:
        bool: True if the title or abstract contains any keyword, False otherwise.

    Raises:
        ValueError: If 'title' is missing or empty.
    """
    raw_title = paper.get("title")
    title_is_blank = not raw_title or not raw_title.strip()
    if title_is_blank:
        raise ValueError("Paper dictionary must include a non-empty 'title' field.")
    abstract_text = (paper.get("abstract") or "").lower()
    title_text = raw_title.lower()
    # All keywords are lower-cased up front, before any matching starts.
    needles = [keyword.lower() for keyword in topic_keywords]

    def _hits(needle: str) -> bool:
        """True when the keyword occurs in the title or in the abstract."""
        return needle in title_text or needle in abstract_text

    return any(map(_hits, needles))

# Keyword list for privacy-related topics, in the List[str] shape that
# paper_matches_topic accepts as `topic_keywords`.
# NOTE(review): that matching is by substring, so short entries such as
# "secure" or "private" also hit longer words ("insecure", "privately") — confirm
# this breadth is intended.
PRIVACY_TOPICS = [
    "privacy", "private", "confidential", "anonymity", "anonymization",
    "data protection", "secure", "secrecy", "obfuscation", "de-identification"
]

In [3]:
from typing import Optional, List, Dict, Any
import os
import json
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from langchain.tools import BaseTool


class SavePaperInfoTool(BaseTool):
    """
    LangChain tool that appends one paper record to a conference/year JSONL file.
    """
    name: str = "save_paper_info"
    description: str = (
        "Save paper metadata (title, authors, abstract, urls) to a JSONL file under paper_list/<conference>/<conference><year>.jsonl."
    )

    def _run(
        self, paper_info: Dict[str, Any], conference: str, year: int
    ) -> str:
        """Append ``paper_info`` as one JSON line and return the file path."""
        conf_slug = conference.lower()
        filepath = os.path.join("paper_list", conf_slug, f"{conf_slug}{year}.jsonl")
        parent_dir = os.path.dirname(filepath)
        os.makedirs(parent_dir, exist_ok=True)
        with open(filepath, mode="a", encoding="utf-8") as sink:
            serialized = json.dumps(paper_info, ensure_ascii=False)
            sink.write(f"{serialized}\n")
        return filepath

    async def _arun(
        self, paper_info: Dict[str, Any], conference: str, year: int
    ) -> str:
        """Async entry point; delegates to the synchronous implementation."""
        saved_path = self._run(paper_info, conference, year)
        return saved_path


class LoadPaperListTool(BaseTool):
    """
    Tool that loads paper metadata list for a given conference and year.
    """
    name: str = "load_paper_list"
    description: str = (
        "Load paper metadata from the JSONL file saved by save_paper_info for a given conference and year."
    )

    def _run(self, conference: str, year: int) -> List[Dict[str, Any]]:
        """
        Read ``paper_list/<conference>/<conference><year>.jsonl`` (conference lower-cased).

        Returns:
            List[Dict[str, Any]]: Parsed records in file order; empty list if
            the file does not exist. Blank lines are skipped.
        """
        filepath = os.path.join(
            "paper_list", conference.lower(), f"{conference.lower()}{year}.jsonl"
        )
        if not os.path.exists(filepath):
            return []
        with open(filepath, "r", encoding="utf-8") as f:
            # json.loads("") raises, so ignore lines that hold only whitespace.
            return [json.loads(line) for line in f if line.strip()]

    async def _arun(self, conference: str, year: int) -> List[Dict[str, Any]]:
        """Async entry point; delegates to the synchronous implementation."""
        return self._run(conference, year)


class PaperMatchesTopicTool(BaseTool):
    """
    LangChain tool reporting whether a paper's title or abstract mentions any keyword.
    """
    name: str = "paper_matches_topic"
    description: str = (
        "Return True if any keyword appears in the paper's title or abstract (case-insensitive)."
    )

    def _run(self, paper: Dict[str, Any], topic_keywords: List[str]) -> bool:
        """
        Case-insensitive substring match of keywords against title and abstract.

        Raises:
            ValueError: If the paper has no non-empty 'title'.
        """
        raw_title = paper.get("title")
        if not (raw_title and raw_title.strip()):
            raise ValueError("Paper dictionary must include a non-empty 'title' field.")
        abstract_text = (paper.get("abstract") or "").lower()
        title_text = raw_title.lower()
        # Lazy scan: keywords are lower-cased one by one and the first hit wins.
        return any(
            needle in title_text or needle in abstract_text
            for needle in (keyword.lower() for keyword in topic_keywords)
        )

    async def _arun(self, paper: Dict[str, Any], topic_keywords: List[str]) -> bool:
        """Async entry point; delegates to the synchronous implementation."""
        matched = self._run(paper, topic_keywords)
        return matched


class GetNeuripsAbstractLinksTool(BaseTool):
    """
    LangChain tool listing the NeurIPS abstract-page URLs of one conference year.
    """
    name: str = "get_neurips_abstract_links"
    description: str = (
        "Fetch the list of NeurIPS paper abstract URLs for a given conference year."
    )

    def _run(self, year: int) -> List[str]:
        """
        Download the proceedings index of ``year`` and return absolute URLs of
        all anchors whose href ends with ``-Abstract.html``.

        Raises:
            requests.RequestException: If the request fails or returns an error status.
        """
        index_url = f"https://papers.nips.cc/paper/{year}/"
        page = requests.get(index_url, timeout=20)
        page.raise_for_status()
        parsed = BeautifulSoup(page.text, "html.parser")

        abstract_urls: List[str] = []
        for anchor in parsed.find_all("a", href=True):
            href = anchor["href"]
            if href.endswith("-Abstract.html"):
                abstract_urls.append("https://papers.nips.cc" + href)
        return abstract_urls

    async def _arun(self, year: int) -> List[str]:
        """Async entry point; delegates to the synchronous implementation."""
        urls = self._run(year)
        return urls


class ParseNeuripsPaperTool(BaseTool):
    """
    Tool that parses metadata from a NeurIPS paper abstract page HTML.
    """
    name: str = "parse_neurips_paper"
    description: str = (
        "Parse a NeurIPS abstract page HTML and extract title, authors, abstract, web URL, and PDF URL."
    )

    def _run(self, url: str, html: str) -> Dict[str, Any]:
        """
        Extract title, authors, abstract and PDF link from one abstract page.

        Elements are picked by position: first ``<h4>`` (title), last ``<i>``
        (authors), third ``<p>`` (abstract).

        Args:
            url (str): URL of the abstract page, stored as ``url_web``.
            html (str): HTML source of that page.

        Returns:
            Dict[str, Any]: ``title``, ``authors``, ``abstract``, ``url_web``, ``url_pdf``.

        Raises:
            ValueError: If an expected element is missing or the number of
                links ending in ``paper.pdf`` is not exactly one.
        """
        soup = BeautifulSoup(html, "html.parser")
        try:
            title = soup.find_all("h4")[0].text
            authors = soup.find_all("i")[-1].text
            abstract = soup.find_all("p")[2].text
        except Exception as e:
            raise ValueError(f"Error parsing metadata from {url}: {e}")

        info: Dict[str, Any] = {
            "title": title,
            "authors": authors,
            "abstract": abstract,
            "url_web": url
        }
        # str has `endswith` (all lower-case); the camel-case spelling raised
        # AttributeError on every page that contained a link.
        pdf_links = [
            tag["href"] for tag in soup.find_all("a", href=True)
            if tag["href"].lower().endswith("paper.pdf")
        ]
        if len(pdf_links) != 1:
            raise ValueError(f"Found incorrect pdf url for {url}: {pdf_links}")
        info["url_pdf"] = "https://papers.nips.cc" + pdf_links[0]
        return info

    async def _arun(self, url: str, html: str) -> Dict[str, Any]:
        """Async entry point; delegates to the synchronous implementation."""
        return self._run(url, html)


class FetchSinglePaperSyncTool(BaseTool):
    """
    Tool that fetches and parses a single NeurIPS paper using an HTTP session, filtering by keywords.
    """
    name: str = "fetch_single_paper_sync"
    description: str = (
        "Fetch a NeurIPS paper abstract page synchronously and return metadata if it matches keywords."
    )

    def _run(
        self, client: requests.Session, url: str, keywords: Optional[List[str]] = None
    ) -> Optional[Dict[str, Any]]:
        """
        Download one abstract page, parse it, and apply the optional keyword filter.

        Best-effort: any exception is printed and turned into ``None``.

        Args:
            client (requests.Session): Session used for the GET request.
                NOTE(review): a live session object cannot be produced by an
                LLM as tool input — confirm this tool is only called from code.
            url (str): Abstract page URL.
            keywords (Optional[List[str]]): When non-empty, keep the paper only
                if it matches at least one keyword.

        Returns:
            Optional[Dict[str, Any]]: Parsed metadata, or None if filtered out or failed.
        """
        try:
            response = client.get(url, timeout=20)
            response.raise_for_status()
            # Call the helpers' `_run` directly: `BaseTool.run` takes ONE
            # tool_input (str or dict), so `run(url, html)` passed the HTML as
            # the `verbose` flag and always ended in the except branch below.
            meta = ParseNeuripsPaperTool()._run(url, response.text)
            if not keywords or PaperMatchesTopicTool()._run(meta, keywords):
                return meta
        except Exception as e:
            print(f"Error parsing {url}: {e}")
        return None

    async def _arun(
        self, client: requests.Session, url: str, keywords: Optional[List[str]] = None
    ) -> Optional[Dict[str, Any]]:
        """Async entry point; delegates to the synchronous implementation."""
        return self._run(client, url, keywords)


class FetchNeuripsSyncTool(BaseTool):
    """
    Tool that fetches and saves NeurIPS papers for a given year, with optional filtering and deduplication.
    """
    name: str = "fetch_neurips_sync"
    description: str = (
        "Fetch all NeurIPS abstracts for a year, filter by keywords, dedupe existing, and save new to JSONL."
    )

    def _run(
        self, year: int, max_papers: Optional[int] = None, keywords: Optional[List[str]] = None
    ) -> str:
        """
        Scrape the NeurIPS index of ``year`` and append not-yet-saved papers to JSONL.

        Args:
            year (int): Year of the NeurIPS conference.
            max_papers (Optional[int]): Maximum number of index entries to
                consider. Considers all if None; 0 means "none".
            keywords (Optional[List[str]]): If provided, only save papers matching a keyword.

        Returns:
            str: Path of ``paper_list/neurips/neurips<year>.jsonl``.
        """
        conference = "neurips"
        filepath = os.path.join(
            "paper_list", conference, f"{conference}{year}.jsonl"
        )
        os.makedirs(os.path.dirname(filepath), exist_ok=True)

        # The helper tools are invoked through `_run`: `BaseTool.run` accepts a
        # single tool_input (str or dict), so positional multi-argument calls
        # such as `run(conference, year)` do not reach `_run` as intended.
        existing = LoadPaperListTool()._run(conference, year)
        seen_urls = {p["url_web"] for p in existing if "url_web" in p}

        paper_urls = GetNeuripsAbstractLinksTool()._run(year)
        # Compare against None so that max_papers=0 is honoured instead of meaning "all".
        if max_papers is not None:
            paper_urls = paper_urls[:max_papers]

        # Skip pages that are already on disk before spending a request on them.
        pending_urls = [url for url in paper_urls if url not in seen_urls]

        new_papers: List[Dict[str, Any]] = []
        fetcher = FetchSinglePaperSyncTool()
        with requests.Session() as client:
            for url in tqdm(pending_urls, desc=f"Parsing NeurIPS {year} abstracts"):
                meta = fetcher._run(client, url, keywords)
                # Re-check: the index itself may list the same URL more than once.
                if meta and meta["url_web"] not in seen_urls:
                    new_papers.append(meta)
                    seen_urls.add(meta["url_web"])

        with open(filepath, "a", encoding="utf-8") as f:
            for paper in new_papers:
                f.write(json.dumps(paper, ensure_ascii=False) + "\n")

        return filepath

    async def _arun(
        self, year: int, max_papers: Optional[int] = None, keywords: Optional[List[str]] = None
    ) -> str:
        """Async entry point; delegates to the synchronous implementation."""
        return self._run(year, max_papers, keywords)

# Instantiate the tools list for use
# One instance of every tool class defined in this cell, in definition order.
# NOTE(review): the agent cell below imports `toolkit` from
# research_trend_analyzer.tools.paper_fetch_tools rather than using this
# list — confirm which of the two is the intended tool set.
langchain_tools = [
    SavePaperInfoTool(),
    LoadPaperListTool(),
    PaperMatchesTopicTool(),
    GetNeuripsAbstractLinksTool(),
    ParseNeuripsPaperTool(),
    FetchSinglePaperSyncTool(),
    FetchNeuripsSyncTool(),
]


In [1]:
from langchain.agents import initialize_agent, AgentType
from configs import config
from configs.llm_provider import get_llm
from research_trend_analyzer.tools.paper_fetch_tools import toolkit

# Initialize the LLM (adjust parameters as needed)
# Select the Gemini provider on the shared config object before building the LLM.
config.LLM_PROVIDER='gemini'
llm = get_llm(config)

# Create the agent with the tools
# Structured-chat ReAct agent over the imported `toolkit`; verbose=True prints
# the intermediate Action/Observation steps.
# NOTE(review): the captured output shows a warning pointing at this call —
# presumably LangChain's deprecation of initialize_agent; confirm and migrate.
paper_fetch_agent = initialize_agent(
    tools=toolkit,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Authentication error: Langfuse client initialized without public_key. Client will be disabled. Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. 
  paper_fetch_agent = initialize_agent(


In [2]:
# Ask the agent a free-form question and print its final answer.
# NOTE(review): the captured output shows a warning pointing at this `.run`
# call — presumably a deprecation in favour of `.invoke`; confirm.
result = paper_fetch_agent.run(
    "find privacy related papers from NeurIPS 2020."
)
print(result)

  result = paper_fetch_agent.run(




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```json
{
  "action": "generate_keyword_list_given_topic",
  "action_input": {
    "topic": "privacy"
  }
}
```[0m
Observation: [36;1m[1;3m{'error': 'Could not parse LLM output: invalid syntax (<unknown>, line 1)', 'raw_response': '```python\n{"topic": "privacy", "keywords": ["privacy", "private", "anonymity", "anonymization", "data protection", "GDPR", "differential privacy", "federated learning", "privacy-preserving machine learning", "secure multi-party computation", "homomorphic encryption", "data minimization", "consent", "confidentiality", "personal data", "privacy-enhancing technologies", "re-identification", "surveillance", "digital rights"]}\n```', 'topic': 'privacy'}[0m
Thought:[32;1m[1;3mAction:
```json
{
  "action": "filter_paper_by_topic",
  "action_input": {
    "conference": "NeurIPS",
    "year": 2020,
    "topic_keywords": {
      "topic": "privacy",
      "keywords": [
        "privacy",
      

In [31]:
import requests
from bs4 import BeautifulSoup

# Download and parse the dblp table-of-contents page of the "uss" venue
# (presumably USENIX Security — confirm) for the chosen year.
year = 2024
# Interpolate `year` so that changing it above actually changes the page fetched
# (the URL used to hard-code 2024 inside a placeholder-less f-string).
url = f"https://dblp.org/db/conf/uss/uss{year}.html"
response = requests.get(url, timeout=20)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

## Build Paper Summarizer Agent

In [None]:
# Prompt template for the paper summarizer. The only template variable is
# `{context}` (the full text of the paper). The example-output code fence is
# closed before the "Given Paper" section so the paper text and the
# "## Summary" cue are not read as part of the example block.
prompt_content = """
# Paper Summary Instruction

You are a scientific assistant that reads academic papers and provides structured, in-depth summaries.  
Given the text of the following research paper, summarize it in clear, concise language, focusing on the following aspects (use bullet points or numbered lists for clarity):

### 0. Key Words
- Provide 5 key technology terms that best describe the paper
- Keywords should be nouns, from field of AI / NLP / trustworthiness (privacy, safety, fairness, hallucination, etc.) / etc. 

### 1. Motivation (Research Background)
- Briefly describe the background and motivation for the research.
- What is the main problem or challenge being addressed?

### 2. State-of-the-Art Methods and Their Limitations
- Summarize the current state-of-the-art approaches related to this problem.
- What are the key limitations or shortcomings of existing methods that the paper aims to overcome?

### 3. Proposed Method (Main Contribution, Main Idea, Highlights, and Novelty)
- Clearly state the main contribution(s) of the paper.
- Describe the core idea and highlights of the proposed method.
- Emphasize what is novel or unique about the approach.

### 4. Experiment Results
- Summarize the experimental setup, including datasets, metrics, and baselines.
- What were the main results and findings? How does the proposed method compare to baselines?

### 5. Limitation and Future Work
- Point out any limitations or open questions discussed in the paper.
- Summarize suggested directions for future research.

---

## Instructions
- Present the summary in well-organized sections corresponding to the points above.
- Avoid copying text directly from the paper; paraphrase and synthesize the information.
- Keep the language accessible to someone with a technical background but who may not be an expert in the specific subfield.

---

## Example Output Structure

```text
0. Key word (5 technology terms best describe the paper)
   - ...

1. Motivation (Research Background):
   - ...

2. State-of-the-Art Methods and Their Limitations:
   - ...

3. Proposed Method (Main Contribution, Main Idea, Highlights, and Novelty):
   - ...

4. Experiment Results:
   - ...

5. Limitation and Future Work:
   - ...
```

## Given Paper
{context}

## Summary
"""

In [2]:
from langchain.document_loaders import PyMuPDFLoader

# arXiv PDF URLs used as sample inputs for the summarizer.
extending_context_window_llama_3 = "https://arxiv.org/pdf/2404.19553"
attention_is_all_you_need = "https://arxiv.org/pdf/1706.03762"  # not used in the cells shown

# Load the first paper straight from its URL; `docs` is the list of documents
# returned by the loader (the captured output further down shows per-page
# metadata such as 'page' and 'total_pages').
docs = PyMuPDFLoader(extending_context_window_llama_3).load()

In [7]:
# Concatenate the text of every loaded document into a single string.
# str.join builds the result in one pass instead of re-copying the growing
# string on each `+=`, and leaves no loop variable behind in the notebook namespace.
content = ''.join(doc.page_content for doc in docs)

In [3]:
from langchain.agents import initialize_agent, AgentType
from configs import config
from configs.llm_provider import get_llm
from tools.paper_fetch_tools import paper_fetch_toolkit

# Initialize the LLM (adjust parameters as needed)
# Select the Gemini provider on the shared config object, then build the LLM
# that the summarization chain below pipes the prompt into.
config.LLM_PROVIDER='gemini'
llm = get_llm(config)

Authentication error: Langfuse client initialized without public_key. Client will be disabled. Provide a public_key parameter or set LANGFUSE_PUBLIC_KEY environment variable. 


In [4]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
# Build the summarization pipeline: prompt template -> LLM -> plain string.
# Requires `prompt_content` (defined in the prompt cell above) and `llm` to
# exist; the captured NameError below comes from running this cell before the
# prompt cell.
prompt = ChatPromptTemplate.from_template(prompt_content)
output_parser = StrOutputParser()
chain = prompt | llm | output_parser

NameError: name 'prompt_content' is not defined

In [15]:
# The template's only placeholder is `{context}`, so the paper text must be
# passed under that key (the previous key "paper_content" left it unfilled).
output = chain.invoke({"context": content})

In [16]:
print(output)

Here's a summary of the provided research paper:

### 1. Motivation (Research Background)
The rapid advancements in large language models (LLMs) have led to increased interest in extending their context windows to handle longer inputs. While various methods have been proposed to achieve long-context capabilities, a significant challenge remains: most existing approaches demand substantial computational resources and time, making them inaccessible or inefficient for many researchers and practitioners. The paper aims to address this by developing an efficient and resource-friendly solution for extending LLM context lengths.

### 2. State-of-the-Art Methods and Their Limitations
The paper acknowledges that several approaches have been developed to enable long-context capabilities in LLMs. However, it highlights a critical limitation: these existing methods generally "require significant compute and resources to accomplish." This implies that they are often computationally expensive, time-

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks: at most 200 characters
# per chunk with 50 characters shared between neighbours.
# NOTE(review): 200 characters is very small for a paper — confirm this size is
# intended and not a debugging value.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 50,
)

# Each resulting chunk is a Document that keeps the source page's metadata
# (see the output of the next cell).
split_chunks = text_splitter.split_documents(docs)

In [3]:
split_chunks

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-05-01T00:39:31+00:00', 'source': 'https://arxiv.org/pdf/2404.19553', 'file_path': 'https://arxiv.org/pdf/2404.19553', 'total_pages': 5, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-05-01T00:39:31+00:00', 'trapped': '', 'modDate': 'D:20240501003931Z', 'creationDate': 'D:20240501003931Z', 'page': 0}, page_content='Extending Llama-3’s Context Ten-Fold Overnight\nPeitian Zhang1,2, Ninglu Shao1,2, Zheng Liu1∗, Shitao Xiao1, Hongjin Qian1,2,\nQiwei Ye1, Zhicheng Dou2\n1 Beijing Academy of Artificial Intelligence'),
 Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-05-01T00:39:31+00:00', 'source': 'https://arxiv.org/pdf/2404.19553', 'file_path': 'https://arxiv.org/pdf/2404.19553', 'total_pages': 5, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddat

In [None]:
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
# Embedding function backed by Google's Gemini embedding model.
# NOTE(review): some langchain-google-genai versions expect the "models/"
# prefix ("models/gemini-embedding-001") — confirm against the installed version.
embedding_fn  = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Build an in-memory FAISS index from the first two loaded documents only
# (a small sample of the paper).
vectorstore = FAISS.from_documents(
    documents=docs[:2],
    embedding=embedding_fn,
)
# The FAISS store has no `persist()` method (that is the Chroma API);
# `save_local` writes the index and docstore into the given folder.
vectorstore.save_local("./faiss_index")

: 

In [28]:
from uuid import uuid4

# Generate one random UUID string per loaded document and add the documents
# to the vector store under those ids.
uuids = [str(uuid4()) for _ in range(len(docs))]
# NOTE(review): `vs` is not defined in any cell shown here (the store created
# above is named `vectorstore`) — it presumably survives from a deleted cell
# and will raise NameError on a fresh kernel; confirm which store is meant.
vs.add_documents(documents=docs, ids=uuids)




: 

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

# Local HuggingFace embedding model (BAAI/bge-m3), configured to run on the CPU.
model_name = "BAAI/bge-m3"
embedding = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs={"device": "cpu",  # can be changed to "cuda" to run on the GPU
                              "trust_remote_code": True},  # NOTE(review): allows code shipped with the model repo to execute — confirm it is needed
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
F