### Setup for Colab

In [1]:
# Mount Google Drive at /content/drive (Colab only; the import fails outside Colab).
from google.colab import drive
drive.mount('/content/drive')


KeyboardInterrupt



In [2]:
cd /content/drive/MyDrive/Projects/kubig19th-conference-llm/ym

/content/drive/MyDrive/Projects/kubig19th-conference-llm/ym


In [25]:
!pip install python-dotenv semanticscholar langchain langchain_openai langchain_core langchainhub langchain-community

Successfully installed dataclasses-json-0.6.7 langchain-community-0.2.6 marshmallow-3.21.3 mypy-extensions-1.0.0 typing-inspect-0.9.0


In [None]:
!pip install feedparser PyPDF2 beautifulsoup4 requests scikit-learn

### Import Modules

In [1]:
import warnings
warnings.filterwarnings('ignore')
import torch
import os
from pprint import pprint
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from dotenv import load_dotenv

from langchain.pydantic_v1 import BaseModel, Field
from langchain.tools import BaseTool, StructuredTool, tool

from langchain_core.utils.function_calling import convert_to_openai_function
from langchain_openai import ChatOpenAI

from langchain import hub
from langchain.agents import AgentExecutor, create_openai_tools_agent

In [2]:
import semantic_scholoar_api as ss


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /c1/yumin/.cache/huggingface/token
Login successful


In [3]:
from getpaper import GetPaper

In [4]:
# Read API keys from a local .env file instead of hardcoding them in the notebook.
dotenv_path = '.env'
load_dotenv(dotenv_path)
# os.getenv returns None when a variable is not set, so either key may be None here.
openai_api = os.getenv("OPENAI_API_KEY")
ss_api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

### Define Tools

In [5]:
# Paper loader whose `load_paper` method backs the `loadpaper` tool below.
# NOTE(review): `path_db` and `page_limit` presumably control the PDF download
# folder and the PDF page cap -- confirm against getpaper.py.
getpapermodule = GetPaper(ss_api_key, ar5iv_mode = True, path_db = './papers_db', page_limit = 5)

In [6]:
from typing import List, Optional

# Argument schema for the `loadpaper` tool. (Documented with comments rather
# than a docstring so the generated schema is not given an extra description.)
class load_paper_input(BaseModel):
    title: str = Field(description="target paper title")
    # Optional[List[str]] matches the None default and tells the model that each
    # entry is a section title; a bare `list` leaves the item type unspecified.
    sections: Optional[List[str]] = Field(description='list of sections', default = None)

# Two-step tool: call without `sections` to get the section outline, then call
# again with the chosen section titles to get their text.
loadpaper = StructuredTool.from_function(
    func=getpapermodule.load_paper,
    name="loadpaper",
    description="The `loadPaper` tool is designed to facilitate the process of retrieving and reading academic papers based on a given search title. \
    The `title` parameter is a string representing the title of the paper. The 'sections' parameter is a list representing the list of the sections in the paper. \
    If the sections parameter is none, you can get the section list of the paper. If the sections parameter get the section list, you can load the paper's content. \
    Use this tool several times to get the section first and then get the detail content of each section",
    args_schema=load_paper_input
)

In [7]:
# Argument schema for the `recommend_reference` tool: a single paper title.
class recommend_reference_input(BaseModel):
    query: str = Field(description="target paper title")

# Exposes ss.reference_recommend to the agent; the description below is the text
# the LLM sees when deciding whether to call this tool.
recommend_reference = StructuredTool.from_function(
    func=ss.reference_recommend,
    name="recommend_reference",
    description="The reference_recommend function recommends relevant academic papers based on a given query, focusing on papers that the target paper's references. This tool is ideal for researchers and academics looking to find related literature that has been directly cited by the target paper.",
    args_schema=recommend_reference_input
)

In [8]:
# Argument schema for the `recommend_citation` tool: a single paper title.
class citation_recommend_input(BaseModel):
    query: str = Field(description="target paper title")

# Exposes ss.citation_recommend to the agent; the description below is the text
# the LLM sees when deciding whether to call this tool.
recommend_citation = StructuredTool.from_function(
    func=ss.citation_recommend,
    name="recommend_citation",
    description="The recommend_citation function identifies and recommends subsequent papers **that have cited a given target paper**, providing valuable insights into the evolution and impact of the research. This tool helps researchers discover influential follow-up studies and stay updated with the latest developments in their field.",
    args_schema=citation_recommend_input
)

### Make Agent with GPT3.5

In [9]:
# NOTE(review): the agent below is built from `llm`, not `model`, and `model` is
# reassigned in the Llama section -- this object looks unused; confirm before removing.
model = ChatOpenAI(model="gpt-3.5-turbo")

In [10]:
# Tools the agent may call: paper loading plus reference/citation recommendation.
tools = [loadpaper, recommend_reference, recommend_citation]
# Debug helper: inspect the OpenAI function schema generated for each tool.
# functions = [convert_to_openai_function(t) for t in tools]
# functions[1]

In [11]:
# Load the agent prompt published on LangChain Hub.
# TODO: write a better prompt for our agent.
prompt = hub.pull("hwchase17/openai-tools-agent")

In [12]:
# Choose the LLM that will drive the agent.
# Only certain models support this (OpenAI tool calling).
# temperature=0 is set explicitly here.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# Construct the OpenAI Tools agent from the LLM, the tool list and the hub prompt.
agent = create_openai_tools_agent(llm, tools, prompt)
# Create an agent executor by passing in the agent and tools;
# verbose=True prints each tool invocation, as seen in the outputs below.
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

In [23]:
# If you want the model to look at a specific page or passage of the paper,
# mention it explicitly in the prompt.

# Ask about one specific subsection ("Filtering API Calls") of the Toolformer paper.
output = agent_executor.invoke({"input": "explain about the math expression about the Filtering api calls in the paper 'ToolFormer : Language Models Can Teach Themselves to Use Tools'"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `loadpaper` with `{'title': 'ToolFormer : Language Models Can Teach Themselves to Use Tools'}`


[0m[36;1m[1;3mHere is the title and section of the paper
title
Toolformer: Language Models Can Teach Themselves to Use Tools
sections
          Abstract
  1 Introduction
  2 Approach
        Sampling API Calls
        Executing API Calls
        Filtering API Calls
        Model Finetuning
        Inference
  3 Tools
        Question Answering
        Calculator
        Wikipedia Search
        Machine Translation System
        Calendar
  4 Experiments
    4.1 Experimental Setup
        Dataset Generation
        Model Finetuning
        Baseline Models
    4.2 Downstream Tasks
      4.2.1 LAMA
      4.2.2 Math Datasets
      4.2.3 Question Answering
      4.2.4 Multilingual Question Answering
      4.2.5 Temporal Datasets
    4.3 Language Modeling
    4.4 Scaling Laws
  5 Analysis
        Decoding Strategy
        

In [36]:
pprint(output['output'])

('The paper "Language Models Can Teach Themselves to Use Tools" discusses the '
 'filtering of API calls in the context of language models. Here is an '
 'explanation of the math expression related to filtering API calls as '
 'presented in the paper:\n'
 '\n'
 'The paper introduces a weighted cross-entropy loss function for a language '
 'model M over a sequence of tokens xi,…,xn, where i represents the position '
 'of the API call ci in the sequence and ri is the response from the API. The '
 'loss function is defined as:\n'
 '\n'
 'Li(𝐳) = -∑j=iⁿ wj-i * log pM(xj∣𝐳,x1:j-1)\n'
 '\n'
 'In this expression:\n'
 '- Li(𝐳) is the weighted cross-entropy loss for model M over the tokens if '
 'the model is prefixed with 𝐳.\n'
 '- wj-i represents a sequence of weights.\n'
 '- pM(xj∣𝐳,x1:j-1) is the conditional probability of token xj given the '
 'prefix 𝐳 and the tokens x1:j-1.\n'
 '\n'
 'The paper compares two instantiations of this loss:\n'
 '1. Li+ = Li(e(ci,ri)): Weighted loss over all t

In [37]:
# Korean prompt, roughly: "Recommend follow-up papers similar to 'Language Models
# Can Teach Themselves to Use Tools' and tell me how each one's abstract differs
# from the paper I gave."
output = agent_executor.invoke({"input": "논문 'Language Models Can Teach Themselves to Use Tools'와 비슷한 후속 논문을 추천해주고, 해당 논문의 abstract가 내가 준 논문과 어떤 차이가 있는지 알려줘"})
# TODO: the tool result does not need to include paperId.



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `recommend_citation` with `{'query': 'Language Models Can Teach Themselves to Use Tools'}`


[0m[38;5;200m[1;3m[{'paperId': '7d8905a1fd288068f12c8347caeabefd36d0dd6c', 'title': 'Gorilla: Large Language Model Connected with Massive APIs', 'abstract': "Large Language Models (LLMs) have seen an impressive wave of advances recently, with models now excelling in a variety of tasks, such as mathematical reasoning and program synthesis. However, their potential to effectively use tools via API calls remains unfulfilled. This is a challenging task even for today's state-of-the-art LLMs such as GPT-4, largely due to their inability to generate accurate input arguments and their tendency to hallucinate the wrong usage of an API call. We release Gorilla, a finetuned LLaMA-based model that surpasses the performance of GPT-4 on writing API calls. When combined with a document retriever, Gorilla demonstrates a strong capabili

In [None]:
pprint(output['output'])

('Here are some recommended papers related to "Language Models Can Teach '
 'Themselves to Use Tools":\n'
 '\n'
 '1. **Gorilla: Large Language Model Connected with Massive APIs**\n'
 '   - Abstract: Large Language Models (LLMs) have seen significant '
 'advancements recently, excelling in various tasks. Gorilla, a finetuned '
 'LLaMA-based model, surpasses the performance of GPT-4 in writing API calls. '
 'It demonstrates a strong capability to adapt to test-time document changes '
 'and mitigate hallucination issues.\n'
 '   - Publication Date: May 24, 2023\n'
 '\n'
 '2. **Mind2Web: Towards a Generalist Agent for the Web**\n'
 '   - Abstract: Introduces Mind2Web, a dataset for developing generalist '
 'agents for the web. It provides diverse tasks from real-world websites and '
 'explores using large language models for building generalist web agents.\n'
 '   - Publication Date: June 9, 2023\n'
 '\n'
 '3. **ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world '
 'AP

In [30]:
# TODO: change the arXiv search engine to Semantic Scholar.
# TODO: add error handling for failures inside tool calls.
# The commented-out query below does not work; the only difference from the
# working query is the 'Gorilla: ' prefix in the title.
# output = agent_executor.invoke({"input": "summary the abstract of the paper 'Gorilla: Large Language Model Connected with Massive APIs'"})

output = agent_executor.invoke({"input": "summary the abstract of the paper 'Large Language Model Connected with Massive APIs'"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `loadpaper` with `{'title': 'Large Language Model Connected with Massive APIs'}`


[0m[36;1m[1;3mGorilla: Large Language Model Connected with Massive APIs
          Abstract
  1 Introduction
  2 Related Work
        Large Language Models
        Tool Usage
        LLMs for Program Synthesis
  3 Methodology
    3.1 Dataset Collection
        API Documentation
        Instruction Generation
    3.2 Gorilla
        API Call with Constraints
        Retriever-Aware training
        Gorilla Inference
    3.3 Verifying APIs
        AST Sub-Tree Matching
  4 Evaluation
        Baselines
        Retrievers
    4.1 AST Accuracy on API call
        Finetuning without Retrieval
        Finetuning with Retrieval
        Hallucination with LLM
    4.2 Test-Time Documentation Change
    4.3 API Call with Constraints
  5 Conclusion
  6 Limitations & Social Impacts
  7 Acknowledgement
  References
  8 Appendix
    8.1 Dataset D

In [None]:
pprint(output['output'])

('The paper titled "Large Language Model Connected with Massive APIs" '
 'introduces Gorilla, a finetuned LLaMA-based model that surpasses the '
 'performance of GPT-4 in writing API calls. Gorilla, when combined with a '
 'document retriever, demonstrates the ability to adapt to test-time document '
 'changes, reducing hallucination errors commonly encountered with LLMs. The '
 'paper introduces APIBench, a dataset consisting of HuggingFace, TorchHub, '
 "and TensorHub APIs, to evaluate the model's ability accurately. Gorilla "
 'significantly outperforms GPT-4 in terms of API functionality accuracy and '
 'reduces hallucination errors. The paper emphasizes the importance of '
 'empowering LLMs to use tools via API calls to access vast knowledge bases '
 'and accomplish complex computational tasks effectively.')


In [None]:
# Ask for papers cited by the target paper; the run below calls `recommend_reference`.
output = agent_executor.invoke({"input": "recommend the reference works of the paper 'Language Models Can Teach Themselves to Use Tools'"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `recommend_reference` with `{'query': 'Language Models Can Teach Themselves to Use Tools'}`


[0m[33;1m[1;3m[{'paperId': '90abbc2cf38462b954ae1b772fac9532e2ccd8b0', 'title': 'Language Models are Few-Shot Learners', 'abstract': "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. S

In [None]:
pprint(output['output'])

('Here are some recommended reference works related to the paper "Language '
 'Models Can Teach Themselves to Use Tools":\n'
 '\n'
 '1. **Paper Title:** Language Models are Few-Shot Learners\n'
 '   - **Abstract:** Recent work has demonstrated substantial gains on many '
 'NLP tasks and benchmarks by pre-training on a large corpus of text followed '
 'by fine-tuning on a specific task. This paper shows that scaling up language '
 'models greatly improves task-agnostic, few-shot performance, sometimes even '
 'reaching competitiveness with prior state-of-the-art fine-tuning '
 'approaches.\n'
 '   - **Publication Date:** 2020-05-28\n'
 '\n'
 '2. **Paper Title:** PaLM: Scaling Language Modeling with Pathways\n'
 '   - **Abstract:** This paper explores the impact of scale on few-shot '
 'learning by training a 540-billion parameter, densely activated, Transformer '
 'language model called Pathways Language Model PaLM. It achieves breakthrough '
 'performance on language understanding and 

### Memory in Agent

In [13]:
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent
from langchain.memory import ChatMessageHistory

# Alternative ReAct prompt, kept for reference.
# prompt = hub.pull("hwchase17/react")
# NOTE(review): `prompt` is re-pulled here, but the history wrapper below reuses
# the `agent_executor` built earlier, so this assignment does not change the agent.
prompt = hub.pull("hwchase17/openai-tools-agent")
# Chat history object that will be handed to the history wrapper below.
memory = ChatMessageHistory(session_id="test-session")

In [14]:
memory

InMemoryChatMessageHistory(messages=[])

In [15]:
from langchain_core.runnables.history import RunnableWithMessageHistory
# NOTE(review): `OpenAI` is not referenced in the visible cells -- confirm whether this import is needed.
from langchain_openai import OpenAI

# Wrap the executor so earlier turns are supplied under the "chat_history" key
# and each new exchange is recorded.
agent_with_chat_history = RunnableWithMessageHistory(
    agent_executor,
    # This is needed because in most real world scenarios, a session id is needed
    # It isn't really used here because we are using a simple in memory ChatMessageHistory
    # (the lambda ignores session_id, so every session shares the same `memory`).
    lambda session_id: memory,
    input_messages_key="input",
    history_messages_key="chat_history",
)

In [16]:
# First turn of the conversation; `chat_history` is still empty in the result below.
agent_with_chat_history.invoke(
    {"input": "explain about the math expression about the Filtering api calls in the paper 'ToolFormer : Language Models Can Teach Themselves to Use Tools'"},
    config={"configurable": {"session_id": "<test_session_id>"}},
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `loadpaper` with `{'title': 'ToolFormer : Language Models Can Teach Themselves to Use Tools'}`


[0m[36;1m[1;3mHere is the title and section of the paper
title
Toolformer: Language Models Can Teach Themselves to Use Tools
sections
          Abstract
  1 Introduction
  2 Approach
        Sampling API Calls
        Executing API Calls
        Filtering API Calls
        Model Finetuning
        Inference
  3 Tools
        Question Answering
        Calculator
        Wikipedia Search
        Machine Translation System
        Calendar
  4 Experiments
    4.1 Experimental Setup
        Dataset Generation
        Model Finetuning
        Baseline Models
    4.2 Downstream Tasks
      4.2.1 LAMA
      4.2.2 Math Datasets
      4.2.3 Question Answering
      4.2.4 Multilingual Question Answering
      4.2.5 Temporal Datasets
    4.3 Language Modeling
    4.4 Scaling Laws
  5 Analysis
        Decoding Strategy
        

{'input': "explain about the math expression about the Filtering api calls in the paper 'ToolFormer : Language Models Can Teach Themselves to Use Tools'",
 'chat_history': [],
 'output': 'The paper "ToolFormer: Language Models Can Teach Themselves to Use Tools" discusses the filtering of API calls in the context of language models. Here is an explanation of the math expression related to filtering API calls as presented in the paper:\n\nThe paper introduces a weighted cross-entropy loss function for a model M over a sequence of tokens xi,...,xn, where i represents the position of the API call ci in the sequence and ri represents the response from the API. The loss function is defined as:\n\nLi(z) = -∑j=i^n wj-i * log pM(xj | z, x1:j-1)\n\nIn this expression:\n- z represents the prefix for the model M.\n- wj-i is a sequence of weights.\n- pM(xj | z, x1:j-1) is the conditional probability of token xj given the prefix z and the tokens x1 to xj-1.\n\nThe paper compares two variations of th

In [19]:
# Follow-up turn that names no paper; the agent recovers the title from the
# stored chat history (see the `loadpaper` call in the output below).
agent_with_chat_history.invoke(
    {"input": "explain about the Method"},
    config={"configurable": {"session_id": "<test_session_id>"}},
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `loadpaper` with `{'title': 'ToolFormer : Language Models Can Teach Themselves to Use Tools'}`


[0m[36;1m[1;3mHere is the title and section of the paper
title
Toolformer: Language Models Can Teach Themselves to Use Tools
sections
          Abstract
  1 Introduction
  2 Approach
        Sampling API Calls
        Executing API Calls
        Filtering API Calls
        Model Finetuning
        Inference
  3 Tools
        Question Answering
        Calculator
        Wikipedia Search
        Machine Translation System
        Calendar
  4 Experiments
    4.1 Experimental Setup
        Dataset Generation
        Model Finetuning
        Baseline Models
    4.2 Downstream Tasks
      4.2.1 LAMA
      4.2.2 Math Datasets
      4.2.3 Question Answering
      4.2.4 Multilingual Question Answering
      4.2.5 Temporal Datasets
    4.3 Language Modeling
    4.4 Scaling Laws
  5 Analysis
        Decoding Strategy
        

{'input': 'explain about the Method',
 'chat_history': [HumanMessage(content="explain about the math expression about the Filtering api calls in the paper 'ToolFormer : Language Models Can Teach Themselves to Use Tools'"),
  AIMessage(content='The paper "ToolFormer: Language Models Can Teach Themselves to Use Tools" discusses the filtering of API calls in the context of language models. Here is an explanation of the math expression related to filtering API calls as presented in the paper:\n\nThe paper introduces a weighted cross-entropy loss function for a model M over a sequence of tokens xi,...,xn, where i represents the position of the API call ci in the sequence and ri represents the response from the API. The loss function is defined as:\n\nLi(z) = -∑j=i^n wj-i * log pM(xj | z, x1:j-1)\n\nIn this expression:\n- z represents the prefix for the model M.\n- wj-i is a sequence of weights.\n- pM(xj | z, x1:j-1) is the conditional probability of token xj given the prefix z and the token

### Todo : Make Agent with Llama

In [None]:
# Restrict this process to GPU 0.
# NOTE(review): this presumably must run before CUDA is first initialised to take effect -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# we can use 262k version of Llama!
model_name = 'gradientai/Llama-3-8B-Instruct-262k'

# Prefer bfloat16 when the GPU supports it. CUDA availability is checked first
# so the capability probe is never run on a machine without a usable GPU.
dtype = "float16"
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = "bfloat16"

# Expand "~" explicitly so the fallback resolves to the user's home directory
# instead of a literal "~" folder under the current working directory.
hf_cache_dir = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface"))

# Load the weights 4-bit quantised and let device_map="auto" place them.
# NOTE: this rebinds `model`, which earlier held the ChatOpenAI instance.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=getattr(torch, dtype), quantization_config=quantization_config,
                                             device_map="auto",  cache_dir=hf_cache_dir)
model.eval()

# Use the same cache directory as the model so both downloads land in one place.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=hf_cache_dir)

Downloading shards:  25%|██▌       | 1/4 [00:00<00:02,  1.20it/s]

### (deprecated) HTML 

This deprecated HTML-parsing code has been moved to `getpaper.py`.


In [12]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import PyPDF2
import re


In [20]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import PyPDF2
import re
import os

class GetPaper:
    """Look a paper up on Semantic Scholar and return its text.

    The ar5iv HTML rendering is tried first (when ``ar5iv_mode`` is on) because
    it exposes the section headers. If that page cannot be fetched, the PDF is
    downloaded from arXiv into ``path_db`` and read with PyPDF2 instead.
    """

    # Tag names treated as section headers when walking the HTML.
    _HEADER_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    _SEARCH_URL = 'https://api.semanticscholar.org/graph/v1/paper/search'
    _SEARCH_FIELDS = 'paperId,title,abstract,authors,citations,fieldsOfStudy,influentialCitationCount,isOpenAccess,openAccessPdf,publicationDate,publicationTypes,references,venue'
    # Seconds to wait for HTTP responses (PDF downloads get a longer budget).
    _TIMEOUT = 10
    _PDF_TIMEOUT = 60

    def __init__(self, ss_api_key, ar5iv_mode = True, path_db='./papers_db', page_limit = 5):
        """Store the configuration.

        Args:
            ss_api_key: Semantic Scholar API key sent as the ``x-api-key`` header.
            ar5iv_mode: when True, try the ar5iv HTML page before the PDF.
            path_db: directory where downloaded PDFs are stored.
            page_limit: maximum number of PDF pages read by ``read_pdf``.
        """
        self.ss_api_key = ss_api_key
        self.ar5iv_mode = ar5iv_mode
        self.path_db= path_db
        self.page_limit = page_limit

    def get_paper_info_by_title(self, title):
        """Return the first Semantic Scholar search hit for ``title``.

        Returns:
            The paper record (dict), or None when the request fails or the
            search returns no results.
        """
        # Passing the title through `params` URL-encodes it; formatting it into
        # the URL by hand breaks on characters such as '&', '#' or '+'.
        params = {'query': title, 'fields': self._SEARCH_FIELDS, 'limit': 1}
        headers = {'x-api-key': self.ss_api_key} if self.ss_api_key else {}
        try:
            response = requests.get(self._SEARCH_URL, params=params, headers=headers, timeout=self._TIMEOUT)
            response.raise_for_status()
            payload = response.json()
        except (RequestException, ValueError) as e:
            print(f"Error querying Semantic Scholar: {e}")
            return None

        data = payload.get('data')
        return data[0] if data else None

    def get_ar5iv_url(self, paper):
        """Build the ar5iv URL for ``paper`` from its open-access PDF link.

        The last path component of the PDF URL is used as the arXiv id.
        NOTE(review): this assumes the open-access link is an arXiv URL with a
        new-style id -- other hosts will yield an id ar5iv does not know.

        Returns:
            The ar5iv URL, or None when the record has no usable PDF link.
        """
        if not paper:
            return None
        # The key can be present with a None value, so `.get(key, {})` is not enough.
        pdf_info = paper.get('openAccessPdf') or {}
        pdf_url = pdf_info.get('url')
        if not pdf_url or 'http' not in pdf_url:
            return None

        arxiv_id = pdf_url.rstrip('/').split('/')[-1]
        # Drop a trailing ".pdf" so the id is valid for ar5iv and so that
        # download_pdf does not end up requesting "<id>.pdf.pdf".
        if arxiv_id.lower().endswith('.pdf'):
            arxiv_id = arxiv_id[:-4]
        return f"https://ar5iv.org/abs/{arxiv_id}"

    def get_soup_from_url(self, url):
        """Fetch ``url`` and parse it; return None (after printing) on any failure."""
        if not url:
            return None
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # raise on HTTP error status codes
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup
        except RequestException as e:
            print(f"Error fetching the URL: {e}")
            return None
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None

    def get_header_from_soup(self, soup):
        """Split the page headers into the title and the remaining headers.

        Returns:
            ``(title, header_list)`` where ``title`` is the text of the first
            h1-h6 tag and ``header_list`` holds ``(tag_name, text)`` pairs for
            the rest. A page without headers yields ``('', [])``.
        """
        headers = soup.find_all(list(self._HEADER_TAGS))
        header_list = [(header.name, header.text.strip()) for header in headers]
        if not header_list:
            return '', []
        return header_list[0][1], header_list[1:]

    def extract_text_under_headers(self, soup, text_list):
        """Return the text of every section whose header contains a given string.

        For each entry of ``text_list`` the first matching header is located and
        its following siblings are collected, in document order, until a header
        of the same or a higher level is reached. Entries with no matching
        header are skipped.
        """
        header_tags = self._HEADER_TAGS
        pieces = []

        for text in text_list:
            header_tag = soup.find(lambda tag: tag.name in header_tags and text in tag.get_text())
            if not header_tag:
                continue

            # The stop level stays fixed at the requested header's level, so
            # every nested sub-section (and its body) is included.
            section_level = int(header_tag.name[1])
            pieces.append(header_tag.get_text(strip=False))

            sibling = header_tag.find_next_sibling()
            while sibling is not None:
                if sibling.name in header_tags and int(sibling.name[1]) <= section_level:
                    break
                pieces.append(sibling.get_text(strip=False))
                sibling = sibling.find_next_sibling()

        content = ''.join(pieces)
        # Collapse runs of three or more newlines into a single blank line.
        return re.sub(r'\n{3,}', '\n\n', content)

    def list_section(self, header_list):
        """Render ``(tag, text)`` headers as an outline, indented two spaces per level."""
        # The level comes from the tag name: h1 -> 1, h2 -> 2, ...
        return ''.join('  ' * (int(tag[1]) - 1) + text + '\n' for tag, text in header_list)

    def download_pdf(self, arxiv_id):
        """
        Download the PDF of a paper given its arXiv ID, if it does not already exist.

        Returns:
            Path of the PDF inside ``path_db``.

        Raises:
            Exception: when arXiv does not answer with HTTP 200.
        """
        os.makedirs(self.path_db, exist_ok=True)

        file_path = os.path.join(self.path_db, f'{arxiv_id}.pdf')

        if os.path.exists(file_path):
            print(f"File {file_path} already exists. Skipping download.")
            return file_path

        pdf_url = f'https://arxiv.org/pdf/{arxiv_id}.pdf'
        response = requests.get(pdf_url, timeout=self._PDF_TIMEOUT)

        if response.status_code != 200:
            raise Exception('Error downloading PDF from arXiv')

        with open(file_path, 'wb') as file:
            file.write(response.content)
        return file_path

    def read_pdf(self, arxiv_id, end_page=None):
        """Return the whitespace-normalised text of a downloaded PDF.

        Pages 1..``end_page`` (inclusive) are read, capped by the document
        length and by ``self.page_limit``.

        Returns:
            The extracted text, or an error message string when the file is
            missing or cannot be read.
        """
        page_texts = []
        # Same location download_pdf writes to.
        file_path = os.path.join(self.path_db, f'{arxiv_id}.pdf')

        try:
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                total_pages = len(reader.pages)

                if end_page is None or end_page > total_pages:
                    end_page = total_pages

                # end_page is inclusive: range(1, end_page) skipped the last
                # page and returned nothing for single-page documents.
                for page_num in range(1, end_page + 1):
                    page = reader.pages[page_num-1]
                    # Guard against pages for which no text is returned.
                    page_texts.append(page.extract_text() or '')
                    if page_num == self.page_limit:
                        print('Page limit reached at', self.page_limit + 1)
                        break

        except FileNotFoundError:
            return f"Error: The file {file_path} does not exist."
        except Exception as e:
            return f"An error occurred while reading the file: {e}"

        return re.sub(r'\s+', ' ', ''.join(page_texts)).strip()

    def load_paper(self, title:str, sections:list=None):
        '''
        Load a paper by title.

        INPUT : title of paper,
                list of section titles to read (None or empty to get the
                section outline instead); a single string is accepted too
        OUTPUT : the section outline, the text of the requested sections, or
                 the PDF text when the ar5iv page is unavailable. Failures are
                 reported as a message string so a calling agent can react
                 instead of crashing.
        '''
        paper = self.get_paper_info_by_title(title)
        if not paper:
            return f"Could not find a paper titled '{title}' on Semantic Scholar. Check the title and try again."

        url = self.get_ar5iv_url(paper)
        if not url:
            return f"The paper '{title}' has no open-access PDF link, so its content cannot be loaded."

        # A model may pass one section as a bare string; iterating over it
        # would otherwise search for every single character.
        if isinstance(sections, str):
            sections = [sections]

        soup = self.get_soup_from_url(url) if self.ar5iv_mode else None
        if soup is not None:
            paper_title, header_list = self.get_header_from_soup(soup)
            if not sections:
                sections_list = self.list_section(header_list)
                instruction_for_agent = f'Here is the title and section of the paper\ntitle\n{paper_title}\nsections\n{sections_list}\n\n Use the \'loadpaper\' tool again, specifying the exact sections you want to view in detail.'
                return instruction_for_agent

            content = self.extract_text_under_headers(soup, sections)
            if not content:
                return f"None of the requested sections {sections} were found. Call the tool without sections to list the available sections."
            return content

        # ar5iv page missing, request failed, or ar5iv_mode is off: use the PDF.
        arxiv_id = url.split('/')[-1]
        try:
            self.download_pdf(arxiv_id)
        except Exception as e:
            # Deliberately broad: the caller gets a message instead of a traceback.
            return f"Could not download the PDF for '{title}': {e}"
        return self.read_pdf(arxiv_id)




In [21]:
GetPaperModule = GetPaper(ss_api_key)

In [22]:
# Smoke test of the in-notebook GetPaper copy.
title = 'Large Language Model Connected with Massive APIs'
# NOTE(review): `text_list` is not passed to load_paper, so this call returns the
# section outline (see the output below) rather than the 'Methodology' text.
text_list = ['Methodology']
text = GetPaperModule.load_paper(title)

In [23]:
print(text)


Here is the title and section of the paper
title
Gorilla: Large Language Model Connected with Massive APIs
sections
          Abstract
  1 Introduction
  2 Related Work
        Large Language Models
        Tool Usage
        LLMs for Program Synthesis
  3 Methodology
    3.1 Dataset Collection
        API Documentation
        Instruction Generation
    3.2 Gorilla
        API Call with Constraints
        Retriever-Aware training
        Gorilla Inference
    3.3 Verifying APIs
        AST Sub-Tree Matching
  4 Evaluation
        Baselines
        Retrievers
    4.1 AST Accuracy on API call
        Finetuning without Retrieval
        Finetuning with Retrieval
        Hallucination with LLM
    4.2 Test-Time Documentation Change
    4.3 API Call with Constraints
  5 Conclusion
  6 Limitations & Social Impacts
  7 Acknowledgement
  References
  8 Appendix
    8.1 Dataset Details
        Domain Classification
        API Call Task
        API Provider Component
        Explanation Elem