In [12]:
import os
from dotenv import load_dotenv

import sys
from semanticscholar import SemanticScholar

import requests

dotenv_path = '.env'
load_dotenv(dotenv_path=dotenv_path)

openai_api_key = os.getenv("OPENAI_API_KEY")
ss_api_key = os.getenv("SS_API_KEY")

# 전략 1
- 논문을 인용(citation)한 다른 논문들의 초록에서 한계점을 언급한 것이 있는지 찾아보기
- 논문을 참조(reference)한 다른 논문들의 초록에서 한계점을 언급한 것이 있는지 찾아보기
- 찾게 된다면 언급이 된 그 논문이 개선할 수 있는 아이디어일 것임

In [33]:
def get_paper_info_by_title(title, ss_api_key):
    """논문의 제목으로 정보를 가져오는 함수"""
    # Define the API endpoint URL
    url = 'https://api.semanticscholar.org/graph/v1/paper/search?query={}&fields=paperId,title,abstract,authors,citations,fieldsOfStudy,influentialCitationCount,isOpenAccess,openAccessPdf,publicationDate,publicationTypes,references,venue'
    
    headers = {'x-api-key': ss_api_key}
    response = requests.get(url.format(title), headers=headers).json()

    if response.get('data'):
        paper = response['data'][0]
        return paper
    else:
        return None

def get_citing_papers(paper_id, ss_api_key):
    """입력한 id에 해당하는 논문을 인용한 논문들의 제목과 초록을 가져오는 함수"""
    url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations?fields=title,abstract'
    
    headers = {'x-api-key': ss_api_key}
    response = requests.get(url, headers=headers).json()
    
    if response.get('data'):
        return response['data']
    else:
        return []

def get_referenced_papers(paper_id, ss_api_key):
    url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references?fields=title,abstract'
    
    headers = {'x-api-key': ss_api_key}
    response = requests.get(url, headers=headers).json()
    
    if response.get('data'):
        return response['data']
    else:
        return []

def find_limitations_in_paper(texts):
    """한계점이라고 언급된 부분을 감지하는 함수"""
    keywords = [
        'limitation', 'limitations', 'drawback', 'shortcoming', 'constraint', 'weakness',
        'future work', 'future direction', 'challenge', 'issue', 'problem', 'restriction',
        'flaw', 'hurdle', 'barrier', 'difficulty', 'risk', 'pitfall', 'concern', 'obstacle',
        'gap', 'disadvantage', 'incompleteness', 'improvement'
    ]
    limitations = []
    
    for text in texts:
        abstract = text.get('abstract', '').lower()
        if any(keyword in abstract for keyword in keywords):
            limitations.append((text['title'], text['abstract']))
    
    return limitations

In [34]:
# Example usage
title = "Toolformer: Language Models Can Teach Themselves to Use Tools"
paper = get_paper_info_by_title(title, ss_api_key)

if paper:
    paper_id = paper['paperId']
    citing_papers = get_citing_papers(paper_id, ss_api_key)
    limitations_in_citations = find_limitations_in_paper(citing_papers)
    
    if limitations_in_citations:
        print("\nLimitations found in citing papers:")
        for title, limitations in limitations_in_citations:
            print(f"Title: {title}")
            for limitation in limitations:
                print(f" - {limitation}")
    else:
        print("\nNo limitations found in citing papers.")
    
    # Get referenced papers and find limitations
    referenced_papers = get_referenced_papers(paper_id, ss_api_key)
    limitations_in_references = find_limitations_in_paper(referenced_papers)
    
    if limitations_in_references:
        print("\nLimitations found in referenced papers:")
        for title, limitations in limitations_in_references:
            print(f"Title: {title}")
            for limitation in limitations:
                print(f" - {limitation}")
    else:
        print("\nNo limitations found in referenced papers.")
else:
    print("No paper found with the given title.")


No limitations found in citing papers.

No limitations found in referenced papers.


# 전략 2
- 논문의 내용을 직접 가져와서 거기서 한계점 및 향후 연구 언급 부분을 찾아보기
- pdf 다운로드 방식보다는 웹에서 긁어올 수 있으면 좋겠음

In [48]:
import requests
from bs4 import BeautifulSoup

In [66]:
keywords = [
    'limitation', 'limitations', 'drawback', 'shortcoming', 'constraint', 'weakness',
    'future work', 'future direction', 'challenge', 'issue', 'problem', 'restriction',
    'flaw', 'hurdle', 'barrier', 'difficulty', 'risk', 'pitfall', 'concern', 'obstacle',
    'gap', 'disadvantage', 'incompleteness', 'improvement'
]

def get_arxiv_url(paper):
    "논문의 ar5iv 주소를 받아오는 함수"
    external_ids = paper.get('openAccessPdf', {})
    arxiv_id = external_ids.get('url')
    if 'http' in arxiv_id:
        arxiv_id = arxiv_id.split('/')[-1]
        return f"https://ar5iv.org/abs/{arxiv_id}"
    else:
        return None
    
def find_limitations_in_arxiv(url, sections:list=None):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    all_sections = soup.find_all('section')
    
    limitations = []
    for section in all_sections:
        section_title_tag = section.find('h2', class_='ltx_title_section')
        if section_title_tag:
            section_title = section_title_tag.get_text().strip().lower()
            if sections and not any(s.lower() in section_title for s in sections):
                print(section_title)
                continue

        paragraphs = section.find_all('p')
        for paragraph in paragraphs:
            text = paragraph.get_text().lower()
            if any(keyword in text for keyword in keywords):
                limitations.append(paragraph.get_text())
    
    if limitations:
        return list(set(limitations))
    else:
        return "No limitations found."

In [68]:
# Example usage
title = "Toolformer: Language Models Can Teach Themselves to Use Tools"
paper = get_paper_info_by_title(title, ss_api_key)

if paper:
    abstract = paper.get('abstract', '')
    print(f"Title: {paper['title']}")
    print(f"Abstract: {abstract}")
    
    arxiv_url = get_arxiv_url(paper)
    if arxiv_url:
        print(f"arXiv URL: {arxiv_url}")
        limitations = find_limitations_in_arxiv(arxiv_url, ['introduction'])
        if isinstance(limitations, list):
            print("Limitations found in the paper:")
            for limitation in limitations:
                print(f"- {limitation}")
        else:
            print(limitations)
    else:
        print("No arXiv ID available for this paper.")
else:
    print("No paper found with the given title.")

Title: Toolformer: Language Models Can Teach Themselves to Use Tools
Abstract: Language models (LMs) exhibit remarkable abilities to solve new tasks from just a few examples or textual instructions, especially at scale. They also, paradoxically, struggle with basic functionality, such as arithmetic or factual lookup, where much simpler and smaller models excel. In this paper, we show that LMs can teach themselves to use external tools via simple APIs and achieve the best of both worlds. We introduce Toolformer, a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction. This is done in a self-supervised way, requiring nothing more than a handful of demonstrations for each API. We incorporate a range of tools, including a calculator, a Q\&A system, two different search engines, a translation system, and a calendar. Toolformer achieves substantially improved zero-shot performance across a v

In [60]:
len(limitations)

25

In [None]:
query = 'Toolformer: Language Models Can Teach Themselves to Use Tools'
num = 20
threshold = 0.6
recommend = 5