In [2]:
import os
from dotenv import load_dotenv

import sys
from semanticscholar import SemanticScholar

import requests

dotenv_path = '.env'
load_dotenv(dotenv_path=dotenv_path)

openai_api_key = os.getenv("OPENAI_API_KEY")
ss_api_key = os.getenv("SS_API_KEY")

# 전략 1
- 논문을 인용(citation)한 다른 논문들의 초록에서 한계점을 언급한 것이 있는지 찾아보기
- 논문을 참조(reference)한 다른 논문들의 초록에서 한계점을 언급한 것이 있는지 찾아보기
- 찾게 된다면 언급이 된 그 논문이 개선할 수 있는 아이디어일 것임

In [6]:
def get_paper_info_by_title(title, ss_api_key):
    """논문의 제목으로 정보를 가져오는 함수"""
    # Define the API endpoint URL
    url = 'https://api.semanticscholar.org/graph/v1/paper/search?query={}&fields=paperId,title,abstract,authors,citations,fieldsOfStudy,influentialCitationCount,isOpenAccess,openAccessPdf,publicationDate,publicationTypes,references,venue'
    
    headers = {'x-api-key': ss_api_key}
    response = requests.get(url.format(title), headers=headers).json()

    if response.get('data'):
        paper = response['data'][0]
        return paper
    else:
        return None

def get_citing_papers(paper_id, ss_api_key):
    """입력한 id에 해당하는 논문을 인용한 논문들의 제목과 초록을 가져오는 함수"""
    url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations?fields=title,abstract'
    
    headers = {'x-api-key': ss_api_key}
    response = requests.get(url, headers=headers).json()
    
    if response.get('data'):
        return response['data']
    else:
        return []

def get_referenced_papers(paper_id, ss_api_key):
    url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/references?fields=title,abstract'
    
    headers = {'x-api-key': ss_api_key}
    response = requests.get(url, headers=headers).json()
    
    if response.get('data'):
        return response['data']
    else:
        return []

def find_limitations_in_paper(texts):
    """한계점이라고 언급된 부분을 감지하는 함수"""
    keywords = [
        'limitation', 'limitations', 'drawback', 'shortcoming', 'constraint', 'weakness',
        'future work', 'future direction', 'challenge', 'issue', 'problem', 'restriction',
        'flaw', 'hurdle', 'barrier', 'difficulty', 'risk', 'pitfall', 'concern', 'obstacle',
        'gap', 'disadvantage', 'incompleteness', 'improvement'
    ]
    limitations = []
    
    for text in texts:
        abstract = text.get('abstract', '').lower()
        if any(keyword in abstract for keyword in keywords):
            limitations.append((text['title'], text['abstract']))
    
    return limitations

In [7]:
# Example usage
title = "Toolformer: Language Models Can Teach Themselves to Use Tools"
paper = get_paper_info_by_title(title, ss_api_key)

if paper:
    paper_id = paper['paperId']
    citing_papers = get_citing_papers(paper_id, ss_api_key)
    limitations_in_citations = find_limitations_in_paper(citing_papers)
    
    if limitations_in_citations:
        print("\nLimitations found in citing papers:")
        for title, limitations in limitations_in_citations:
            print(f"Title: {title}")
            for limitation in limitations:
                print(f" - {limitation}")
    else:
        print("\nNo limitations found in citing papers.")
    
    # Get referenced papers and find limitations
    referenced_papers = get_referenced_papers(paper_id, ss_api_key)
    limitations_in_references = find_limitations_in_paper(referenced_papers)
    
    if limitations_in_references:
        print("\nLimitations found in referenced papers:")
        for title, limitations in limitations_in_references:
            print(f"Title: {title}")
            for limitation in limitations:
                print(f" - {limitation}")
    else:
        print("\nNo limitations found in referenced papers.")
else:
    print("No paper found with the given title.")


No limitations found in citing papers.

No limitations found in referenced papers.


# 전략 2
- 논문의 내용을 직접 가져와서 거기서 한계점 및 향후 연구 언급 부분을 찾아보기
- pdf 다운로드 방식보다는 웹에서 긁어올 수 있으면 좋겠음

In [8]:
import requests
from bs4 import BeautifulSoup

In [9]:
keywords = [
    'limitation', 'limitations', 'drawback', 'shortcoming', 'constraint', 'weakness',
    'future work', 'future direction', 'challenge', 'issue', 'problem', 'restriction',
    'flaw', 'hurdle', 'barrier', 'difficulty', 'risk', 'pitfall', 'concern', 'obstacle',
    'gap', 'disadvantage', 'incompleteness', 'improvement'
]

def get_arxiv_url(paper):
    "논문의 ar5iv 주소를 받아오는 함수"
    external_ids = paper.get('openAccessPdf', {})
    arxiv_id = external_ids.get('url')
    if 'http' in arxiv_id:
        arxiv_id = arxiv_id.split('/')[-1]
        return f"https://ar5iv.org/abs/{arxiv_id}"
    else:
        return None
    
def find_limitations_in_arxiv(url, sections:list=None):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    all_sections = soup.find_all('section')
    
    limitations = []
    for section in all_sections:
        # section_title_tag = section.find('h2', class_='ltx_title_section')
        # if section_title_tag:
        #     section_title = section_title_tag.get_text().strip().lower()
        #     if sections and not any(s.lower() in section_title for s in sections):
        #         print(section_title)
        #         continue

        paragraphs = section.find_all('p')
        for paragraph in paragraphs:
            text = paragraph.get_text().lower()
            if any(keyword in text for keyword in keywords):
                limitations.append(paragraph.get_text())
    
    if limitations:
        return list(set(limitations))
    else:
        return "No limitations found."

In [10]:
# Example usage
title = "Toolformer: Language Models Can Teach Themselves to Use Tools"
paper = get_paper_info_by_title(title, ss_api_key)

if paper:
    abstract = paper.get('abstract', '')
    print(f"Title: {paper['title']}")
    print(f"Abstract: {abstract}")
    
    arxiv_url = get_arxiv_url(paper)
    if arxiv_url:
        print(f"arXiv URL: {arxiv_url}")
        limitations = find_limitations_in_arxiv(arxiv_url)
        if isinstance(limitations, list):
            print("Limitations found in the paper:")
            for limitation in limitations:
                print(f"- {limitation}")
        else:
            print(limitations)
    else:
        print("No arXiv ID available for this paper.")
else:
    print("No paper found with the given title.")

Title: Toolformer: Language Models Can Teach Themselves to Use Tools
Abstract: Language models (LMs) exhibit remarkable abilities to solve new tasks from just a few examples or textual instructions, especially at scale. They also, paradoxically, struggle with basic functionality, such as arithmetic or factual lookup, where much simpler and smaller models excel. In this paper, we show that LMs can teach themselves to use external tools via simple APIs and achieve the best of both worlds. We introduce Toolformer, a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction. This is done in a self-supervised way, requiring nothing more than a handful of demonstrations for each API. We incorporate a range of tools, including a calculator, a Q\&A system, two different search engines, a translation system, and a calendar. Toolformer achieves substantially improved zero-shot performance across a v

# 전략 3
- 전략 2처럼 하면 참조한 논문들의 한계나 선택하지 않은 방법론들의 한계점 등도 너무 많이 잡힌다.
- 그래서 이 논문의 abstract, conclusion과 함께 이 논문을 인용한 논문들의 abstract를 주고 발전 방향에 대해 생각해보라고 지시하면 어떨까?
- 그럼 명확히 '한계점'은 아니라도 더 개선할 여지가 있는 지점을 얘기해줄 것 같다.

In [2]:
import semantic_scholoar_api as ss

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [3]:
from pprint import pprint

def get_paper_info_by_title(title, ss_api_key):
    """논문의 제목으로 정보를 가져오는 함수"""
    # Define the API endpoint URL
    url = 'https://api.semanticscholar.org/graph/v1/paper/search?query={}&fields=paperId,title,abstract,authors,citations,fieldsOfStudy,influentialCitationCount,isOpenAccess,openAccessPdf,publicationDate,publicationTypes,references,venue'
    
    headers = {'x-api-key': ss_api_key}
    response = requests.get(url.format(title), headers=headers).json()

    if response.get('data'):
        paper = response['data'][0]
        return paper
    else:
        return None

def get_citing_papers(paper_id, api_key):
    # Define the API endpoint URL
    url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations'

    # Define the query parameters
    # query_params = {'fields': 'title,authors,year,abstract,influentialCitationCount'}
    query_params = {'fields': 'title,year,influentialCitationCount,abstract'}

    # Define the headers
    headers = {'x-api-key': api_key}

    # Make the request
    response = requests.get(url, params=query_params, headers=headers).json()
    try:
        real_data = [data["citingPaper"] for data in response['data']]
        return real_data
    except:
        return []

query = "Toolformer: Language Models Can Teach Themselves to Use Tools"
paper_count = 5
paper_info = get_paper_info_by_title(query, ss_api_key)
paper_id = paper_info["paperId"]
query_paper_abstract = paper_info["abstract"]
citation_paper_abstract = []
if paper_id:
    # Get the papers that cite the given paper ID
    citing_papers = get_citing_papers(paper_id, ss_api_key)
    
    sorted_papers = sorted(citing_papers, key=lambda x: x.get('influentialCitationCount', 0), reverse=True)[:paper_count]

    print(f"{query}\nAbstract: {query_paper_abstract}\n")
    
    print(f"Papers citing '{query}' sorted by influential citation count:")
    for paper in sorted_papers:
        title = paper['title']
        # authors = ', '.join([author['name'] for author in paper['authors']])
        year = paper['year']
        abstract = paper.get('abstract', 'No abstract available')
        citation_paper_abstract.append(abstract)
        influential_citation_count = paper.get('influentialCitationCount', 0)
        print(f"\nTitle: {title}\nYear: {year}\nInfluential Citation Count: {influential_citation_count}\nAbstract: {abstract}\n")
else:
    print("No paper found with the given title.")

Toolformer: Language Models Can Teach Themselves to Use Tools
Abstract: Language models (LMs) exhibit remarkable abilities to solve new tasks from just a few examples or textual instructions, especially at scale. They also, paradoxically, struggle with basic functionality, such as arithmetic or factual lookup, where much simpler and smaller models excel. In this paper, we show that LMs can teach themselves to use external tools via simple APIs and achieve the best of both worlds. We introduce Toolformer, a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction. This is done in a self-supervised way, requiring nothing more than a handful of demonstrations for each API. We incorporate a range of tools, including a calculator, a Q\&A system, two different search engines, a translation system, and a calendar. Toolformer achieves substantially improved zero-shot performance across a variety 

In [4]:
system_prompt = f""" 
내가 쓴 논문의 초록과, 내 논문을 인용한 논문들의 초록을 몇가지 보여주겠다. 
내 논문을 바탕으로 어떤 점을 개선시켰는지, 또는 어떤 점에서 아이디어를 얻었는지 추론하라. 
그리고 그것을 종합하여 내 논문에서 영감을 주는 포인트를 보기 좋게 정리하라.
"""
user_prompt = ""
user_prompt += f"""
### 내 논문의 초록 
{query_paper_abstract}

"""
for idx, citabs in enumerate(citation_paper_abstract):
    user_prompt += f"""### 내 논문을 인용한 논문의 초록 {idx + 1}\n"""
    user_prompt += f"""{citation_paper_abstract[idx]}\n\n"""

user_prompt += """
### 답변 형식 예시
# "이 논문은 A(인용 논문 1에 대한 내용)라는 부분에서 영감을 줄 수 있고, B(인용 논문 2에 대한 내용) 부분에서 추가적인 시도가 있을 수 있습니다. C(인용 논문 3에 대한 내용) 부분에서는..."
"""

In [5]:
print(system_prompt)
print(user_prompt)

 
내가 쓴 논문의 초록과, 내 논문을 인용한 논문들의 초록을 몇가지 보여주겠다. 
내 논문을 바탕으로 어떤 점을 개선시켰는지, 또는 어떤 점에서 아이디어를 얻었는지 추론하라. 
그리고 그것을 종합하여 내 논문에서 영감을 주는 포인트를 보기 좋게 정리하라.


### 내 논문의 초록 
Language models (LMs) exhibit remarkable abilities to solve new tasks from just a few examples or textual instructions, especially at scale. They also, paradoxically, struggle with basic functionality, such as arithmetic or factual lookup, where much simpler and smaller models excel. In this paper, we show that LMs can teach themselves to use external tools via simple APIs and achieve the best of both worlds. We introduce Toolformer, a model trained to decide which APIs to call, when to call them, what arguments to pass, and how to best incorporate the results into future token prediction. This is done in a self-supervised way, requiring nothing more than a handful of demonstrations for each API. We incorporate a range of tools, including a calculator, a Q\&A system, two different search engines, a translation system, and a cale

In [6]:
import openai

# OpenAI API 키 설정
openai.api_key = openai_api_key

def generate_response(system_prompt, user_prompt):
    response = openai.ChatCompletion.create(
        model="gpt-4-turbo-2024-04-09",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    return response['choices'][0]['message']['content']

# 시스템 프롬프트와 사용자 프롬프트 설정
user_prompt = "Can you explain how to use OpenAI's GPT-4 API for generating text?"

# 응답 생성
response = generate_response(system_prompt, user_prompt)
print(response)

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
