In [1]:
import json
import os
from datetime import datetime

import arxiv
import regex as re
import requests
from langchain.document_loaders import ArxivLoader
from openai import OpenAI

from upload_notion import NotionDatabase, transform_date
# Shared clients and credentials used by the cells below.
# Secrets are read from the environment when present so that real keys never
# have to be written into the notebook. The original placeholder values remain
# the fallback, so behaviour is unchanged when the variables are unset.
client = arxiv.Client()
llm = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "sk-"))
database_id = os.environ.get("NOTION_DATABASE_ID", "")
notion_key = os.environ.get("NOTION_API_KEY", "")

In [2]:
def get_keywords(llm, docs):
    """Ask the chat model for up to five keywords describing a paper.

    Args:
        llm: Client exposing ``chat.completions.create`` (an ``OpenAI``
            instance in this notebook).
        docs: Sequence of loaded documents. The text is read from
            ``page_content`` of the first one, falling back to the second.
            Only the first 10000 characters are sent to the model.

    Returns:
        A list of at most five keyword strings. Surrounding whitespace,
        empty fragments and leading ``"1."``-style numbering are removed.
        Fewer than five entries are returned (with a printed warning) when
        the model's answer does not contain five keywords.
    """
    print("Extracting keywords from summary...")
    try:
        script = docs[0].page_content[:10000]
    except Exception:
        # First document unusable: fall back to the second one.
        script = docs[1].page_content[:10000]
    # NOTE: "{paper}" is never substituted; this text is sent verbatim as the
    # system message, and the paper itself travels in `question` below.
    template = """Choose 5 most important and well represent paper keywords of the following paper:
    "{paper}"
    KEYWORDS:
    """
    question = "Choose 5 most important and well represent paper keywords of the following paper:" + script + """
    example:
    keyword 1, keyword 2, keyword 3, keyword 4, keyword 5.
    separate keywords with comma and space.
    """

    answer = llm.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": template},
            {"role": "assistant", "content": question},
            {"role": "user", "content": "KEYWORDS:"},
        ],
        temperature=0.0,
        top_p=1.0,
    )
    answer = answer.choices[0].message.content or ""

    keywords = []
    # The model may separate keywords with commas or with line breaks.
    for fragment in re.split(",|\n", answer):
        # Remove list numbering such as "1. " together with the whitespace
        # that follows it, so "1. foo" becomes "foo" rather than " foo".
        keyword = re.sub(r"^\d+\.\s*", "", fragment.strip()).strip()
        if keyword:
            keywords.append(keyword)
        if len(keywords) == 5:
            break
    if len(keywords) < 5:
        print("Warning: Less than 5 keywords were extracted.")
    return keywords

In [3]:
def get_summary(llm, docs):
    """Request a Korean-language summary of a paper from the chat model.

    Args:
        llm: Client exposing ``chat.completions.create`` (an ``OpenAI``
            instance in this notebook).
        docs: Sequence of loaded documents. The text is read from
            ``page_content`` of the first one, falling back to the second.
            Only the first 10000 characters are sent to the model.

    Returns:
        The message content of the first choice in the completion response.
    """
    print("Extracting summary from paper...")
    try:
        script = docs[0].page_content[:10000]
    except Exception:
        # First document unusable: fall back to the second one. Catching
        # Exception (not everything) lets Ctrl-C / SystemExit propagate.
        script = docs[1].page_content[:10000]
    template = """
    SUMMARY:
    """
    question = "Write a summary of the following paper as korean :" + script
    # The instruction plus paper text is sent as the system message; the
    # "SUMMARY:" cue is repeated as the assistant and user turns.
    answer = llm.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": question},
            {"role": "assistant", "content": template},
            {"role": "user", "content": "SUMMARY:"},
        ],
        temperature=0.0,
        top_p=1.0,
    )
    return answer.choices[0].message.content

In [4]:
def get_metadata(init):
    """Load a paper from arXiv and look up its primary subject.

    Args:
        init: arXiv identifier (for example ``"2311.11628v1"``). It is used
            as the loader query and interpolated into the abs/pdf URLs.

    Returns:
        A tuple ``(docs, pdf_URL, taxonomy)``:
        ``docs`` is the list returned by ``ArxivLoader.load()`` (at most
        three documents are requested), ``pdf_URL`` is the PDF link built
        from ``init``, and ``taxonomy`` is the text of the
        ``primary-subject`` span on the abstract page.

    Raises:
        requests.HTTPError: If the abstract page request is not successful.
        ValueError: If the abstract page has no ``primary-subject`` span.
    """
    docs = ArxivLoader(query=init, load_max_docs=3).load()
    print("Getting metadata from arxiv...")
    pdf_URL = f"https://arxiv.org/pdf/{init}.pdf"
    # A timeout keeps one stalled request from hanging the whole run.
    response = requests.get(f"https://arxiv.org/abs/{init}", timeout=30)
    response.raise_for_status()
    subject = re.search(r'<span class="primary-subject">(.*?)</span>', response.text)
    if subject is None:
        # Fail with a clear message instead of AttributeError on None.group().
        raise ValueError(f"Primary subject not found on arXiv abstract page for {init}")
    taxonomy = subject.group(1)
    return docs, pdf_URL, taxonomy

In [6]:
# Search arXiv for `term`, summarise each hit with the LLM (reusing results
# stored in cache.json), and upload one page per paper to the Notion database.
term = "Table data augmentation, Categorical data, LLM, Tablular Data Generation"
# NOTE(review): "Tablular" looks like a typo for "Tabular"; left untouched
# because changing it alters the query and the stored search term - confirm.
# Accepted values: "relevance", "last_updated_date"; any other value sorts by
# submission date.
sort_by = "relevance"

sort_criteria = {
    "relevance": arxiv.SortCriterion.Relevance,
    "last_updated_date": arxiv.SortCriterion.LastUpdatedDate,
}
search = arxiv.Search(
    query=term,
    max_results=10,
    sort_by=sort_criteria.get(sort_by, arxiv.SortCriterion.SubmittedDate),
)
result = client.results(search)
all_results = list(result)
if len(all_results) == 0:
    print("No results found.")
    exit(0)

# Read the cache once up front; a missing file simply means nothing is cached.
try:
    with open("cache.json", "r") as cache_file:
        cache = json.load(cache_file)
except FileNotFoundError:
    cache = {}

for paper in all_results:
    # entry_id is a URL; its last path segment is the versioned arXiv id.
    init = paper.entry_id.split("/")[-1]
    docs, pdf_URL, taxonomy = get_metadata(init)
    if not docs:
        print(f"Error: No document could be loaded. id: {init}")
        continue
    print(f"Processing {init}... Title: {docs[0].metadata['Title']}")
    if init in cache:
        page_values = cache[init]
    else:
        try:
            summary = get_summary(llm, docs)
            keywords = get_keywords(llm, docs)
            published = datetime.strptime(docs[0].metadata['Published'], '%Y-%m-%d')
            page_values = {
                'Search term and sort by': ", ".join([term, sort_by]),
                'Taxonomy': taxonomy,
                'Keywords': keywords,
                'Summary': summary.strip(),
                'Title': docs[0].metadata['Title'],
                'Published': transform_date(published.strftime('%Y%m%d%H%M%S')),
                'URL': pdf_URL,
            }
            # Serialise before touching the file so a failure here cannot
            # leave a truncated cache.json or a bad entry in memory.
            serialized = json.dumps({**cache, init: page_values})
            with open("cache.json", "w") as cache_file:
                cache_file.write(serialized)
            cache[init] = page_values
        except Exception as error:
            print(f"Error: Unable to extract summary and keywords. id: {init}")
            print(f"Reason: {error!r}")
            continue
    # Reset before creating a new instance: without this, keywords get
    # duplicated. This appears to be an issue on the Notion side.
    notion_db = None
    print("Uploading to Notion...")
    notion_db = NotionDatabase(database_id, notion_key)
    notion_db.upload_page_values(page_values)


Getting metadata from arxiv...
Processing 2311.11628v1... Title: Incorporating LLM Priors into Tabular Learners
Uploading to Notion...
Getting metadata from arxiv...
Processing 2311.16267v2... Title: Novel Preprocessing Technique for Data Embedding in Engineering Code Generation Using Large Language Model
Uploading to Notion...
Getting metadata from arxiv...
Processing 2312.12112v2... Title: Curated LLM: Synergy of LLMs and Data Curation for tabular augmentation in ultra low-data regimes
Uploading to Notion...
Getting metadata from arxiv...
Processing 2311.01918v1... Title: Large Language Models Illuminate a Progressive Pathway to Artificial Healthcare Assistant: A Review
Uploading to Notion...
Getting metadata from arxiv...
Processing 2402.09939v1... Title: Generative AI in the Construction Industry: A State-of-the-art Analysis
Uploading to Notion...
Getting metadata from arxiv...
Processing 2402.12869v1... Title: Exploring the Impact of Table-to-Text Methods on Augmenting LLM-based Q