In [33]:
from tools import fetch_arxiv_papers

papers = fetch_arxiv_papers('Language Models', 10)

In [39]:
[paper["title"] for paper in papers]

['Re-Align: Aligning Vision Language Models via Retrieval-Augmented Direct Preference Optimization',
 'UniGuardian: A Unified Defense for Detecting Prompt Injection, Backdoor Attacks and Adversarial Attacks in Large Language Models',
 'STEER-ME: Assessing the Microeconomic Reasoning of Large Language Models',
 'Performance Evaluation of Large Language Models in Statistical Programming',
 'Text2World: Benchmarking Large Language Models for Symbolic World Model Generation',
 'SimpleVQA: Multimodal Factuality Evaluation for Multimodal Large Language Models',
 'Do we still need Human Annotators? Prompting Large Language Models for Aspect Sentiment Quad Prediction',
 'Towards a Design Guideline for RPA Evaluation: A Survey of Large Language Model-Based Role-Playing Agents',
 'B-cos LM: Efficiently Transforming Pre-trained Language Models for Improved Explainability',
 'Reasoning-to-Defend: Safety-Aware Reasoning Can Defend Large Language Models from Jailbreaking']

In [48]:
from llama_index.core import Document

def create_documents_from_papers(papers):
    documents = []
    for paper in papers:
        content = (
            f"Title: {paper['title']}\n"
            f"Published: {paper['published']}\n"
            f"Authors: {', '.join(paper['authors'])}\n"
            f"Journal: {paper['journal_ref']}\n"
            f"DOI: {paper['doi']}\n"
            f"Primary category: {paper['primary_category']}\n"
            f"Categories: {', '.join(paper['categories'])}\n"
            f"Summary: {paper['summary']}\n"
            f"URL: {paper['pdf_url']}\n"
            f"Arxiv URL: {paper['arxiv_url']}\n"
        )

        documents.append(Document(text=content))
    return documents

In [49]:
documents = create_documents_from_papers(papers)

In [52]:
documents


[Document(id_='e443a13b-14ff-4dc6-ba87-27ae598b96b7', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Title: Re-Align: Aligning Vision Language Models via Retrieval-Augmented Direct Preference Optimization\nPublished: 2025-02-18 18:59:57+00:00\nAuthors: Shuo Xing, Yuping Wang, Peiran Li, Ruizheng Bai, Yueqi Wang, Chengxuan Qian, Huaxiu Yao, Zhengzhong Tu\nJournal: None\nDOI: None\nPrimary category: cs.CV\nCategories: cs.CV, cs.LG\nSummary: The emergence of large Vision Language Models (VLMs) has broadened the scope\nand capabilities of single-modal Large Language Models (LLMs) by integrating\nvisual modalities, thereby unlocking transformative cross-modal applications in\na variety of real-world scenarios. Despite their impressive performance, VLMs\nare prone to significant hallucinations, particularly 

In [55]:
from llama_index.core import Settings, VectorStoreIndex
from constants import embed_model

Settings.chunk_size = 1024  
Settings.chunk_overlap = 50

index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

In [56]:
index.storage_context.persist('index/')