In [1]:
## DATA ingestion
from langchain_community.document_loaders import TextLoader 
text_loader = TextLoader('speech.txt', encoding='charmap')
text_documents = text_loader.load()
text_documents

[Document(metadata={'source': 'speech.txt'}, page_content='â\x80\x9cIf, I say, for I would not impress by declamation when Reason offers her sober light, if they be really capable of acting like rational creatures, let them not be treated like slaves; or, like the brutes who are dependent on the reason of man, when they associate with him; but cultivate their minds, give them the salutary, sublime curb of principle, and let them attain conscious dignity by feeling themselves only dependent on God. Teach them, in common with man, to submit to necessity, instead of giving, to render them more pleasing, a sex to morals.\n\nFurther, should experience prove that they cannot attain the same degree of strength of mind, perseverance, and fortitude, let their virtues be the same in kind, though they may vainly struggle for the same degree; and the superiority of man will be equally clear, if not clearer; and truth, as it is a simple principle, which admits of no modification, would be common to

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

groq_api_key = os.getenv('GROQ_API_KEY')

if groq_api_key is not None:
	os.environ['GROQ_API_KEY'] = groq_api_key
else:
	raise ValueError("GROQ_API_KEY is not set in the environment variables.")

In [3]:
# WEB BASED LOADER
from langchain_community.document_loaders import WebBaseLoader
from bs4 import SoupStrainer  # Correct import

# load, chunk and index the content of the html page
loader = WebBaseLoader(
    web_path="https://www.perplexity.ai/hub/blog/accelerating-sonar-through-speculation",
    bs_kwargs=dict(parse_only=SoupStrainer(
        class_=("post-title", "post_content", "post_header")
    ))
)


text_documents=loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
text_documents

[Document(metadata={'source': 'https://www.perplexity.ai/hub/blog/accelerating-sonar-through-speculation'}, page_content='')]

In [10]:
# data ingestion from pdf
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('2502.15840v1.pdf')
docs=loader.load()


In [11]:
docs

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-02-25T01:04:48+00:00', 'author': '', 'keywords': '', 'moddate': '2025-02-25T01:04:48+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '2502.15840v1.pdf', 'total_pages': 28, 'page': 0, 'page_label': '1'}, page_content='Vending-Bench: A Benchmark for Long-Term Coherence\nof Autonomous Agents\nAndon Labs\nAxel Backlund and Lukas Petersson\nfounders@andonlabs.com\nFebruary 2025\nAbstract\nWhile Large Language Models (LLMs) can exhibit impressive proficiency in isolated,\nshort-term tasks, they often fail to maintain coherent performance over longer time horizons. In\nthis paper, we present Vending-Bench, a simulated environment designed to specifically test an\nLLM-based agent’s ability to manage a straightforward, long-running business scenario: operat-\ning a

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Example usage to avoid "not accessed" warning
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents=text_splitter.split_documents(text_documents)
documents[:500]

[]