In [None]:
!pip install -q langchain faiss-cpu unstructured PyPDF2
!pip install -q huggingface_hub
!pip install -U langchain-community langchain-huggingface
!pip install -q langchain-huggingface
!pip install transformers datasets tqdm ddgs

In [None]:
import os
import json
from tqdm import tqdm
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# PDF Embedding

In [None]:
pdf_folder = "/kaggle/input/investing-books-pdf"  # change path if needed

# Collect the full path of every PDF directly inside pdf_folder.
# os.listdir() returns entries in arbitrary order, so sort them: downstream
# cells inspect documents/chunks by fixed index and need a stable ordering.
# The extension check is case-insensitive so "Book.PDF" is picked up as well.
pdf_files = sorted(
    os.path.join(pdf_folder, name)
    for name in os.listdir(pdf_folder)
    if name.lower().endswith(".pdf")
)

In [None]:
# Read every PDF in pdf_files and flatten the per-file results into one list.
# PyPDFLoader(...).load() returns a list of Document objects for each file.
all_documents = [
    document
    for pdf_path in pdf_files
    for document in PyPDFLoader(pdf_path).load()
]

print(f"Loaded {len(all_documents)} documents from PDFs.")

In [None]:
# Spot-check one loaded Document. The preferred index is 51, but it is clamped
# to the last available entry so a smaller corpus does not raise IndexError.
if all_documents:
    print(all_documents[min(51, len(all_documents) - 1)])
else:
    print("No documents were loaded.")

In [None]:
# 3) Split text into chunks
# Each chunk is at most 2000 units long and shares 350 units with its neighbour
# (units are whatever the splitter's length function counts).
splitter_config = {"chunk_size": 2000, "chunk_overlap": 350}
text_splitter = RecursiveCharacterTextSplitter(**splitter_config)

split_docs = text_splitter.split_documents(all_documents)
chunk_count = len(split_docs)
print(f"Created {chunk_count} text chunks.")

In [None]:
# Spot-check the text of one chunk. The preferred index is 159, clamped to the
# last chunk so a smaller corpus does not raise IndexError; shows nothing when
# there are no chunks at all.
split_docs[min(159, len(split_docs) - 1)].page_content if split_docs else None

In [None]:
# 4) Create embeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings

# HuggingFaceInstructEmbeddings loads the INSTRUCTOR model inside this process
# (through the InstructorEmbedding / sentence-transformers packages); it does
# not call the hosted HF Inference API. model_kwargs are forwarded to the model
# constructor, so "device" selects where inference runs.
# "hkunlp/instructor-large" is a public checkpoint, so no access token is
# passed. Never hard-code a token here; if a gated model is used later, read
# the token from an environment variable or a notebook secret instead.
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs={"device": "cpu"},  # run the model on CPU
)

In [None]:
# 5) Build FAISS vector store
# Embed every chunk in split_docs with `embeddings` and index the vectors in FAISS.
vector_db = FAISS.from_documents(
    documents=split_docs,
    embedding=embeddings,
)

# Web Retriever

In [None]:
from ddgs import DDGS

# Search phrase sent to DuckDuckGo.
query = "latest financial results of Apple 2025"

# Open a DDGS session for the duration of the search and keep at most
# three text hits for the query.
with DDGS() as ddgs:
    results = [hit for hit in ddgs.text(query, max_results=3)]

results

# Web PDF Retriever