In [6]:
import os
from langchain.docstore.document import Document
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model used as the generation step of the RAG chain built further down.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.9,
)


In [7]:
import logging
import zipfile

import requests

logging.basicConfig(level=logging.INFO)

# Download the benchmark archive and unpack it into the current working directory.
data_url = "https://storage.googleapis.com/benchmarks-artifacts/langchain-docs-benchmarking/cj.zip"
filename = "cj.zip"

# A timeout keeps the cell from hanging forever on a stalled connection.
result = requests.get(data_url, timeout=60)
# Stop here on a 4xx/5xx response; otherwise the error page would be written
# to disk as "cj.zip" and only blow up later inside zipfile.
result.raise_for_status()

with open(filename, "wb") as file:
    file.write(result.content)

with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall()

In [8]:
from langchain_community.document_loaders import PyPDFLoader

# Parse the extracted PDF into a list of loaded documents.
loader = PyPDFLoader("./cj/cj.pdf")
docs = loader.load()
tables = []  # stays empty in the cells shown here

# Pull the text out of every loaded document and merge it into one
# space-separated string for chunking.
texts = [item.page_content for item in docs]
full_document = " ".join(texts)


In [9]:
# Raw sentence-transformers handle for MiniLM.
# NOTE(review): `model` is not referenced by any other cell visible here, and
# the log output shows the same weights being loaded a second time below —
# confirm whether this line is still needed.
model = SentenceTransformer("all-MiniLM-L6-v2")

# LangChain embedding wrapper; this is the object handed to the vector store.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [10]:
# Break the combined text into chunks of at most 1000 characters, with
# 200 characters of overlap configured between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_text(full_document)


In [11]:
# Wrap each text chunk in a Document so the vector store can ingest it.
split_docs = [
    Document(page_content=piece)
    for piece in splits
]


In [14]:
# Embed the document chunks using the embedding model and store them in a vectorstore.
# Every chunk gets an id derived from its position so that executing this cell
# again writes to the same entries instead of inserting a second copy of each
# chunk under new random ids.
# NOTE(review): this relies on the default in-process Chroma collection being
# reused across runs in one kernel — confirm for the installed chromadb version.
vectorstore = Chroma.from_documents(
    documents=split_docs,
    embedding=embedding_model,
    ids=[f"cj-chunk-{position}" for position in range(len(split_docs))],
)

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [25]:
# Expose the vector store through the retriever interface (no arguments, so
# whatever defaults as_retriever() applies are used).
retriever = vectorstore.as_retriever()
# Pull the "rlm/rag-prompt" prompt from the LangChain hub.
prompt_template = hub.pull("rlm/rag-prompt")

In [20]:
def format_docs(docs):
    """Join the ``page_content`` of every item in ``docs`` into one string.

    Args:
        docs: Iterable of objects that expose a ``page_content`` string.

    Returns:
        The contents in iteration order, separated by a blank line
        (``"\\n\\n"``); an empty string when ``docs`` yields nothing.
    """
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)

In [26]:
# RAG pipeline: the input is routed two ways — through the retriever and
# format_docs to fill "context", and passed through as "question" — then the
# filled prompt goes to the LLM and the reply is parsed to a plain string.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt_template
    | llm
    | StrOutputParser()
)

In [27]:
# This one string is used both as the retrieval query and as the question
# inserted into the prompt (see the chain's input mapping).
user_prompt = "help me summarize this"

# Run the whole chain; `res` receives the output of StrOutputParser.
res = rag_chain.invoke(user_prompt)

I0000 00:00:1727603877.958992 1791289 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [28]:
# Bare expression so the notebook shows the generated answer as cell output.
res

'This article from Clouded Judgement discusses the EV/NTM revenue multiple divided by NTM consensus growth expectations. The author analyzes the median multiples for different growth categories, including high growth, mid growth, and low growth. The article also notes that past performance is not indicative of future results. \n'