In [12]:
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from dotenv import load_dotenv
from langchain.vectorstores import FAISS

# Load variables from a local .env file into the process environment before
# any of the model/embedding clients further down are constructed.
# NOTE(review): presumably this is where the Google API key comes from —
# confirm against the local .env file.
load_dotenv()

True

In [13]:
#Reading PDFs
def get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page of the given PDFs.

    Args:
        pdf_docs: Iterable of PDF sources that ``PdfReader`` accepts (the
            notebook passes file paths).

    Returns:
        str: The text of all pages, in document order then page order, joined
        without any separator. An empty string when ``pdf_docs`` is empty.
    """
    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Defensive: treat a missing (None) extraction result as empty text
            # so a single page without extractable text cannot abort the whole
            # run with a TypeError on string concatenation.
            page_texts.append(page.extract_text() or "")
    # Join once at the end instead of growing a string with repeated `+=`.
    return "".join(page_texts)

In [14]:
#Creating Chunks
def get_text_chunks(text, chunk_size=2000, chunk_overlap=200):
    """Split ``text`` into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The full text to split.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to 2000,
            the value this notebook has always used.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter. Defaults
            to 200, the value this notebook has always used.

    Returns:
        The list of chunks returned by the splitter's ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

In [15]:
# Shared embedding client: used both when the FAISS index is built and when it
# is loaded back from disk.
# NOTE(review): task_type is pinned to "retrieval_query" even though this same
# object also embeds the indexed chunks — confirm that this is intended.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    task_type="retrieval_query",
)

In [16]:
#generating vector store
def get_vector_store(text_chunks, index_path="faiss_index"):
    """Embed the chunks into a FAISS index, save it locally, and return it.

    Uses the module-level ``embeddings`` object to embed the texts.

    Args:
        text_chunks: The texts to embed and index.
        index_path: Location passed to ``save_local``. Defaults to
            "faiss_index", the path this notebook has always used.

    Returns:
        The vector store produced by ``FAISS.from_texts``, so callers can use
        it directly instead of having to reload the saved index from disk.
    """
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local(index_path)
    return vector_store

In [17]:
# Chat model used as the generation step of the RAG chain.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    temperature=0.3,
)

# Prompt pulled from the LangChain hub by its identifier.
prompt = hub.pull("rlm/rag-prompt")

In [24]:
# Build the index: read the listed PDF files, chunk the combined text, then
# embed the chunks and save the vector store.
raw_text = get_pdf_text(
    [
        './CLIP_2103.00020v1.pdf',
        'Apple_DCLM_2406.11794v3.pdf',
        'BERT_1810.04805v2.pdf',
        'Carbon_Footprint_2204.05149v1.pdf',
        'Computer_Vision_Dataset_Balance1908.04913v1.pdf',
        'Context_Length_2407.11963v1.pdf',
        'Distil-BERT_1910.01108v4.pdf',
        'Gemini_2312.11805v4.pdf',
        'Roberta_1911.02116v2.pdf',
        'SigLIP_2303.15343v4.pdf',
        'T5_1910.09700v2.pdf',
    ]
)
text_chunks = get_text_chunks(raw_text)
get_vector_store(text_chunks)

In [None]:
# Reload the index saved under "faiss_index" and expose it as a retriever.
# allow_dangerous_deserialization is an explicit opt-in: keep it only for
# index files written by this notebook, never for files from untrusted sources.
new_db = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)
retriever = new_db.as_retriever()

In [25]:
def _format_docs(docs):
    """Join the page_content of the retrieved documents into one string."""
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: the input question is sent to the retriever, the retrieved
# documents are flattened to plain text for the prompt's "context" slot, and
# the question itself is passed through unchanged. Without the formatting
# step the prompt would receive the raw list of Document objects rather than
# their text.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [28]:
# Example query: a passage pasted verbatim (line breaks and hyphenation
# included) and sent through the chain as the question.
rag_chain.invoke(
    "we evaluate the proposed SigLiT and\n"
    "SigLIP models across a wide range of batch sizes. We dis-\n"
    "cuss what can be achieved with a small number of accel-\n"
    "erator chips, using both SigLiT and SigLIP recipes."
)

'The authors evaluate the SigLiT and SigLIP models across a wide range of batch sizes. They discuss what can be achieved with a small number of accelerator chips, using both SigLiT and SigLIP recipes. They also briefly discuss the impact of batch size on multilingual language image pre-training.'