In [None]:
"""
Workflow:
1. Obtaining the pdfs
2. Processing the pdf to extract text
3. Converting texts into chunks
4. Converting chunks into embeddings
5. Storing the embeddings in faiss
6. Obtaining the user query
7. Processing the query and converting into embeddings
8. Performing similarity search in the vector store
9. Retrieving the results and prompting the LLM with the context and question
10. Displaying the result
"""

## Extracting text from PDFs

In [None]:

from PyPDF2 import PdfReader
import os


def get_pdf_texts(pdf_filepaths):
    """
    Extract and concatenate the text of every page of the given PDF files.

    Args:
        pdf_filepaths: Iterable of paths to PDF files, processed in order.

    Returns:
        One string containing the text of all pages of all readable files,
        joined without any separator. Empty string if nothing was extracted.

    Files that do not exist are reported on stdout and skipped. Any other
    error raised while opening or parsing a file propagates to the caller.
    """
    # Collect page texts in a list and join once instead of growing a string
    # with repeated `+=`.
    page_texts = []
    for filepath in pdf_filepaths:
        try:
            with open(filepath, 'rb') as pdf_file:  # PDFs must be opened in binary mode
                reader = PdfReader(pdf_file)
                for page in reader.pages:
                    # A page with no extractable text may produce an empty (or,
                    # depending on the PyPDF2 version, missing) result; treat
                    # both as "no text" rather than crashing on concatenation.
                    page_texts.append(page.extract_text() or "")
        except FileNotFoundError:
            print(f"Error: File '{filepath}' not found. Skipping.")

    return "".join(page_texts)

filename = 'pdf1.pdf'  # NOTE(review): not used below; every PDF in the directory is processed

pdf_directory = os.getcwd()  # PDFs are looked up in the current working directory
# Sort the listing so the concatenation order is reproducible (os.listdir
# returns entries in arbitrary order) and match the extension
# case-insensitively so files such as "REPORT.PDF" are picked up too.
pdf_files = [
    os.path.join(pdf_directory, entry)
    for entry in sorted(os.listdir(pdf_directory))
    if entry.lower().endswith('.pdf')
]

# Always bind `extracted_text` so later cells do not fail with NameError when
# the directory contains no PDF.
extracted_text = ""
if pdf_files:
    extracted_text = get_pdf_texts(pdf_files)
    print(extracted_text)
else:
    print("No PDF files found in the current directory.")

## Converting texts into chunks

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def get_text_chunks(text, chunk_size=5000, chunk_overlap=2500):
    """
    Split a text into overlapping chunks with RecursiveCharacterTextSplitter.

    Args:
        text: The full text to split.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

In [13]:
# Split the text extracted from the PDFs into overlapping chunks for embedding.
chunks = get_text_chunks(extracted_text)

## Converting chunks into embeddings and storing them in FAISS

In [11]:
from langchain.vectorstores import FAISS
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
import os
import google.generativeai as genai

# Load variables from a local .env file into the process environment.
load_dotenv()

api_key = os.getenv("GOOGLE_API")
if not api_key:
    # Fail fast with an actionable message instead of configuring the client
    # with a missing key and failing later on the first API call.
    raise RuntimeError(
        "Environment variable GOOGLE_API is not set; add it to your .env file "
        "or export it before running this notebook."
    )

# The langchain-google-genai wrappers used below look for GOOGLE_API_KEY when
# no key is passed explicitly; mirror the value without overriding an
# existing setting.
os.environ.setdefault("GOOGLE_API_KEY", api_key)
genai.configure(api_key=api_key)

In [12]:
def get_vector_store(text_chunks, index_path="faiss_index", embedding_model="models/embedding-001"):
    """
    Embed the text chunks and persist them as a FAISS index on disk.

    Args:
        text_chunks: List of strings to embed and index.
        index_path: Folder path handed to ``save_local``.
        embedding_model: Model name passed to GoogleGenerativeAIEmbeddings.

    Returns:
        None. The index is written to ``index_path`` as a side effect.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    if not text_chunks:
        # Building an index from nothing fails deep inside the vector store
        # with an unhelpful error; report the real problem up front.
        raise ValueError("text_chunks is empty; nothing to index.")

    embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local(index_path)

In [14]:
# Build the FAISS index from the chunks and save it to disk.
get_vector_store(chunks)

In [15]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain

def get_qa_chain(model="gemini-pro", temperature=0.9):
    """
    Build the question-answering chain used to answer from retrieved context.

    Args:
        model: Chat model name passed to ChatGoogleGenerativeAI.
        temperature: Sampling temperature passed to the chat model.

    Returns:
        The chain created by ``load_qa_chain`` with ``chain_type='stuff'``
        and a prompt exposing the variables ``context`` and ``query``.
    """
    # The prompt tells the model to answer only from the supplied context and
    # to reply "I dont know the answer" otherwise. The text is sent to the
    # model verbatim, so it is kept exactly as authored.
    template = """
    Answer the given question according to the context provided. Include the details.
    If answer is not in the provided context simply answer "I dont know the answer", donot provide wrong answer
    \n
    context: {context}
    \n
    question: {query}
    \n

    Answer: 
"""

    llm = ChatGoogleGenerativeAI(
        model=model,
        temperature=temperature,
        convert_system_message_to_human=True,
    )
    prompt = PromptTemplate(template=template, input_variables=['context', 'query'])

    return load_qa_chain(llm=llm, chain_type='stuff', prompt=prompt)

In [23]:
def get_response(query, index_path="faiss_index"):
    """
    Answer a question from the saved FAISS index and print the result.

    Loads the index stored at ``index_path``, retrieves the documents most
    similar to ``query``, passes them together with the query to the chain
    from ``get_qa_chain`` and prints the chain's output.

    Args:
        query: The user's question.
        index_path: Folder path the FAISS index is loaded from.

    Returns:
        None. The response is printed to stdout.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # SECURITY: load_local unpickles data from disk, and
    # allow_dangerous_deserialization=True opts in to that. Only load an
    # index that this notebook (or another trusted source) created.
    db = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
    relevant_docs = db.similarity_search(query)

    chain = get_qa_chain()
    response = chain(
        {
            "input_documents": relevant_docs,
            "query": query,
        },
        return_only_outputs=True,
    )

    print(response)

# Example query against the saved index; get_response prints the answer.
get_response("What is the capital of Nepal")

{'output_text': 'I dont know the answer'}
