Build a PDF-powered QA system using embeddings + LangChain.

In [None]:
import PyPDF2  # Library for reading and extracting text from PDF files
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS  # Vector store for storing and searching text embeddings
from langchain.embeddings import HuggingFaceEmbeddings  # Generates embeddings using HuggingFace models
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
import os

In [None]:
# In Google Colab, copy the Groq key from the notebook's secret store into the
# environment. Outside Colab the import fails and the key must already be set
# in the environment (or be passed to PDFQASystem directly).
try:
    from google.colab import userdata
except ImportError:
    # Not running in Colab; nothing to load.
    pass
else:
    _groq_key = userdata.get("GROQ_API_KEY")
    # os.environ only accepts strings; skip a missing/empty value instead of
    # crashing the cell with a TypeError.
    if _groq_key:
        os.environ["GROQ_API_KEY"] = _groq_key

In [None]:
class PDFQASystem:
    """Answer questions about one PDF using embeddings, FAISS and a Groq LLM.

    Intended call order: construct the object, call ``create_vector_store()``,
    then ``initialize_qa_chain()``, then ``query()`` as often as needed.
    """

    def __init__(self, pdf_path, chunk_size=1000, chunk_overlap=200, groq_api_key=None):
        """
        Initialize the PDF QA system.

        Args:
            pdf_path (str): Path to the PDF file
            chunk_size (int): Size of text chunks for processing
            chunk_overlap (int): Overlap between chunks
            groq_api_key (str, optional): Groq API key, if not using environment variable
        """
        self.pdf_path = pdf_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.groq_api_key = groq_api_key
        # Both are populated later by create_vector_store / initialize_qa_chain.
        self.vector_store = None
        self.qa_chain = None

        # Embedding model used to index the PDF chunks.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # Splitter that cuts the extracted text into overlapping chunks,
        # measuring chunk length in characters (len).
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len
        )

    def extract_text_from_pdf(self):
        """Extract the text of every page of the PDF.

        Returns:
            str | None: The concatenated page text, or None if the file could
            not be opened or parsed (the error is printed).
        """
        # Broad catch is deliberate: this is the boundary to file I/O and PDF
        # parsing, and any failure is reported to the caller as None.
        try:
            with open(self.pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # `or ""` guards against a page yielding None/empty text;
                # join avoids repeated string concatenation.
                return "".join(
                    page.extract_text() or "" for page in pdf_reader.pages
                )
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return None

    def create_vector_store(self):
        """Build the FAISS vector store from the PDF text.

        Returns:
            bool: True if the store was created, False if no usable text
            could be extracted from the PDF.
        """
        text = self.extract_text_from_pdf()
        # Treat whitespace-only text like missing text: there is nothing to index.
        if not text or not text.strip():
            return False

        chunks = self.text_splitter.split_text(text)
        if not chunks:
            # Nothing to embed; do not hand an empty list to the vector store.
            return False

        self.vector_store = FAISS.from_texts(chunks, self.embeddings)
        return True

    def initialize_qa_chain(self):
        """Initialize the QA chain with Groq's ChatGroq model.

        The API key is taken from ``groq_api_key`` given to the constructor,
        falling back to the ``GROQ_API_KEY`` environment variable.

        Returns:
            bool: True if the chain is ready, False if the vector store is
            missing, no API key is available, or the LLM could not be created.
        """
        if self.vector_store is None:
            print("Vector store not initialized. Run create_vector_store first.")
            return False

        api_key = self.groq_api_key or os.environ.get("GROQ_API_KEY")
        if not api_key:
            print(
                "Error initializing Groq LLM: GROQ_API_KEY not found. "
                "Set the environment variable or pass groq_api_key to PDFQASystem."
            )
            return False

        try:
            llm = ChatGroq(
                model="llama-3.3-70b-versatile",
                temperature=0,
                max_tokens=None,
                groq_api_key=api_key
            )
        except Exception as e:
            print(f"Error initializing Groq LLM: {e}")
            return False

        # "stuff" chain over the top 3 retrieved chunks.
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(
                search_kwargs={"k": 3}  # Return top 3 relevant chunks
            )
        )
        return True

    def query(self, question):
        """Ask the QA chain a question.

        Args:
            question (str): The question to answer from the PDF content.

        Returns:
            str | None: The answer text, or None if the chain is not
            initialized or the query failed (the error is printed).
        """
        if self.qa_chain is None:
            print("QA chain not initialized. Run initialize_qa_chain first.")
            return None

        try:
            # Chain.run is deprecated; invoke takes the question under
            # "query" and returns the answer under "result".
            output = self.qa_chain.invoke({"query": question})
            return output["result"]
        except Exception as e:
            print(f"Error processing query: {e}")
            return None

In [None]:
def main(pdf_path="/content/Machine Translation (1).pdf", questions=None, groq_api_key=None):
    """Run a small end-to-end demo: index a PDF and print answers.

    Args:
        pdf_path (str): Path to the PDF to index. Defaults to the example
            document used in this notebook.
        questions (iterable of str, optional): Questions to ask. When None,
            two generic example questions are used.
        groq_api_key (str, optional): Groq API key; when None, PDFQASystem
            falls back to the GROQ_API_KEY environment variable.
    """
    if questions is None:
        questions = (
            "What is the main topic of the document?",
            "Summarize the key points.",
        )

    qa_system = PDFQASystem(pdf_path, groq_api_key=groq_api_key)

    # Each setup step returns False on failure; stop early in that case.
    if not qa_system.create_vector_store():
        print("Failed to create vector store")
        return

    if not qa_system.initialize_qa_chain():
        print("Failed to initialize QA chain")
        return

    for question in questions:
        response = qa_system.query(question)
        # query() returns None on failure (and prints the error itself).
        if response:
            print(f"\nQ: {question}")
            print(f"A: {response}")

In [None]:
# Run the demo only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

  response = self.qa_chain.run(question)



Q: What is the main topic of the document?
A: The main topic of the document is Machine Translation (MT).

Q: Summarize the key points.
A: Here are the key points summarized:

**Types of Machine Translation:**

1. **Rule-based Machine Translation**: Uses built-in linguistic rules and bilingual dictionaries to translate specific content accurately.
2. **Statistical Machine Translation**: Uses machine learning to analyze large amounts of human translations and make predictions based on statistical likelihood.
3. **Neural Machine Translation**: Uses artificial intelligence and neural networks to learn languages and improve translation accuracy.

**Machine Translation Process:**

1. Input text is prepared and filtered.
2. The system is trained using examples of texts in multiple languages and their translations.
3. The system learns patterns and probabilities of word and phrase translations.
4. The system generates a translated version of the input text.
5. Additional adjustments may be m