<a href="https://colab.research.google.com/github/GunnamareddySusmitha/sithafal/blob/main/sithafal_ipynd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import fitz  # PyMuPDF for handling PDF files
from sentence_transformers import SentenceTransformer
import numpy as np

class PDFQuestionAnswering:
    """Retrieval-style question answering over the text of a PDF.

    The PDF text is split into sentences; a question is answered by returning
    the sentence whose embedding has the highest cosine similarity to the
    question's embedding.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-embedding model.

        Args:
            model_name: Name passed to ``SentenceTransformer``. Defaults to
                the model the class has always used.
        """
        self.model = SentenceTransformer(model_name)

    def extract_text_from_pdf(self, pdf_path):
        """Return the concatenated text of every page in the PDF.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            The page texts joined in page order, or ``""`` if the file could
            not be read (the error is printed, not raised).
        """
        try:
            # The context manager closes the document even if a page fails
            # to render its text part-way through.
            with fitz.open(pdf_path) as pdf_document:
                return "".join(page.get_text() for page in pdf_document)
        except Exception as e:
            # Best-effort by design: callers treat "" as "nothing to read".
            print(f"Error reading PDF: {e}")
            return ""

    def split_into_sentences(self, text):
        """Split text on '.' into trimmed, non-empty sentences.

        Args:
            text: The text to split.

        Returns:
            A list of sentence strings with surrounding whitespace removed.
            Fragments that are empty or whitespace-only (such as the piece
            after the final '.') are dropped so they can never be selected
            as an answer.
        """
        trimmed = (fragment.strip() for fragment in text.split('.'))
        return [sentence for sentence in trimmed if sentence]

    def find_answer(self, question, sentences):
        """Return the sentence most similar to the question.

        Args:
            question: The question text.
            sentences: Candidate sentences to rank.

        Returns:
            The best-matching sentence, or ``"Unable to find an answer."``
            when there are no candidates or the ranking fails.
        """
        # Nothing to rank: answer directly instead of relying on max()
        # raising on an empty sequence.
        if not sentences:
            return "Unable to find an answer."

        try:
            question_embedding = self.model.encode([question])[0]
            sentence_embeddings = self.model.encode(sentences)

            scored = (
                (sentence, self.cosine_similarity(question_embedding, embedding))
                for sentence, embedding in zip(sentences, sentence_embeddings)
            )
            best_sentence, _ = max(scored, key=lambda pair: pair[1])
            return best_sentence
        except Exception as e:
            # Best-effort by design: report the problem and keep the
            # interactive session alive.
            print(f"Error in finding answer: {e}")
            return "Unable to find an answer."

    @staticmethod
    def cosine_similarity(vec1, vec2):
        """Compute the cosine similarity between two vectors.

        Returns ``0.0`` when either vector has zero norm, so the result is
        never NaN and rankings built on it stay well-defined.
        """
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            return 0.0
        return np.dot(vec1, vec2) / denominator


def main():
    """Run an interactive console session that answers questions about a PDF.

    Prompts for a PDF path, extracts and splits its text, then answers each
    entered question until the user types 'quit' (case-insensitive).
    """
    qa_system = PDFQuestionAnswering()

    # Ask where the document lives and pull its text out.
    document_path = input("Enter the path to your PDF file: ").strip()
    document_text = qa_system.extract_text_from_pdf(document_path)
    if not document_text:
        print("No text found in the PDF.")
        return

    candidates = qa_system.split_into_sentences(document_text)

    # Prompt once up front and again at the end of each pass, so the loop
    # condition alone decides when the session ends.
    prompt = "\nEnter your question (or type 'quit' to exit): "
    query = input(prompt).strip()
    while query.lower() != 'quit':
        reply = qa_system.find_answer(query, candidates)
        print(f"Answer: {reply}")
        query = input(prompt).strip()

    print("Exiting the program.")


# Start the interactive session only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Answer: The Solar System 
 
Our solar system consists of the Sun and everything that orbits around it
