In [2]:
import os
from pdfminer.high_level import extract_text

def extract_text_from_pdf(pdf_path):
    try:
        return extract_text(pdf_path)
    except FileNotFoundError as e:
        print(f"File not found: {pdf_path}")
        raise e

def save_text_to_file(text, output_path):
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(text)

# Specify the correct path to your PDF file
pdf_file = "The-Discovery-Of-India-Jawaharlal-Nehru (1).pdf"
output_file = "The-Discovery-Of-India-Jawaharlal-Nehru (1).txt"

if not os.path.exists(pdf_file):
    print(f"PDF file does not exist: {pdf_file}")
else:
    text = extract_text_from_pdf(pdf_file)
    save_text_to_file(text, output_file)
    print(f"Extracted and saved text from {pdf_file} to {output_file}")


Extracted and saved text from The-Discovery-Of-India-Jawaharlal-Nehru (1).pdf to The-Discovery-Of-India-Jawaharlal-Nehru (1).txt


In [3]:
class TreeNode:
    def __init__(self, identifier, content=None):
        self.identifier = identifier
        self.content = content
        self.children = []

    def add_child(self, node):
        self.children.append(node)

class HierarchicalTree:
    def __init__(self, root_content):
        self.root = TreeNode("root", root_content)

    def add_node(self, parent_id, identifier, content):
        parent_node = self.find_node(self.root, parent_id)
        if parent_node:
            new_node = TreeNode(identifier, content)
            parent_node.add_child(new_node)

    def find_node(self, current_node, identifier):
        if current_node.identifier == identifier:
            return current_node
        for child in current_node.children:
            result = self.find_node(child, identifier)
            if result:
                return result
        return None






In [4]:
def build_tree_from_text(text):
    tree = HierarchicalTree("Textbook Root")
    chapters = text.split("Chapter ")
    for i, chapter in enumerate(chapters[1:], 1):
        chapter_id = f"Chapter_{i}"
        tree.add_node("root", chapter_id, f"Chapter {chapter_id} content")
        sections = chapter.split("Section ")
        for j, section in enumerate(sections[1:], 1):
            section_id = f"{chapter_id}_Section_{j}"
            tree.add_node(chapter_id, section_id, f"Section {section_id} content")
    return tree

In [5]:
# Example usage:
with open("The-Discovery-Of-India-Jawaharlal-Nehru (1).txt", "r", encoding="utf-8") as file:
    text = file.read()

tree = build_tree_from_text(text)



In [6]:
from nltk.corpus import wordnet
from transformers import AutoTokenizer, AutoModel


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")




In [7]:
import numpy as np
def expand_query(query):
    synonyms = set()
    for word in query.split():
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name())
    return list(synonyms)

def embed_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

In [26]:
from transformers import pipeline
from rank_bm25 import BM25Okapi
import numpy as np

# Initialize a transformer model for question answering
qa_pipeline = pipeline("question-answering")

# Example documents
documents = [
    "The Indian independence movement was a series of historic events with the ultimate aim of ending British rule in India.",
    "The movement began in the 1850s with the first major uprising against British authority."
]

# Initialize BM25
def initialize_bm25(docs):
    tokenized_docs = [doc.split() for doc in docs]
    bm25 = BM25Okapi(tokenized_docs)
    return bm25

def retrieve_docs(query, bm25, docs):
    tokenized_query = query.split()
    scores = bm25.get_scores(tokenized_query)
    ranked_indices = np.argsort(scores)[::-1]
    return [docs[i] for i in ranked_indices]

def answer_question(query, retrieved_docs):
    answers = []
    for doc in retrieved_docs:
        result = qa_pipeline(question=query, context=doc)
        answers.append(result['answer'])
    return ' '.join(answers)


bm25 = initialize_bm25(documents)
query = "What major thing is done in 1850"
retrieved_docs = retrieve_docs(query, bm25, documents)
answer = answer_question(query, retrieved_docs)
print(answer)


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [31]:
# Example usage
from transformers import RagTokenizer, RagSequenceForGeneration

# Save the model and tokenizer
rag_tokenizer.save_pretrained("./rag_tokenizer")
rag_model.save_pretrained("./rag_model")


Non-default generation parameters: {'max_length': 50, 'min_length': 1, 'num_beams': 4, 'bad_words_ids': [[0, 0]], 'forced_eos_token_id': 2}


In [20]:
import streamlit as st
from transformers import pipeline
from rank_bm25 import BM25Okapi
import numpy as np

# Initialize BM25 and QA pipeline
def initialize_bm25(docs):
    tokenized_docs = [doc.split() for doc in docs]
    return BM25Okapi(tokenized_docs)

def retrieve_docs(query, bm25, docs):
    tokenized_query = query.split()
    scores = bm25.get_scores(tokenized_query)
    ranked_indices = np.argsort(scores)[::-1]
    return [docs[i] for i in ranked_indices]

qa_pipeline = pipeline("question-answering")

def answer_question(query, retrieved_docs):
    answers = []
    for doc in retrieved_docs:
        result = qa_pipeline(question=query, context=doc)
        answers.append(result['answer'])
    return ' '.join(answers)

# Streamlit UI
def main():
    st.title("Textbook Content Search and Q&A")

    # Input from user
    query = st.text_input("Enter your query:")

    if st.button("Get Answer"):
        if query:
            documents = [
                "The Indian independence movement was a series of historic events with the ultimate aim of ending British rule in India.",
                "The movement began in the 1850s with the first major uprising against British authority."
            ]

            # Process query
            bm25 = initialize_bm25(documents)
            retrieved_docs = retrieve_docs(query, bm25, documents)
            answer = answer_question(query, retrieved_docs)

            # Display results
            st.write("**Retrieved Documents:**")
            for doc in retrieved_docs:
                st.write(f"- {doc}")

            st.write("**Answer:**")
            st.write(answer)
        else:
            st.write("Please enter a query.")

if __name__ == "__main__":
    main()


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
2024-07-21 14:58:18.594 
  command:

    streamlit run c:\Python37\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
