In [5]:
!pip install PyPDF2 nltk gensim transformers sentence-transformers pyserini streamlit gradio



In [6]:
# Import necessary modules
import PyPDF2
import nltk
import gensim
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util
import pyserini
import streamlit as st
import gradio as gr
import os
import json

**Textbook Selection and Content Extraction:**

In [7]:
from google.colab import files
import io

# Upload PDFs through google.colab's upload helper.
# The returned mapping is consumed below as filename -> file content.
uploaded = files.upload()

# Extract text from PDFs
def extract_text_from_pdfs(uploaded_files):
    """Extract the plain text of every uploaded PDF.

    Args:
        uploaded_files: Mapping of filename -> raw PDF bytes. Each value is
            wrapped in ``io.BytesIO`` and handed to ``PyPDF2.PdfReader``.

    Returns:
        dict: filename -> text of all pages concatenated in page order
        (no separator is inserted between pages).
    """
    texts = {}
    for filename, file_content in uploaded_files.items():
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # `or ""` keeps a page that yields no text (None / empty) from
        # breaking the concatenation; join() avoids quadratic `+=` on
        # book-sized documents.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        texts[filename] = "".join(page_texts)
    return texts

# Run the extraction on the uploaded files; `texts` maps each filename to its full text.
texts = extract_text_from_pdfs(uploaded)


Saving [Data] Python Data Science Handbook (2017).pdf to [Data] Python Data Science Handbook (2017).pdf
Saving [ML] Introduction to Machine Learning with Python (2017) (1).pdf to [ML] Introduction to Machine Learning with Python (2017) (1).pdf
Saving [Python] Fluent Python (2015).pdf to [Python] Fluent Python (2015).pdf


**Hierarchical Tree-based Indexing:**

In [8]:
# Analyzing structure and creating a hierarchical index
def create_hierarchical_index(texts):
    """Build a book -> chapter -> section -> subsection index by keyword splitting.

    Each book's text is split on the literal word "Chapter", each chapter on
    "Section", and each section on "Subsection". Text that precedes the first
    "Chapter" of a book, or the first "Section" of a chapter, is discarded;
    chapters and sections are numbered from 1 in order of appearance.

    Args:
        texts: Mapping of book title -> full text.

    Returns:
        dict: title -> {"title": ..., "chapters": [{"chapter": n,
        "sections": [{"section": m, "subsections": [str, ...]}, ...]}, ...]}
    """

    def _index_chapter(chapter_no, chapter_text):
        # Piece 0 of the split is the text before the first "Section"; skip it.
        section_texts = chapter_text.split("Section")[1:]
        section_entries = [
            {"section": section_no, "subsections": section_text.split("Subsection")}
            for section_no, section_text in enumerate(section_texts, 1)
        ]
        return {"chapter": chapter_no, "sections": section_entries}

    index_by_title = {}
    for book_title, full_text in texts.items():
        # Piece 0 of the split is the front matter before the first "Chapter".
        chapter_texts = full_text.split("Chapter")[1:]
        index_by_title[book_title] = {
            "title": book_title,
            "chapters": [
                _index_chapter(chapter_no, chapter_text)
                for chapter_no, chapter_text in enumerate(chapter_texts, 1)
            ],
        }
    return index_by_title

# Index every extracted book.
hierarchical_index = create_hierarchical_index(texts)

# Serialize the index to hierarchical_index.json so it can be reloaded from disk later.
with open('hierarchical_index.json', 'w') as index_file:
    index_file.write(json.dumps(hierarchical_index))


**Retrieval Techniques**

In [1]:
!pip install pyserini==0.16.0



In [2]:
!pip install faiss-cpu



In [7]:
# Query expansion using WordNet synonyms from NLTK.
# The corpus has to be downloaded before nltk.corpus.wordnet can be queried.
nltk.download('wordnet')
from nltk.corpus import wordnet

def expand_query(query):
    """Expand a query with the WordNet synonyms of each of its words.

    Args:
        query: Free-text query; it is split on whitespace.

    Returns:
        list[str]: The original query words plus the lemma names of every
        WordNet synset of each word, without duplicates, in first-seen order.
        Underscores in lemma names are replaced by spaces so that multi-word
        lemmas can match ordinary text.
    """
    # A dict is used as an ordered set: stable output order, no duplicates.
    terms = {}
    for word in query.split():
        # Keep the user's own word even when WordNet has no entry for it,
        # otherwise the term silently vanishes from the expanded query.
        terms.setdefault(word, None)
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                terms.setdefault(lemma.name().replace("_", " "), None)
    return list(terms)

# Implement BM25 using Pyserini
from pyserini.search import SimpleSearcher

def bm25_search(query, index_dir='indexes'):
    """Search a Pyserini index and return the raw content of each hit.

    Args:
        query: Query passed to ``SimpleSearcher.search``.
        index_dir: Directory of the index to open (a new searcher is
            created on every call).

    Returns:
        list: The ``raw`` attribute of every hit, in the order the searcher
        returned them.
    """
    lucene_searcher = SimpleSearcher(index_dir)
    return [hit.raw for hit in lucene_searcher.search(query)]

# Implement BERT-based retrieval using SentenceTransformers
# The encoder is created once at module level; bert_search reads this global on every call.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

def bert_search(query, documents):
    """Rank documents by embedding cosine similarity to the query.

    Args:
        query: Query text, encoded with the module-level ``model``.
        documents: List of document strings to rank.

    Returns:
        list: The entries of ``documents`` ordered from highest to lowest
        similarity score. An empty list when ``documents`` is empty.
    """
    # Nothing to rank -- and avoids encoding/comparing an empty batch, which
    # happens when the upstream search produced no hits.
    if not documents:
        return []
    query_embedding = model.encode(query, convert_to_tensor=True)
    doc_embeddings = model.encode(documents, convert_to_tensor=True)
    # First row of the similarity matrix: the query scored against each document.
    scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0]
    # Turn the sorted index tensor into plain ints before indexing the list.
    ranking = scores.argsort(descending=True).tolist()
    return [documents[i] for i in ranking]

# Combining BM25 and BERT
def hybrid_search(query, index_dir='indexes', documents=None):
    """Retrieve candidates and re-rank them with ``bert_search``.

    Args:
        query: Query string.
        index_dir: Index directory handed to ``bm25_search`` when it is used.
        documents: Optional list of candidate documents. When it is missing
            or empty, the candidates come from ``bm25_search`` instead.

    Returns:
        list: The candidates in the order produced by ``bert_search``.
    """
    # Only query the index when the caller gave no candidates: previously the
    # BM25 search always ran (and required an index on disk) even though its
    # result was discarded whenever `documents` was supplied.
    candidates = documents if documents else bm25_search(query, index_dir)
    return bert_search(query, candidates)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

**Multi-document/Topic/Section-based RAG:**

In [8]:
# Develop RAG system
def rag_system(query, hierarchical_index):
    """Collect the indexed subsections that are relevant to a query.

    The query is expanded with ``expand_query``, the expanded terms are sent
    to ``hybrid_search`` as one string, and for every retrieved document whose
    text contains a book title, that book's subsections are scanned for any
    of the terms.

    Args:
        query: Free-text user query.
        hierarchical_index: Mapping of title -> book index with the nested
            'chapters' -> 'sections' -> 'subsections' layout.

    Returns:
        list[str]: Matching subsections in discovery order, each at most once.
    """
    expanded_query = expand_query(query)
    # Fall back to the raw query words if the expansion came back empty.
    match_terms = expanded_query or query.split()
    if not match_terms:
        return []

    # The retrievers expect a single query string, not a list of terms.
    relevant_docs = hybrid_search(" ".join(match_terms))

    # Traverse hierarchical index to find relevant sections
    relevant_sections = []
    already_added = set()
    for doc in relevant_docs:
        for title, book_index in hierarchical_index.items():
            # A hit without content cannot mention any title.
            if not doc or title not in doc:
                continue
            for chapter in book_index['chapters']:
                for section in chapter['sections']:
                    for subsection in section['subsections']:
                        # Several documents can point at the same book; do not
                        # repeat a subsection that was already collected.
                        if subsection in already_added:
                            continue
                        if any(term in subsection for term in match_terms):
                            already_added.add(subsection)
                            relevant_sections.append(subsection)

    return relevant_sections


**Question Answering:**

In [9]:
from transformers import pipeline

# Initialize QA pipeline
# Built once at module level; question_answering() calls this object for every query.
qa_pipeline = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad')

def question_answering(query, context):
    """Answer ``query`` from ``context`` with the module-level ``qa_pipeline``.

    Args:
        query: The question, passed as the pipeline's ``question`` argument.
        context: The text to search, passed as the ``context`` argument.

    Returns:
        Whatever ``qa_pipeline`` returns for this question/context pair.
    """
    qa_inputs = {"question": query, "context": context}
    return qa_pipeline(**qa_inputs)




config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

**User Interface:**

In [10]:
# Streamlit UI
def main():
    """Render the Streamlit page: query box, retrieved sections, extracted answer.

    When "Search" is pressed with a non-empty query, the index is loaded from
    hierarchical_index.json, the matching sections are listed, and the QA
    model is run over the joined sections. If nothing matches, a notice is
    shown and the QA step is skipped.
    """
    st.title("Textbook Content Retrieval and QA System")

    query = st.text_input("Enter your query:")
    # Nothing to do until the button is pressed with a query typed in.
    if not st.button("Search") or not query:
        return

    with open('hierarchical_index.json') as f:
        hierarchical_index = json.load(f)
    relevant_sections = rag_system(query, hierarchical_index)

    st.write("Relevant Sections:")
    if not relevant_sections:
        # The QA pipeline cannot run on an empty context, so stop here
        # instead of letting it raise.
        st.write("No relevant sections found.")
        return
    for section in relevant_sections:
        st.write(section)

    context = " ".join(relevant_sections)
    answer = question_answering(query, context)
    st.write("Answer:")
    st.write(answer['answer'])

# Entry point. NOTE(review): the log output recorded for this cell shows
# Streamlit warning that it was not launched via `streamlit run`; the UI is
# presumably meant to be saved as a script and started that way -- confirm.
if __name__ == "__main__":
    main()


2024-07-24 11:00:21.414 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-07-24 11:00:21.417 Session state does not function when running a script without `streamlit run`
