<a href="https://colab.research.google.com/github/Jayavarshini1711/Q-Gen/blob/main/QGen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qU pdf4llm pymupdf transformers accelerate bitsandbytes tantivy gradio lancedb==0.20.0


### Libraries and Their Uses

| Library/Tool        | Use Case in Questify                                                                       |
|---------------------|----------------------------------------------------------------------|
| pdf4llm           | Extracts structured content (text, images, tables) from PDFs         |
| transformers      | Loads and runs LLMs (e.g., Mistral for question generation, BERT for classification) |
| accelerate        | Handles device placement and memory-efficient loading of large models (needed for `device_map`) |
| bitsandbytes      | Enables low-bit quantization for memory-efficient LLMs               |
| lancedb           | Stores SBERT embeddings for hybrid search of study content           |
| tantivy           | Provides fast keyword-based full-text indexing and search            |


| Model Name                     | Type                          | Layers | Max Seq Length | Use Case                              | Labels / Output      |
|-------------------------------|-------------------------------|--------|----------------|----------------------------------------|-----------------------|
| Mistral-7B-Instruct-v0.3      | MistralForCausalLM            | 32     | 32,768         | Question Generation                    | Text (Generated Qs)   |
| all-MiniLM-L6-v2              | BertModel                     | 6      | 512            | Sentence Embeddings for Retrieval      | Embeddings            |
| cross-encoder/ms-marco-TinyBERT-L-6 | BertForSequenceClassification | 6      | 512            | Passage Reranking                      | Relevance Score       |
| cip29/blooms_bert             | BertForSequenceClassification | 12     | 512            | Bloom’s Taxonomy Classification        | 6 Bloom’s Levels      |


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BertTokenizer, BertForSequenceClassification, BitsAndBytesConfig

# Quantization settings handed to the Mistral loader below so the 7B model is
# loaded with 4-bit weights instead of full precision.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Load the weights in 4-bit precision
    bnb_4bit_compute_dtype="float16",  # Dtype used for computation on the 4-bit weights
    bnb_4bit_use_double_quant=True,  # Enable bitsandbytes double quantization
    bnb_4bit_quant_type="nf4"  # Use the NF4 4-bit quantization type
)

# Tokenizer and quantized causal LM used by the question/answer generation
# functions later in the notebook. Both are module-level globals.
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

mistral_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3",
    quantization_config=bnb_config,
    device_map="cuda"  # Place the model on the CUDA device (a GPU runtime is required)
)

# Bloom's Taxonomy classifier: a BERT sequence classifier loaded with a
# 6-way head and moved to the GPU. Used by classify_blooms_taxonomy().
blooms_model_name = "cip29/blooms_bert"
blooms_tokenizer = BertTokenizer.from_pretrained(blooms_model_name)
blooms_model = BertForSequenceClassification.from_pretrained(blooms_model_name, num_labels=6).to("cuda")

In [None]:
!rm -rf /content/*.pdf

import pdf4llm
from google.colab import files
import numpy as np
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from transformers import AutoTokenizer

# Open (or create) a LanceDB database rooted at the Colab working directory.
db = lancedb.connect("/content")

# Embedding function from LanceDB's registry, backed by the MiniLM sentence
# embedding model on the GPU. PDFSchema uses it to declare the source and
# vector fields.
embedder = get_registry().get("huggingface").create(
    name='sentence-transformers/all-MiniLM-L6-v2',
    device="cuda"
)

# Tokenizer of the same embedding model; used only to measure and cut text
# into token-sized chunks, so chunk lengths are counted in the embedder's tokens.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


class PDFSchema(LanceModel):
    """Row schema of the ``pdf_data`` table: one row per text chunk of a PDF page."""

    text: str = embedder.SourceField()              # chunk text; declared as the embedder's source field
    vector: Vector(embedder.ndims()) = embedder.VectorField()
    page_name: str                                  # chunk identifier, filled with "<file_path>_chunk_<n>" by the chunker
    full_text: str                                   # text of the whole page the chunk was cut from
    page: int                                        # page number from the extractor metadata; used later to de-duplicate hits



# Open Colab's file-upload dialog. The keys of the returned mapping are later
# used as the PDF filenames to process (presumably the uploaded file names —
# confirm against the Colab `files.upload` docs).
uploaded = files.upload()
print(list(uploaded.keys()))


In [None]:
# Define chunking parameters (both measured in tokenizer tokens)
CHUNK_SIZE = 480  # maximum tokens per chunk
OVERLAP = 64      # tokens shared by two consecutive chunks


# Function to process and chunk text with a sliding window
def split_text_into_chunks(text, page_path, full_text, page_number,
                           chunk_size=CHUNK_SIZE, overlap=OVERLAP, encoder=None):
    """Split ``text`` into overlapping token windows and return one record per window.

    Args:
        text: Text to split.
        page_path: Prefix for the chunk identifier (``"<page_path>_chunk_<n>"``).
        full_text: Full page text stored alongside every chunk.
        page_number: Page number stored alongside every chunk.
        chunk_size: Maximum number of tokens per chunk.
        overlap: Number of tokens repeated at the start of the next chunk.
        encoder: Object exposing ``encode`` / ``decode`` like a Hugging Face
            tokenizer. Defaults to the module-level ``tokenizer``.

    Returns:
        A list of dicts with the keys ``text``, ``page_name``, ``full_text``
        and ``page``. Windows shorter than 10 tokens are dropped.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not in
            ``[0, chunk_size)`` (the window would never advance).
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError("chunk_size must be positive and 0 <= overlap < chunk_size")

    if encoder is None:
        encoder = tokenizer

    step = chunk_size - overlap
    input_ids = encoder.encode(text, truncation=False, add_special_tokens=False)
    total_tokens = len(input_ids)
    chunks = []

    for chunk_number, start in enumerate(range(0, total_tokens, step), 1):
        chunk_ids = input_ids[start:start + chunk_size]

        if len(chunk_ids) >= 10:  # Skip very small chunks
            chunks.append({
                "text": encoder.decode(chunk_ids, skip_special_tokens=True),
                "page_name": f"{page_path}_chunk_{chunk_number}",
                "full_text": full_text,
                "page": page_number,
            })

        # Once a window reaches the end of the text, any further window would
        # contain only overlap tokens that are already stored, so stop here
        # instead of emitting a redundant trailing chunk.
        if start + chunk_size >= total_tokens:
            break

    return chunks

def _parse_page_selection(page_input):
    """Turn a selection such as ``"1,3-5,7"`` into zero-based page indices.

    Blank parts (for example from a trailing comma or an empty answer) are
    ignored instead of crashing on ``int('')``. Repeated pages are kept only
    once, in first-seen order, so no page is extracted and stored twice.

    Raises:
        ValueError: If a part is neither an integer nor a ``start-end`` range.
    """
    indices = []
    for part in page_input.split(","):
        part = part.strip()
        if not part:
            continue
        if "-" in part:
            start, end = map(int, part.split("-"))
            indices.extend(range(start - 1, end))  # Convert to zero-based index
        else:
            indices.append(int(part) - 1)
    return list(dict.fromkeys(indices))


# Collect all entries
entries = []

# Process each uploaded file
uploaded_files = list(uploaded.keys())[:2]  # Limiting to 2 files for processing
for pdf_filename in uploaded_files:
    print(f"\nProcessing: {pdf_filename}")

    # Ask for page numbers or ranges
    page_input = input(f"Enter pages or ranges for {pdf_filename} (e.g., 1,3-5,7): ")

    # Parse user input into zero-based page indices
    selected_pages = _parse_page_selection(page_input)

    # Extract specified pages. An empty selection is passed as None, which is
    # the extractor's default for `pages` (i.e. no page restriction).
    selected_page_data = pdf4llm.to_markdown(
        pdf_filename, page_chunks=True, pages=selected_pages or None
    )

    # Process each page
    for page_data in selected_page_data:
        full_text = page_data["text"]
        page_path = page_data["metadata"]["file_path"]
        page_number = page_data["metadata"]["page"]

        if not full_text.strip():  # Skip empty pages
            continue

        # Split text into overlapping chunks with full page context
        chunks = split_text_into_chunks(full_text, page_path, full_text, page_number)

        # Add chunks to entries
        entries.extend(chunks)

# Store all entries in LanceDB. The table is always (re)created so later cells
# can rely on `tbl`, but rows are only added when there is something to add.
tbl = db.create_table("pdf_data", schema=PDFSchema, mode="overwrite")
if entries:
    tbl.add(entries)
    print("\nAll selected pages have been chunked and stored in LanceDB with full page context!")
else:
    print("\nNo text could be extracted from the selected pages; nothing was stored in LanceDB.")


In [None]:
from lancedb.rerankers import CrossEncoderReranker

# Initialize reranker
reranker = CrossEncoderReranker()

# User query
query = input("\nEnter your query: ")

# Create full-text search index on the 'text' field
tbl.create_fts_index("text", replace=True)

# Search and rerank
results = tbl.search(query, query_type="hybrid").rerank(reranker=reranker).limit(5).to_list()

# Dictionary to hold unique pages, keyed by (source file, page number).
# Keying by page number alone would drop page N of a second PDF whenever
# page N of the first PDF was already among the hits.
unique_pages = {}

for res in results:
    # Chunk ids are stored as "<file_path>_chunk_<n>"; strip the suffix to
    # recover the file the chunk came from.
    source = res.get("page_name", "").rsplit("_chunk_", 1)[0]
    unique_pages.setdefault((source, res.get("page")), res["full_text"])

# Final list of unique full_text values with page number. "source" is an
# additional key; consumers that only read "page" / "text" are unaffected.
final_full_texts = [
    {"page": page, "text": text, "source": source}
    for (source, page), text in unique_pages.items()
]

# Optional: Display them
print("\nUnique full_text entries by page:\n")
for i, entry in enumerate(final_full_texts, 1):
    print(f"[{i}] Page {entry['page']}:\n{entry['text'][:500]}...\n{'-'*80}")


In [None]:
import re
import csv
import torch
import torch.nn.functional as F
from google.colab import files

# Bloom's Taxonomy labels, keyed by class index 0-5 in the order listed.
# NOTE(review): assumes this order matches the classifier's output classes — confirm.
bloom_labels = dict(enumerate((
    "BT1 (Remembering)",
    "BT2 (Understanding)",
    "BT3 (Applying)",
    "BT4 (Analyzing)",
    "BT5 (Evaluating)",
    "BT6 (Creating)",
)))

# Utility functions
def extract_numbered_list(text):
    """Split ``text`` into the entries of a ``1. ...`` style numbered list.

    A new entry starts at every line that begins with digits, a dot and
    whitespace. Text before the first such line is returned as its own
    leading entry; entries that are empty after stripping are dropped.
    """
    entries = []
    for piece in re.split(r'\n(?=\d+\.\s)', text.strip()):
        cleaned = piece.strip()
        if cleaned:
            entries.append(cleaned)
    return entries

def split_number_and_text(item):
    """Separate a ``"<n>. <text>"`` entry into ``(n, text)``.

    Returns ``(None, stripped_item)`` when ``item`` does not begin with
    digits, a dot and at least one whitespace character.
    """
    parsed = re.match(r'^(\d+)\.\s+(.*)', item, re.DOTALL)
    if parsed is None:
        return None, item.strip()
    number, body = parsed.groups()
    return int(number), body.strip()

# Shared helpers for the two Mistral prompt functions below
def _fit_context_to_prompt_budget(render_prompt, context, max_prompt_tokens=4096, safety_margin=16):
    """Trim ``context`` so that ``render_prompt(context)`` fits the prompt budget.

    The tokenizer call in ``_complete_prompt`` truncates on the right, so an
    over-long context would otherwise cut off the end of the prompt (the
    instructions and the output heading). The context is shortened up front
    instead, so the scaffold around it is preserved.
    """
    scaffold_tokens = len(mistral_tokenizer(render_prompt(""))["input_ids"])
    # The margin absorbs small differences between tokenizing the pieces
    # separately and tokenizing the assembled prompt.
    budget = max(max_prompt_tokens - scaffold_tokens - safety_margin, 0)
    context_ids = mistral_tokenizer(context, add_special_tokens=False)["input_ids"]
    if len(context_ids) <= budget:
        return context
    return mistral_tokenizer.decode(context_ids[:budget], skip_special_tokens=True)


def _complete_prompt(prompt, max_new_tokens):
    """Sample a completion for ``prompt`` and return only the newly generated text."""
    # truncation stays on as a safety net; the context has already been fitted.
    inputs = mistral_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to("cuda")
    output = mistral_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=mistral_tokenizer.eos_token_id
    )
    # The returned sequence starts with the prompt tokens; decode only what
    # follows, so headings inside the prompt or context cannot confuse the
    # caller's post-processing.
    prompt_length = inputs["input_ids"].shape[-1]
    return mistral_tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)


# Mistral Question Generation
def generate_questions_with_mistral_bulk(pages_text, user_query):
    """Generate questions about ``user_query`` from ``pages_text``.

    Args:
        pages_text: Context text the questions should be based on. It is
            shortened if the full prompt would exceed 4096 tokens.
        user_query: Focus topic inserted into the prompt.

    Returns:
        The generated text (after the last ``### Questions:`` heading, should
        the model repeat it), stripped of surrounding whitespace.
    """
    def render_prompt(context):
        return f"""
You are an AI assistant specialized in question generation.
Your goal is to generate insightful questions based on the given context and user query.

Context:
{context}

User Query (Focus Topic): {user_query}

### Reasoning:
- Step 1: Identify key points and concepts from the context relevant to the query
- Step 2: Consider what types of questions best explore the topic of interest
- Step 3: Formulate meaningful and topic-specific questions

### Questions:
"""

    fitted_context = _fit_context_to_prompt_budget(render_prompt, pages_text)
    generated_text = _complete_prompt(render_prompt(fitted_context), max_new_tokens=2048)
    return generated_text.split("### Questions:")[-1].strip()


# Mistral Answer Generation
def generate_answer_key_with_mistral(questions_output, context):
    """Generate an answer key for ``questions_output`` using ``context``.

    Args:
        questions_output: Questions text inserted verbatim into the prompt.
        context: Context text the answers should be based on. It is shortened
            if the full prompt would exceed 4096 tokens; the questions are
            kept intact.

    Returns:
        The generated text (after the last ``### Answer Key:`` heading, should
        the model repeat it), stripped of surrounding whitespace.
    """
    def render_prompt(context_text):
        return f"""
You are an AI assistant specialized in answering technical questions.

Context:
{context_text}

Questions:
{questions_output}

### Answer Key:
"""

    fitted_context = _fit_context_to_prompt_budget(render_prompt, context)
    generated_text = _complete_prompt(render_prompt(fitted_context), max_new_tokens=4096)
    return generated_text.split("### Answer Key:")[-1].strip()

# Bloom's Classifier
def classify_blooms_taxonomy(question):
    """Classify ``question`` with the Bloom's Taxonomy BERT model.

    Returns:
        A tuple ``(label, probabilities)``: the ``bloom_labels`` entry for the
        highest-scoring class, and a dict mapping each of the six labels to
        its softmax probability rounded to 4 decimals.
    """
    encoded = blooms_tokenizer(question, return_tensors="pt", truncation=True, padding=True).to("cuda")
    with torch.no_grad():  # inference only, no gradients needed
        logits = blooms_model(**encoded).logits

    class_probs = F.softmax(logits, dim=1).squeeze().tolist()
    best_index = torch.argmax(logits, dim=1).item()
    best_label = bloom_labels[best_index]

    distribution = {}
    for class_index in range(6):
        distribution[bloom_labels[class_index]] = round(class_probs[class_index], 4)
    return best_label, distribution



def _index_numbered_items(items):
    """Map item number -> item text, ignoring entries without a leading ``<n>. ``."""
    numbered = {}
    for item in items:
        number, body = split_number_and_text(item)
        if number is not None:
            # Keep the first occurrence if the model repeats a number.
            numbered.setdefault(number, body)
    return numbered


def bulk_process_pages(pages_array, query, batch_size, output_csv="final_questions_classified.csv"):
    """Generate, answer and classify questions for pages, then save them as CSV.

    Pages are processed in batches of ``batch_size``. For each batch the page
    texts are joined, questions and an answer key are generated, and every
    question is paired with the answer that carries the same number. All pairs
    are classified with the Bloom's classifier, written to ``output_csv`` and
    the file is offered for download.

    Args:
        pages_array: Sequence of dicts that each contain a ``"text"`` entry.
        query: Focus topic passed to the question generator.
        batch_size: Number of pages combined into one generation call.
        output_csv: Path of the CSV file to write.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    all_qa_pairs = []

    for i in range(0, len(pages_array), batch_size):
        batch_pages = pages_array[i:i+batch_size]
        combined_text = "\n".join(page["text"] for page in batch_pages)
        print(f"\n📄 Processing Pages {i+1} to {min(i+batch_size, len(pages_array))}...")

        # Generate questions and answers
        questions = generate_questions_with_mistral_bulk(combined_text, query)
        answers = generate_answer_key_with_mistral(questions_output=questions, context=combined_text)

        # Pair by item number rather than by position: a preamble line or a
        # skipped number in either output would otherwise shift every later
        # pair, and two un-numbered preambles would be saved as a Q&A row.
        questions_by_number = _index_numbered_items(extract_numbered_list(questions))
        answers_by_number = _index_numbered_items(extract_numbered_list(answers))

        for q_num, q_text in questions_by_number.items():
            if q_num in answers_by_number:
                all_qa_pairs.append((q_num, q_text, answers_by_number[q_num]))
            else:
                print(f"⚠️ Mismatch: no answer found for Question {q_num}")

    # Write classified Q&A to single CSV
    with open(output_csv, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            "No", "Question","Bloom's Taxonomy Level", "Answer",
            "BT1 (Remembering)", "BT2 (Understanding)", "BT3 (Applying)",
            "BT4 (Analyzing)", "BT5 (Evaluating)", "BT6 (Creating)"
        ])

        # Rows are renumbered 1..N across batches; the per-batch number is
        # only needed for pairing.
        for idx, (_, question, answer) in enumerate(all_qa_pairs, 1):
            bloom_level, probs = classify_blooms_taxonomy(question)
            writer.writerow([
                idx, question, bloom_level, answer,
                probs["BT1 (Remembering)"], probs["BT2 (Understanding)"],
                probs["BT3 (Applying)"], probs["BT4 (Analyzing)"],
                probs["BT5 (Evaluating)"], probs["BT6 (Creating)"]
            ])

    print(f"\n✅ Saved {len(all_qa_pairs)} classified Q&A pairs to '{output_csv}'")
    files.download(output_csv)




In [None]:

# Ask for the focus topic and run the full generate -> answer -> classify ->
# CSV pipeline over the pages retrieved by the search cell, five pages per batch.
# NOTE(review): this prompts for a query again, so the pages in
# final_full_texts may have been retrieved with a different query — confirm
# that this is intended.
query = input("\nEnter your query: ")
bulk_process_pages(final_full_texts, query, batch_size=5)