
 # AI Agent: Textbook to Notebook Notes Generator 
 
This AI agent processes textbook PDFs, extracts content, generates summarized notes using LLaMA3,
 evaluates quality using GPT-4o-mini, and outputs clean PDF notes without markdown symbols.


In [29]:
# Install and set up Ollama with LLaMA3 model
!ollama pull llama3:8b  # Download the LLaMA3 8B model for local use
!ollama serve           # Start the Ollama server to serve the model


Error: accepts 1 arg(s), received 10
Error: accepts 0 arg(s), received 9


In [30]:
# Import required libraries and dependencies
import os
import json
import gradio as gr      # For creating web interface
import textwrap
import asyncio
import threading
from concurrent.futures import ThreadPoolExecutor

# Try importing PyMuPDF for PDF text extraction (alternative to PyPDF2)
try:
    import fitz  # PyMuPDF
except Exception:
    fitz = None

from dotenv import load_dotenv  # For loading environment variables
from openai import AsyncOpenAI   # For interacting with AI models
from pydantic import BaseModel  # For data validation
from fpdf import FPDF, XPos, YPos  # For PDF generation with modern positioning
from datetime import datetime
import re  # For regular expressions to clean text


In [31]:
# Load environment variables from a .env file into the process environment.
# The next cell reads OPENAI_API_KEY and GOOGLE_API_KEY from that environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)


True

In [32]:
# Retrieve and validate API keys from environment variables
openai_api_key = os.getenv("OPENAI_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")

# Report only whether each key is present. No part of a secret is echoed,
# because cell outputs are stored in the notebook file and are easily shared.

# Check if OpenAI API key is available
if openai_api_key:
    print("OpenAI API Key found.")
else:
    print("OpenAI API Key not found. Please set it in your .env file.")

# Check if Google API key is available
if google_api_key:
    print("Google API Key found.")
else:
    print("Google API Key not found. Please set it in your .env file.")

OpenAI API Key found, starting with: sk-proj-...
Google API Key found, starting with: AIzaSyDn...


In [33]:
# Initialize AI clients for different services.
# Both clients are module-level objects that the async functions below reuse.
# NOTE(review): run_async_in_thread creates and closes a fresh event loop on
# every call while these clients live for the whole session - confirm that the
# clients' underlying HTTP connections are safe to reuse across event loops.

# Ollama client for the local LLaMA3 model (Ollama's OpenAI-compatible endpoint)
ollama_client = AsyncOpenAI(
    base_url="http://localhost:11434/v1",  # Local Ollama server
    api_key="ollama"  # Dummy API key for Ollama
)

# GPT-4o-mini client for quality evaluation (using OpenAI API).
# Uses the OpenAI key read from the environment in the previous cell.
gpt_eval_client = AsyncOpenAI(
    api_key=openai_api_key,
    # No base_url given: the client's default OpenAI endpoint is used
)

In [34]:
# Define data model for evaluation results using Pydantic
# This ensures structured response from the quality evaluator
class Evaluation(BaseModel):
    """Structured verdict about one set of generated notes.

    Instances are built in evaluate_notes from the evaluator's JSON reply, and
    also as the fallback result when the evaluation call itself fails.
    """
    is_acceptable: bool  # True when the notes are accepted as-is (no retry is triggered)
    feedback: str        # Evaluator's comments; fed back into generate_notes on a retry

In [35]:
async def generate_notes(text_chunk: str, retries=2, feedback="", previous_notes="") -> str:
    """
    Generate structured notes in clean text format without markdown symbols using LLaMA3.

    This function:
    1. Sends the text chunk to the LLaMA3 model for summarization
    2. Explicitly instructs the model to avoid markdown symbols
    3. Cleans any remaining markdown from the output
    4. Evaluates the notes and regenerates them with the evaluator's feedback
       (and the rejected notes) while retries remain

    Args:
        text_chunk (str): The portion of textbook text to summarize
        retries (int): Number of retry attempts if quality check fails. The
            result of the last allowed attempt is returned without evaluation.
        feedback (str): Feedback from a previous evaluation to improve notes
        previous_notes (str): The notes the feedback refers to. When given
            together with feedback they are included in the prompt so the
            model can actually revise them. Optional for backward compatibility.

    Returns:
        str: Clean, formatted notes without markdown symbols. If the model call
        fails, a string starting with "Error generating notes:" is returned
        instead of raising.
    """
    # System prompt that explicitly forbids markdown and requests clean text
    system_prompt = (
        "You are an expert academic assistant. "
        "Produce clear, well-structured notes WITHOUT any markdown formatting. "
        "DO NOT use #, *, -, **, or any other markdown symbols. "
        "Use plain text with clear headings, bullet points using simple indentation, and concise summaries. "
        "Focus on key ideas, definitions, and concepts from the provided text. "
        "Format the content in a way that will look good when converted to PDF."
    )

    # A negative retry count behaves like zero: generate once, no evaluation.
    attempts_left = max(retries, 0)

    while True:
        # Build user prompt with optional feedback for retries
        if feedback:
            # Without the rejected notes the model has nothing to "improve",
            # so include them whenever they are available.
            previous_section = f"Previous notes:\n{previous_notes}\n\n" if previous_notes else ""
            user_prompt = (
                f"Improve the previous notes using this feedback:\n{feedback}\n\n"
                f"{previous_section}"
                f"Original text:\n{text_chunk}\n\n"
                f"IMPORTANT: Do not use any markdown symbols like #, *, -, ** in your response."
            )
        else:
            user_prompt = (
                f"Generate concise academic notes in clean text format for the following text:\n{text_chunk}\n\n"
                f"IMPORTANT: Do not use any markdown symbols like #, *, -, ** in your response. Use plain text only."
            )

        # Prepare messages for the AI model
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        try:
            # Send request to LLaMA3 model via Ollama
            response = await ollama_client.chat.completions.create(
                model="llama3:8b",
                messages=messages,
            )
            # The content can be None; treat that as empty notes instead of
            # letting the cleaner raise and be misreported as an Ollama error.
            notes = clean_markdown_symbols(response.choices[0].message.content or "")
        except Exception as e:
            print(f"❌ Ollama generation error: {e}")
            return f"Error generating notes: {e}"

        # Out of retries: return the latest notes without another evaluation
        if attempts_left <= 0:
            return notes

        # Quality control: evaluate notes and retry if needed
        evaluation = await evaluate_notes(text_chunk, notes)
        if evaluation.is_acceptable:
            print("✅ Notes passed evaluation.")
            return notes

        print(f"🔁 Retrying with feedback: {evaluation.feedback}")
        feedback = evaluation.feedback
        previous_notes = notes
        attempts_left -= 1

In [36]:
def clean_markdown_symbols(text: str) -> str:
    """
    Remove markdown formatting symbols while preserving the actual content.

    Line-level constructs (code fences, horizontal rules, headers, blockquotes,
    list markers) are handled BEFORE inline emphasis. Otherwise the `*` of a
    list marker or a `***` rule is consumed by the emphasis patterns and stray
    symbols are left behind.

    Emphasis markers are only removed when they wrap text directly
    (`*word*`, `_word_`), so arithmetic such as `2 * 3 * 4` and identifiers
    such as `snake_case_name` are left untouched.

    Args:
        text (str): Text containing markdown symbols (None or "" is accepted)

    Returns:
        str: Clean text without markdown symbols, stripped of leading and
        trailing whitespace
    """
    if not text:
        return ""

    # Drop code-fence lines (``` or ```lang); the code between them is kept
    text = re.sub(r'^[ \t]*`{3,}[^\n`]*$', '', text, flags=re.MULTILINE)

    # Remove horizontal rules (---, ***, ___, also spaced like "- - -")
    text = re.sub(r'^[ \t]*([*\-_])(?:[ \t]*\1){2,}[ \t]*$', '', text, flags=re.MULTILINE)

    # Remove headers (#, ##, ###, etc.)
    text = re.sub(r'^#+[ \t]*', '', text, flags=re.MULTILINE)

    # Remove blockquote markers (>)
    text = re.sub(r'^>[ \t]*', '', text, flags=re.MULTILINE)

    # Replace unordered list markers (-, *, +) with two spaces, keeping any
    # existing indentation so nesting levels stay distinguishable
    text = re.sub(r'^([ \t]*)[*\-+][ \t]+', r'\1  ', text, flags=re.MULTILINE)

    # Replace numbered list markers (1., 2., etc.) the same way
    text = re.sub(r'^([ \t]*)\d+\.[ \t]+', r'\1  ', text, flags=re.MULTILINE)

    # Remove bold formatting: **bold** / __bold__ becomes bold
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'(?<![\w_])__(?!\s)(.+?)(?<!\s)__(?![\w_])', r'\1', text)

    # Remove italic formatting: *italic* / _italic_ becomes italic.
    # The markers must hug the text and must not sit inside a word.
    text = re.sub(r'(?<![\w*])\*(?!\s)(.+?)(?<!\s)\*(?![\w*])', r'\1', text)
    text = re.sub(r'(?<![\w_])_(?!\s)(.+?)(?<!\s)_(?![\w_])', r'\1', text)

    # Remove inline code markers (`code`)
    text = re.sub(r'`{1,3}(.*?)`{1,3}', r'\1', text)

    # Clean up excessive empty lines for better formatting
    text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)

    return text.strip()

In [37]:
async def evaluate_notes(text_chunk: str, notes: str) -> Evaluation:
    """
    Ask GPT-4o-mini whether the generated notes are good enough.

    The evaluator receives the source text and the notes in a single user
    message and is asked to answer in JSON with the keys `is_acceptable`
    and `feedback`, which are loaded into an Evaluation.

    Args:
        text_chunk (str): Original textbook text
        notes (str): Generated notes to evaluate

    Returns:
        Evaluation: The evaluator's verdict. If the request or the parsing
        fails for any reason, an accepting Evaluation carrying the error text
        is returned so that processing is not blocked.
    """
    instructions = (
        "You are a quality evaluator. Check if the notes correctly and clearly summarize the text "
        "and are well-structured for PDF format. Also check that they don't contain markdown symbols. "
        "Respond in JSON with keys: is_acceptable (bool) and feedback (string)."
    )
    # Sections are separated by one blank line in the final prompt
    prompt_sections = [
        instructions,
        f"--- Original Text ---\n{text_chunk}",
        f"--- Notes ---\n{notes}",
    ]
    request_options = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": "\n\n".join(prompt_sections)}],
        # Request a JSON object so the reply can be parsed directly
        "response_format": {"type": "json_object"},
    }

    try:
        completion = await gpt_eval_client.chat.completions.create(**request_options)
        verdict = json.loads(completion.choices[0].message.content)
        return Evaluation(**verdict)
    except Exception as err:
        print(f"⚠️ GPT-4o-mini evaluation error: {err}")
        # Best effort: accept the notes rather than stall the pipeline
        return Evaluation(is_acceptable=True, feedback=f"Evaluation failed: {err}")

In [38]:
def chunk_text(text: str, max_chars: int = 2500) -> list:
    """
    Break a long text into pieces of at most roughly `max_chars` characters.

    Splitting happens at whitespace only: a single word longer than
    `max_chars` is kept whole and therefore yields an oversized chunk.
    Whitespace characters inside a chunk (including newlines) are kept as-is.

    Args:
        text (str): Full textbook text
        max_chars (int): Maximum characters per chunk

    Returns:
        list: List of text chunks (empty when the text has no content)
    """
    splitter = textwrap.TextWrapper(
        width=max_chars,
        break_long_words=False,
        replace_whitespace=False,
    )
    return splitter.wrap(text)

In [39]:
def sanitize_text(text: str) -> str:
    """
    Make text safe for FPDF's built-in fonts, which only cover Latin-1.

    Processing happens in two steps:
    1. Common typographic characters (bullets, dashes, curly quotes, arrows,
       mathematical signs) are replaced by readable ASCII equivalents.
    2. Every remaining character outside Latin-1 is reduced through Unicode
       compatibility decomposition (so ligatures such as "fi" become plain
       letters and accented letters lose only their accent). Whatever still
       cannot be represented becomes "?". Stand-alone combining marks are
       dropped.

    Characters that are already valid Latin-1 (for example "é") are kept.

    Args:
        text (str): Text containing potential problematic characters

    Returns:
        str: Text in which every character is encodable as Latin-1
    """
    import unicodedata

    # Dictionary mapping Unicode characters to ASCII equivalents
    replacements = {
        '•': '-',      # bullet to hyphen
        '–': '-',      # en dash to hyphen
        '—': '-',      # em dash to hyphen
        '“': '"',      # left double quote to straight quote
        '”': '"',      # right double quote to straight quote
        '‘': "'",      # left single quote to straight quote
        '’': "'",      # right single quote to straight quote
        '…': '...',    # ellipsis to three dots
        '→': '->',     # right arrow to ASCII
        '←': '<-',     # left arrow to ASCII
        '≥': '>=',     # greater than or equal to ASCII
        '≤': '<=',     # less than or equal to ASCII
        '×': 'x',      # multiplication sign to letter x
        '÷': '/',      # division sign to slash
        '±': '+/-',    # plus-minus to ASCII representation
    }

    # Apply all replacements in a single pass
    text = text.translate(str.maketrans(replacements))

    safe_parts = []
    for char in text:
        if ord(char) < 256:
            # Already representable in Latin-1
            safe_parts.append(char)
            continue
        # Decompose (e.g. ligature -> letters) and drop combining accents
        decomposed = unicodedata.normalize("NFKD", char)
        base_chars = "".join(c for c in decomposed if not unicodedata.combining(c))
        # Anything still outside Latin-1 is replaced by "?"
        safe_parts.append(base_chars.encode("latin-1", "replace").decode("latin-1"))

    return "".join(safe_parts)

In [40]:
def create_pdf_file(notes_text: str, source_filename: str) -> str:
    """
    Convert clean text notes into a formatted PDF document.

    Each line is classified using its leading indentation, measured BEFORE
    the line is stripped:
    - Heading (bold 14): longer than 3 characters and either ALL CAPS, ending
      with ':', or shorter than 50 characters with less than two spaces of
      indentation.
    - Sub-heading / major bullet (bold 12): indented by exactly two spaces.
    - Sub-bullet (regular 11, prefixed with "- " and indented): indented by
      three or more spaces.
    - Paragraph (regular 11): everything else.
    Empty lines add vertical space. All text is written with multi_cell so
    long lines wrap instead of running off the page.

    A footer naming the source file is written at the bottom of the last page.

    Args:
        notes_text (str): Clean notes text without markdown
        source_filename (str): Original textbook filename for naming output

    Returns:
        str: Path to the generated PDF file, or to a plain-text fallback file
        (same base name, "_notes.txt") if building or saving the PDF fails
    """
    def pdf_safe(value: str) -> str:
        # The built-in helvetica font only covers Latin-1; replace anything
        # sanitize_text leaves behind so rendering cannot fail on encoding.
        return sanitize_text(value).encode("latin-1", "replace").decode("latin-1")

    # Generate output filename based on source textbook
    source_base = os.path.splitext(source_filename)[0]
    title = os.path.splitext(os.path.basename(source_filename))[0].replace('_', ' ').title()
    output_filename = f"{source_base}_notes.pdf"

    # After each written block, continue at the left margin on the next line
    next_line = {"new_x": XPos.LMARGIN, "new_y": YPos.NEXT}

    try:
        # Initialize PDF document with auto page break
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()

        # Title section - centered and bold (wrapped in case of long file names)
        pdf.set_font("helvetica", "B", 16)
        pdf.multi_cell(0, 10, pdf_safe(f"Academic Notes: {title}"), align="C", **next_line)
        pdf.ln(5)

        # Metadata section - italic and centered
        pdf.set_font("helvetica", "I", 10)
        pdf.cell(0, 8, f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M')}", align="C", **next_line)
        pdf.ln(15)

        # Process notes content line by line
        pdf.set_font("helvetica", "", 11)
        for raw_line in notes_text.split('\n'):
            expanded = raw_line.expandtabs(4)
            content = pdf_safe(expanded.strip())
            if not content:
                pdf.ln(5)  # Add space for empty lines
                continue

            # Indentation must be measured on the unstripped line; stripping
            # first would make every indentation-based branch unreachable.
            indent = len(expanded) - len(expanded.lstrip())

            is_heading = len(content) > 3 and (
                content.isupper()                         # ALL CAPS text
                or content.endswith(':')                  # Ends with colon
                or (len(content) < 50 and indent < 2)     # Short line without indentation
            )

            if is_heading:
                # Main heading - bold and larger font
                pdf.set_font("helvetica", "B", 14)
                pdf.multi_cell(0, 10, content, **next_line)
                pdf.set_font("helvetica", "", 11)
                pdf.ln(3)
            elif indent == 2:
                # Sub-heading or major bullet point - bold and medium font
                pdf.set_font("helvetica", "B", 12)
                pdf.multi_cell(0, 8, content, **next_line)
                pdf.set_font("helvetica", "", 11)
                pdf.ln(2)
            elif indent >= 3:
                # Indented content (sub-bullet points) - normal font, shifted right
                pdf.set_font("helvetica", "", 11)
                pdf.set_x(pdf.l_margin + 10)
                pdf.multi_cell(0, 5, f"- {content}", **next_line)
                pdf.ln(1)
            else:
                # Regular paragraph - normal font, wrapped
                pdf.set_font("helvetica", "", 11)
                pdf.multi_cell(0, 5, content, **next_line)
                pdf.ln(2)

        # Add footer with source information.
        # Auto page break is switched off first: a cell placed inside the
        # bottom margin would otherwise trigger a page break and the footer
        # would end up alone at the top of a new page.
        pdf.set_auto_page_break(auto=False)
        pdf.set_y(-15)
        pdf.set_font("helvetica", "I", 8)
        pdf.cell(0, 10, pdf_safe(f"Generated from: {os.path.basename(source_filename)}"), align="C")

        # Generate PDF file
        pdf.output(output_filename)
        print(f"✅ PDF created: {output_filename}")
        return output_filename
    except Exception as e:
        print(f"⚠️ PDF creation error: {e}")
        # Fallback: Create text file if PDF generation fails at any stage
        fallback = f"{source_base}_notes.txt"
        with open(fallback, "w", encoding="utf-8") as f:
            f.write(f"Notes for {title}\n")
            f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n\n")
            f.write(notes_text)
        print(f"Saved fallback text file: {fallback}")
        return fallback

In [41]:
def run_async_in_thread(async_func, *args):
    """
    Run an asynchronous function to completion from synchronous code.

    - If no event loop is running in the calling thread, the coroutine is run
      right here with asyncio.run, which creates a fresh loop and cleans it up
      afterwards.
    - If the calling thread already has a running event loop (for example
      when called directly from a notebook cell), a loop cannot be started in
      this thread. The coroutine is then run with asyncio.run in a short-lived
      worker thread and this call blocks until it finishes.

    Args:
        async_func: Asynchronous function to execute
        *args: Arguments to pass to the function

    Returns:
        The result of the async function. Exceptions raised by it propagate
        to the caller.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: safe to run one here
        return asyncio.run(async_func(*args))

    # A loop is already running in this thread, so use a separate thread
    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, async_func(*args)).result()

In [42]:
def process_textbook_sync(file, progress=gr.Progress()):
    """
    Main textbook processing pipeline - synchronous wrapper for Gradio compatibility.

    This function coordinates the entire process:
    1. Text extraction from PDF
    2. Chunking for processing
    3. AI-powered note generation (including quality evaluation)
    4. PDF creation

    Chunks for which note generation failed are skipped instead of having the
    error text written into the notes. If every chunk fails, no file is
    produced and an error status is returned.

    Args:
        file: Uploaded PDF from Gradio - either an object exposing the path
            as `.name` or the path itself as a string
        progress: Gradio progress tracker for UI updates

    Returns:
        tuple: Status message and path to the generated file (None on failure)
    """
    if file is None:
        return "Please upload a textbook to begin.", None

    # generate_notes reports a failed model call by returning a string with
    # this prefix rather than raising.
    generation_error_prefix = "Error generating notes:"

    try:
        from PyPDF2 import PdfReader

        # Accept both an uploaded-file object and a plain path string
        pdf_file_path = getattr(file, "name", file)
        reader = PdfReader(pdf_file_path)
        num_pages = len(reader.pages)

        # Step 1 — Text Extraction
        progress(0, desc="Step 1/4: Extracting text...")
        page_texts = []
        for i, page in enumerate(reader.pages):
            progress((i + 1) / num_pages, desc=f"Extracting Page {i + 1}/{num_pages}")
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
        # Join once instead of growing a string page by page
        full_text = "".join(f"{page_text}\n" for page_text in page_texts)

        if not full_text.strip():
            return "⚠️ No text extracted from the PDF.", None

        print(f"✅ Extracted {len(full_text)} characters from {num_pages} pages.")

        # Step 2 — Text Chunking and Note Generation
        chunks = chunk_text(full_text)
        num_chunks = len(chunks)
        all_notes = []
        failed_chunks = 0

        progress(0, desc="Step 2/4: Generating notes...")
        for i, chunk in enumerate(chunks):
            progress((i + 1) / num_chunks, desc=f"Generating Chunk {i + 1}/{num_chunks}")
            # Run async note generation from this synchronous function
            notes_chunk = run_async_in_thread(generate_notes, chunk)
            if notes_chunk.startswith(generation_error_prefix):
                # Do not let error messages end up in the notes document
                failed_chunks += 1
                print(f"⚠️ Skipping chunk {i + 1}/{num_chunks}: {notes_chunk}")
                continue
            # Clean any remaining markdown from generated notes
            clean_notes = clean_markdown_symbols(notes_chunk)
            all_notes.append(clean_notes)

        if not all_notes:
            return (
                "❌ Note generation failed for every chunk. "
                "Please check that the Ollama server is running and the llama3:8b model is available.",
                None,
            )

        # Combine all notes chunks
        combined_notes = "\n\n".join(all_notes)

        # Final cleanup pass for any remaining markdown
        combined_notes = clean_markdown_symbols(combined_notes)

        # Step 3 — PDF Creation
        progress(1, desc="Step 3/4: Creating PDF...")
        pdf_output_path = create_pdf_file(combined_notes, pdf_file_path)

        # Step 4 — Return Results
        if os.path.exists(pdf_output_path):
            message = f"✅ Notes generated successfully!\n\n**Saved as:** {os.path.basename(pdf_output_path)}"
            if failed_chunks:
                message += (
                    f"\n\n⚠️ {failed_chunks} of {num_chunks} chunks could not be summarized and were skipped."
                )
            return message, pdf_output_path
        else:
            return "❌ PDF generation failed.", None

    except ImportError:
        return "❌ PyPDF2 not installed. Please install it using: pip install PyPDF2", None
    except Exception as e:
        return f"❌ Error processing PDF: {str(e)}", None

In [43]:
def create_notes_interface(file, progress=gr.Progress(track_tqdm=True)):
    """
    Entry point wired into the Gradio interface.

    Kept synchronous; it only checks that a file was supplied and then hands
    the work to process_textbook_sync.

    Args:
        file: Uploaded file from Gradio (None when nothing was uploaded)
        progress: Progress tracker for UI updates

    Returns:
        tuple: Status message and output file (None when no file was given)
    """
    # Guard clause: nothing to process without an upload
    if file is None:
        return "Please upload a textbook.", None

    return process_textbook_sync(file, progress)

In [44]:
def install_dependencies():
    """
    Install required Python packages if not already available.

    This checks for:
    - PyPDF2 for PDF text extraction
    - fpdf2 for PDF generation (its version is printed when present)

    Missing packages are installed with the pip of the interpreter that runs
    this notebook (`sys.executable -m pip`), so they land in the kernel's own
    environment rather than in whichever `pip` happens to be first on PATH.
    A failed installation is reported but does not raise.

    NOTE(review): the notebook's import cell already imports from fpdf before
    this function runs, so installing fpdf2 here cannot rescue that import.
    """
    import subprocess
    import sys

    def pip_install(package_name: str) -> None:
        # Install one package into the current interpreter's environment
        print(f"Installing {package_name}...")
        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", package_name],
            check=False,
        )
        if result.returncode != 0:
            print(f"⚠️ pip could not install {package_name} (exit code {result.returncode}).")

    try:
        import PyPDF2  # noqa: F401 - imported only to test availability
    except ImportError:
        pip_install("PyPDF2")

    # Check fpdf2 version for compatibility
    try:
        import fpdf
        print(f"fpdf2 version: {getattr(fpdf, '__version__', 'unknown')}")
    except ImportError:
        pip_install("fpdf2")

# Install dependencies at startup
install_dependencies()

fpdf2 version: 2.8.5


In [45]:
# Create the Gradio web interface.
# One file input is mapped to two outputs, matching the (status message, file path)
# tuple returned by create_notes_interface.
iface = gr.Interface(
    fn=create_notes_interface,  # Called with the uploaded file
    inputs=gr.File(label="📘 Upload Textbook (PDF)"),  # File upload input
    outputs=[
        gr.Markdown(label="🧾 Status / Summary"),  # Status message (may contain markdown such as **bold**)
        gr.File(label="📥 Download Generated Notes (.pdf)")  # Downloadable notes file
    ],
    title="AI Textbook → Notebook Notes Generator",
    description=(
        "Upload any textbook (PDF). The local LLaMA 3 (8B) model summarizes it into clean academic notes "
        "without markdown symbols. GPT-4o-mini evaluates quality, and the output is auto-saved as a clean PDF."
    ),
    # NOTE(review): newer Gradio releases appear to rename this option to
    # `flagging_mode` - confirm against the installed Gradio version.
    allow_flagging="never"  # Disable Gradio's flagging feature
)

# In a notebook kernel __name__ is "__main__", so this guard passes when the cell is run.
if __name__ == "__main__":
    # Startup message and launch the web interface
    print("🚀 Starting Textbook to Notes Generator...")
    print("📖 Upload a PDF textbook to generate clean summarized notes")
    
    # Launch Gradio interface
    iface.launch(
        server_name="127.0.0.1",  # Localhost only for security
        share=False,              # Don't create public link
    )



🚀 Starting Textbook to Notes Generator...
📖 Upload a PDF textbook to generate clean summarized notes
* Running on local URL:  http://127.0.0.1:7890
* To create a public link, set `share=True` in `launch()`.


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "c:\Upendra\Git Hub\Git Hub -- K-Upendra-7\abcd-agentic-training-vnr-upendra\AI-Agent-Textbook-Notebook\.venv\Lib\site-packages\uvicorn\protocols\http\h11_impl.py", line 403, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        self.scope, self.receive, self.send
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Upendra\Git Hub\Git Hub -- K-Upendra-7\abcd-agentic-training-vnr-upendra\AI-Agent-Textbook-Notebook\.venv\Lib\site-packages\uvicorn\middleware\proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Upendra\Git Hub\Git Hub -- K-Upendra-7\abcd-agentic-training-vnr-upendra\AI-Agent-Textbook-Notebook\.venv\Lib\site-packages\fastapi\applications.py", line 1133, in __call__
    await super().__call__(scope, rec

✅ Extracted 19920 characters from 11 pages.
⚠️ GPT-4o-mini evaluation error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-MkjmLPCmCH96jd8YXEY5SEUP on tokens per min (TPM): Limit 100000, Used 99098, Requested 1076. Please try again in 1h15m10.08s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
✅ Notes passed evaluation.
⚠️ GPT-4o-mini evaluation error: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-MkjmLPCmCH96jd8YXEY5SEUP on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/acc