In [None]:
import ipywidgets as widgets
from IPython.display import display
import tempfile

# Upload control: a single-file picker limited to PDF, EPUB, MOBI, DOCX and
# TXT documents.
upload_widget = widgets.FileUpload(
    multiple=False,  # accept one file per upload
    accept=",".join([".pdf", ".epub", ".mobi", ".docx", ".txt"]),  # allowed extensions
)

# Registry of uploads saved so far: original filename -> temporary file path.
# Cells that process uploads read this mapping.
uploaded_files = {}


def handle_upload(change):
    """Save every file in the widget's new value to a temporary file on disk.

    Intended to be registered as an observer of the upload widget's ``value``
    trait.

    Args:
        change: Trait-change notification. ``change["new"]`` holds the new
            widget value, in either of the two layouts ipywidgets has used:

            * ipywidgets 7.x: ``{filename: {"content": bytes, ...}}``
            * ipywidgets 8.x: a tuple of dicts, each with ``"name"`` and
              ``"content"`` keys.

    Side effects:
        Writes each upload to a new ``uploaded_*`` temporary file that keeps
        the original extension (the file is NOT deleted automatically),
        records ``filename -> path`` in ``uploaded_files`` and prints the
        saved location.
    """
    new_value = change["new"]

    # Normalise both value layouts to (filename, content) pairs.
    if isinstance(new_value, dict):
        uploads = [(name, info["content"]) for name, info in new_value.items()]
    else:
        uploads = [(info["name"], info["content"]) for info in new_value]

    for filename, content in uploads:
        # Keep the original extension so downstream code can detect the type.
        ext = '.' + filename.split('.')[-1] if '.' in filename else ''
        with tempfile.NamedTemporaryFile(delete=False, prefix="uploaded_", suffix=ext) as temp_file:
            temp_file.write(content)
        uploaded_files[filename] = temp_file.name
        print(f"File '{filename}' saved to: {temp_file.name}")

# Call handle_upload whenever the widget's `value` trait changes (i.e. when a
# file is uploaded), then render the widget in the cell output.
upload_widget.observe(handler=handle_upload, names="value")
display(upload_widget)




FileUpload(value=(), accept='.pdf,.epub,.mobi,.docx,.txt', description='Upload')

No file uploaded yet.


In [17]:
import fitz  # PyMuPDF

def extract_pdf_content(file_path, heading_min_size=14):
    """Extract the text of a PDF and guess which spans are headings.

    Every text span is appended to the output followed by a space, and every
    text line is terminated with a newline. A span is treated as a heading
    when its font size exceeds ``heading_min_size`` or its font name contains
    "bold" (case-insensitive). This is a naive heuristic, so bold body text
    is reported as well. Whitespace-only spans are never reported as headings.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.
        heading_min_size: Font size above which a span counts as a heading.

    Returns:
        tuple: ``(content, structural_elements)`` where ``content`` is the
        extracted text and ``structural_elements`` is a list of
        ``(text, page_number, font_size, font_name)`` tuples with 1-based
        page numbers.
    """
    text_parts = []  # joined once at the end instead of repeated str +=
    structural_elements = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(file_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            for block in page.get_text("dict")["blocks"]:
                if block.get("type") != 0:  # only type 0 blocks carry text lines
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"]
                        fontsize = span["size"]
                        fontname = span["font"]
                        text_parts.append(text + " ")

                        # A blank span carries no heading text, whatever its font.
                        if not text.strip():
                            continue
                        if fontsize > heading_min_size or "bold" in fontname.lower():
                            structural_elements.append((text, page_number, fontsize, fontname))
                    text_parts.append("\n")

    return "".join(text_parts), structural_elements

# Example usage:
# Go through the uploaded_files mapping (filename -> file path) and, for every
# entry whose name ends in ".pdf", print the extracted text followed by the
# inferred headings.
if not uploaded_files:
    print("No uploaded files to process.")
else:
    for filename, file_path in uploaded_files.items():
        # Non-PDF uploads are skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        text, headings = extract_pdf_content(file_path)

        print("Extracted text:")
        print(text)

        print("\nInferred Headings/Chapters:")
        for h in headings:
            heading_text, page_number, font_size, font_name = h
            print("Heading:", heading_text, "| Page:", page_number, "| Font Size:", font_size, "| Font:", font_name)


No uploaded files to process.


In [1]:
import os
import shutil # For a more concrete "temporary copy" example
import tempfile # For true temporary file creation if needed

def import_document():
    """
    Prompt for a document path, validate it, and stage a temporary copy.

    The path is read from standard input. It is rejected, with an explanatory
    message printed, when it is empty, has an unsupported extension, does not
    exist, or is not a file. A valid document is copied with ``shutil.copy2``
    into a freshly created temporary directory so that processing never
    touches the original.

    Returns:
        str | None: Path of the temporary copy, or ``None`` when validation or
        copying fails. On success the temporary directory is NOT removed by
        this function; the caller should delete the returned path's parent
        directory (e.g. ``shutil.rmtree(os.path.dirname(path))``) once
        processing is finished.
    """
    supported_extensions = ['.pdf', '.txt', '.docx', '.epub', '.mobi']

    print("📤 Document Import")
    print("Please enter the full path to your document.")
    print(f"Supported file types: {', '.join(supported_extensions)}")

    file_path = input("Enter file path: ").strip()

    # Basic validation: the path must not be empty.
    if not file_path:
        print("⚠️ No file path entered. Please try again.")
        return None

    # Validate the extension (case-insensitively).
    file_extension = os.path.splitext(file_path)[1]
    if file_extension.lower() not in supported_extensions:
        print(f"❌ Error: Unsupported file type '{file_extension}'.")
        print(f"Please choose a file with one of the following extensions: {', '.join(supported_extensions)}")
        return None

    # The path must exist and must point at a file.
    if not os.path.exists(file_path):
        print(f"❌ Error: File not found at '{file_path}'. Please check the path and try again.")
        return None

    if not os.path.isfile(file_path):
        print(f"❌ Error: The path '{file_path}' is a directory, not a file. Please provide a path to a file.")
        return None

    print(f"✅ File selected: {file_path}")
    print(f"📄 The document '{os.path.basename(file_path)}' is ready for immediate processing from its original location.")

    # Stage a copy in its own temporary directory so the original is untouched.
    temp_dir = None
    try:
        temp_dir = tempfile.mkdtemp()
        temp_file_path = os.path.join(temp_dir, os.path.basename(file_path))
        shutil.copy2(file_path, temp_file_path)
    except Exception as e:
        # Do not leave an orphaned temporary directory behind when the copy fails.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"❌ Error creating a temporary copy: {e}")
        return None

    print(f"COPY: A temporary copy has been made at: {temp_file_path}")
    print("This temporary copy will be used for processing and then the directory will be cleaned up.")
    return temp_file_path

if __name__ == "__main__":
    # Run the interactive import; the result is the path of the temporary
    # copy, or None when nothing was imported.
    uploaded_file_location = import_document()

    if not uploaded_file_location:
        print("\nNo document was imported.")
    else:
        print(f"\n📝 Processing would happen on: {uploaded_file_location}")
        # Real processing of the document would go here, e.g.:
        #     content = read_document_content(uploaded_file_location)
        #
        # import_document() does not delete the temporary directory that holds
        # the copy (its cleanup is commented out there), so once all
        # processing is complete that directory is the target for
        # shutil.rmtree().

📤 Document Import
Please enter the full path to your document.
Supported file types: .pdf, .txt, .docx, .epub, .mobi
✅ File selected: Atomic habits ( PDFDrive ).pdf
📄 The document 'Atomic habits ( PDFDrive ).pdf' is ready for immediate processing from its original location.
COPY: A temporary copy has been made at: C:\Users\MEDIAD~1\AppData\Local\Temp\tmpv7rbq7fe\Atomic habits ( PDFDrive ).pdf
This temporary copy will be used for processing and then the directory will be cleaned up.

📝 Processing would happen on: C:\Users\MEDIAD~1\AppData\Local\Temp\tmpv7rbq7fe\Atomic habits ( PDFDrive ).pdf
