In [1]:
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

import ipywidgets as widgets
from IPython.display import display
import tempfile
import os
import re


In [2]:
# Selector that decides whether the user uploads a file or pastes text
input_type = widgets.RadioButtons(
    description="Input Type:",
    options=["Upload PDF/DOCX File", "Paste Text"],
    value="Upload PDF/DOCX File",
)

# Single-file uploader whose picker is filtered to .pdf and .docx
file_upload = widgets.FileUpload(
    description="Upload File",
    accept=".pdf,.docx",
    multiple=False,
)

# Free-form text box for pasting raw legal text
text_input = widgets.Textarea(
    description="Text Input:",
    placeholder="Paste legal text here...",
    value="",
    layout=widgets.Layout(height="200px", width="100%"),
)

# Toggle which input widget is visible for the selected input type
def on_input_type_change(change):
    """Show the file uploader or the text area, hiding the other one.

    Args:
        change: Mapping whose ``'new'`` entry holds the selected option label
            (the shape of a traitlets change notification; a plain dict with
            a ``'new'`` key works as well).
    """
    wants_upload = change['new'] == "Upload PDF/DOCX File"
    file_upload.layout.display = 'block' if wants_upload else 'none'
    text_input.layout.display = 'none' if wants_upload else 'block'

# Re-evaluate widget visibility whenever the selected input type changes
input_type.observe(on_input_type_change, names='value')

# Render the selector followed by both input widgets
display(input_type, file_upload, text_input)

# Apply the visibility that matches the default selection
on_input_type_change({'new': input_type.value})


RadioButtons(description='Input Type:', options=('Upload PDF/DOCX File', 'Paste Text'), value='Upload PDF/DOCX…

FileUpload(value=(), accept='.pdf,.docx', description='Upload File')

Textarea(value='', description='Text Input:', layout=Layout(height='200px', width='100%'), placeholder='Paste …

In [7]:
# Shared splitter: chunks of at most 1000 units with a 100-unit overlap,
# where length is measured with len() (i.e. in characters).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=100, length_function=len
)

def clean_text(text: str) -> str:
    """Collapse each run of whitespace into one space and trim both ends.

    Args:
        text: Raw text, possibly containing newlines, tabs or repeated spaces.

    Returns:
        The text with all whitespace runs replaced by a single space and no
        leading or trailing whitespace.
    """
    return re.sub(r"\s+", " ", text).strip()

def chunk_documents(docs: list) -> list:
    """Split documents into cleaned chunks while keeping their metadata.

    Each input document's ``page_content`` is split with the module-level
    ``text_splitter`` and every piece is normalised with ``clean_text``.
    Any metadata on the input document is copied onto each chunk derived
    from it, so provenance set by a loader is not lost. Pieces that are
    empty after cleaning are skipped.

    Args:
        docs: Objects exposing ``page_content`` and, optionally, a
            ``metadata`` mapping.

    Returns:
        A list of ``Document`` chunks in input order.
    """
    chunked = []
    for doc in docs:
        # Tolerate a missing or None metadata attribute.
        source_metadata = getattr(doc, "metadata", None) or {}
        for piece in text_splitter.split_text(doc.page_content):
            cleaned = clean_text(piece)
            if not cleaned:
                continue
            # Give every chunk its own dict so chunks never share state.
            chunked.append(
                Document(page_content=cleaned, metadata=dict(source_metadata))
            )
    return chunked


In [8]:
def load_pdf(path):
    """Load the PDF at ``path`` with ``PyPDFLoader`` and return its documents."""
    return PyPDFLoader(path).load()

def load_docx(path):
    """Load the Word file at ``path`` with ``UnstructuredWordDocumentLoader``."""
    return UnstructuredWordDocumentLoader(path).load()


In [9]:
# Output pane that collects everything printed by the click handler
output_area = widgets.Output()
# Button that triggers processing of the current input
process_button = widgets.Button(description="Process Input")

display(process_button, output_area)

def _first_uploaded_file(upload_value):
    """Return ``(name, content_bytes)`` for the first uploaded file.

    Supports both layouts of ``FileUpload.value``: a dict keyed by filename
    whose entries carry ``metadata``/``content`` (ipywidgets 7), and a
    sequence of entries with top-level ``name``/``content`` (ipywidgets 8).
    """
    if isinstance(upload_value, dict):
        entry = next(iter(upload_value.values()))
        name = entry['metadata']['name']
    else:
        entry = upload_value[0]
        name = entry['name']
    # The payload may be a memoryview rather than bytes; normalise it.
    return name, bytes(entry['content'])


def _report_chunks(chunks, success_message):
    """Print the success message and a preview, or a notice if no chunks exist."""
    if not chunks:
        # Guard against indexing an empty list (e.g. a file with no text).
        print("❗ No text could be extracted from the input!")
        return
    print(success_message)
    print(chunks[0].page_content[:500])


def on_process_clicked(b):
    """Button callback: chunk the pasted text or the uploaded file.

    Reads the current selection from ``input_type`` and prints the outcome
    into ``output_area``. Uploaded files are written to a temporary file so
    the path-based loaders can read them; that file is always removed, even
    when loading fails.

    Args:
        b: The button instance passed by ``on_click`` (unused).
    """
    with output_area:
        output_area.clear_output()

        # Handle pasted text input
        if input_type.value == "Paste Text":
            user_text = text_input.value.strip()
            if not user_text:
                print("❗ Please enter some text!")
                return
            chunks = chunk_documents([Document(page_content=user_text)])
            _report_chunks(chunks, f"✅ Processed text into {len(chunks)} chunks.")
            return

        # Handle file upload input
        if not file_upload.value:
            print("❗ Please upload a PDF or DOCX file!")
            return

        file_name, file_bytes = _first_uploaded_file(file_upload.value)
        suffix = os.path.splitext(file_name)[1].lower()

        # Pick the loader before touching the disk so unsupported files
        # never create a temporary file at all.
        if suffix == ".pdf":
            loader = load_pdf
        elif suffix == ".docx":
            loader = load_docx
        else:
            print("❌ Unsupported file type!")
            return

        # delete=False: the file must outlive this `with` so the loader can
        # reopen it by path; it is removed explicitly below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file.write(file_bytes)
            tmp_path = tmp_file.name

        try:
            docs = loader(tmp_path)
        finally:
            # Clean up even if the loader raises.
            os.remove(tmp_path)

        chunks = chunk_documents(docs)
        _report_chunks(chunks, f"✅ Loaded and chunked {len(chunks)} document chunks.")

process_button.on_click(on_process_clicked)

Button(description='Process Input', style=ButtonStyle())

Output()