In [3]:
!pip install transformers PyPDF2



In [5]:
# ------------------------------------------------------------------
# LEGAL CLAUSE FINDER (Unit 1 Project) - PDF/TXT + GUI
# ------------------------------------------------------------------
# Prerequisites:
# Install the required libraries before running this code.
# In a notebook or Colab cell, run:
#   !pip install transformers PyPDF2 ipywidgets
# In a terminal, run the same command without the leading "!".
# ------------------------------------------------------------------

from transformers import pipeline
from PyPDF2 import PdfReader
from pathlib import Path
from io import BytesIO
import ipywidgets as widgets
from IPython.display import display

# 1. Load the question-answering pipeline.
# The same SQuAD2-style checkpoint is used for both the model weights and
# the tokenizer, so its identifier is named once and reused.
QA_MODEL_NAME = "deepset/roberta-large-squad2"

print("Loading model... please wait.")
qa_pipeline = pipeline(
    "question-answering",
    model=QA_MODEL_NAME,
    tokenizer=QA_MODEL_NAME,
)

# 2. File Reading
def read_pdf_bytes(data: bytes) -> str:
    """Return the text of every page of an in-memory PDF as one string.

    Each page's extracted text (or an empty string when extraction yields
    nothing) is joined with newlines, and the combined result is stripped
    of leading and trailing whitespace.
    """
    pdf = PdfReader(BytesIO(data))
    # ``extract_text()`` may return a falsy value; normalise that to "".
    extracted = [(page.extract_text() or "") for page in pdf.pages]
    combined = "\n".join(extracted)
    return combined.strip()

def read_txt_bytes(data: bytes) -> str:
    """Decode the raw bytes of a text file to a stripped string.

    Args:
        data: File content as ``bytes`` or any other bytes-like object
            (for example ``bytearray`` or ``memoryview``).

    Returns:
        The decoded text with leading/trailing whitespace removed. Byte
        sequences that are not valid UTF-8 are silently dropped.
    """
    # ``bytes(...)`` normalises bytes-like inputs such as ``memoryview``,
    # which has no ``.decode`` method of its own.
    raw = bytes(data)
    # "utf-8-sig" behaves like UTF-8 but also discards a leading byte-order
    # mark, which ``str.strip()`` would otherwise leave in place as "\ufeff".
    return raw.decode("utf-8-sig", errors="ignore").strip()

def read_document_bytes(filename: str, data: bytes) -> str:
    """Extract text from an uploaded document, choosing the reader by extension.

    The extension of ``filename`` is compared case-insensitively: ``.pdf``
    goes to ``read_pdf_bytes`` and ``.txt`` to ``read_txt_bytes``. Any other
    extension (or none) yields an empty string.
    """
    extension = Path(filename).suffix.lower()
    if extension == ".txt":
        extracted = read_txt_bytes(data)
    elif extension == ".pdf":
        extracted = read_pdf_bytes(data)
    else:
        # Unsupported file type: signal "nothing extracted" to the caller.
        extracted = ""
    return extracted

# 3. QA Logic
def find_legal_clause(contract_text: str, question: str) -> str:
    """Answer a question about a contract and describe the model's confidence.

    Args:
        contract_text: The document text to search for an answer.
        question: The natural-language question to ask about the text.

    Returns:
        A user-facing message: a prompt to supply both inputs when either is
        empty or whitespace-only, a "no answer" notice when the model returns
        an empty answer, or otherwise the answer followed by its score as a
        percentage and a low/moderate/high confidence label.
    """
    # Whitespace-only input carries no content, so treat it like empty input
    # instead of sending it to the model.
    if not (contract_text and contract_text.strip()) or not (
        question and question.strip()
    ):
        return "Please provide both contract text and a question."

    # The question-answering pipeline only returns an empty ("impossible")
    # answer when asked to; without this flag the "no answer" branch below
    # would not be reached for unanswerable questions.
    result = qa_pipeline(
        question=question,
        context=contract_text,
        handle_impossible_answer=True,
    )
    answer = result.get("answer", "")
    # Convert the raw score to a percentage with two decimals.
    score = round(result.get("score", 0.0) * 100, 2)

    if not answer.strip():
        return "No answer found in the document."

    # Simple confidence message
    if score < 10:
        confidence_msg = "⚠️ Low Confidence: The model is unsure. Please verify manually."
    elif score > 80:
        confidence_msg = "✅ High Confidence"
    else:
        confidence_msg = "ℹ️ Moderate Confidence"

    return f"Answer: {answer}\n\nConfidence: {score}%\n{confidence_msg}"

# 4. GUI Flow
# State shared with the callbacks: the text of the currently loaded
# document and the name of the file it came from.
contract_text = ""
loaded_filename = ""

# Widgets: result area, trigger button, question input and file picker
# (a single PDF or TXT file).
output = widgets.Output()
ask_button = widgets.Button(button_style="primary", description="Ask")
question_box = widgets.Textarea(
    description="Question:",
    placeholder="Type your question here...",
    layout=widgets.Layout(height="80px", width="100%"),
)
upload = widgets.FileUpload(multiple=False, accept=".pdf,.txt")

def handle_upload(change):
    """Load the uploaded file's text into the module-level document state.

    Registered as an observer on the upload widget's ``value``. Clears the
    output area, reads the first uploaded file, stores its name in
    ``loaded_filename`` and its extracted text in ``contract_text``, and
    reports success or failure in the output area.

    Args:
        change: The change notification from the widget (unused; the
            current value is read from ``upload`` directly).
    """
    global contract_text, loaded_filename
    output.clear_output()
    uploaded = upload.value
    if not uploaded:
        return

    if isinstance(uploaded, dict):
        # ipywidgets 7: mapping of entries whose file name lives under
        # the "metadata" key.
        file_info = next(iter(uploaded.values()))
        loaded_filename = file_info["metadata"]["name"]
    else:
        # ipywidgets 8: sequence of per-file dicts with a top-level "name".
        file_info = uploaded[0]
        loaded_filename = file_info["name"]

    # The content may arrive as a memoryview; the readers expect bytes.
    data = bytes(file_info["content"])

    try:
        contract_text = read_document_bytes(loaded_filename, data)
    except Exception as exc:
        # UI callback boundary: a malformed file must not fail silently, and
        # the previously loaded document must not remain active under the
        # new file name.
        contract_text = ""
        with output:
            print(f"Could not read {loaded_filename}: {exc}")
        return

    with output:
        if contract_text:
            print(f"Loaded: {loaded_filename}")
        else:
            print("Unsupported file or no text extracted.")

def handle_ask(_):
    """Answer the typed question against the loaded document.

    Click handler for the "Ask" button. Everything is printed inside the
    output widget: a reminder when no document text is loaded, a reminder
    when the question box is blank, or otherwise the formatted answer
    followed by an empty line.
    """
    with output:
        if contract_text:
            user_question = question_box.value.strip()
            if user_question:
                print(find_legal_clause(contract_text, user_question))
                # Blank line separates this answer from the next one.
                print()
            else:
                print("Please enter a question.")
        else:
            print("Please upload a PDF or TXT file first.")

# Connect the callbacks: clicking the button asks the question, and a change
# to the upload widget's value (re)loads the document.
ask_button.on_click(handle_ask)
upload.observe(handle_upload, names="value")

# Render the widgets in order: upload, question, button, results.
display(upload, question_box, ask_button, output)

Loading model... please wait.


Device set to use cpu


FileUpload(value={}, accept='.pdf,.txt', description='Upload')

Textarea(value='', description='Question:', layout=Layout(height='80px', width='100%'), placeholder='Type your…

Button(button_style='primary', description='Ask', style=ButtonStyle())

Output()