<a href="https://colab.research.google.com/github/Hassanahmed-15/Shakespeare-Plays-Rag-Pipeline-/blob/main/text_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files

# Ask Colab's `files` helper for an upload and keep whatever it returns in
# `uploaded` (the recorded cell output shows the PDF being saved).
uploaded = files.upload()


Saving newvariorumediti10shak.pdf to newvariorumediti10shak.pdf


In [31]:
import fitz  # PyMuPDF
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import openai

# --- SET YOUR OPENAI API KEY ---
# Read the key from the OPENAI_API_KEY environment variable so it never has to
# be pasted into the notebook. When the variable is unset this falls back to
# the original empty string.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# --- INPUT PDF ---
pdf_path = "/content/newvariorumediti10shak.pdf"  # Your PDF file
start_page = 0     # Page numbering starts at 0
end_page = 77      # Page 78 is index 77, inclusive

# --- Function to extract text from PDF ---
def extract_pdf_text(pdf_path, start_page, end_page):
    """Extract plain text from an inclusive, 0-based page range of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).
        start_page: 0-based index of the first page to read.
        end_page: 0-based index of the last page to read (inclusive).

    Returns:
        The concatenated text of every page in the range that has
        extractable text, each page followed by a newline. Pages whose text
        is empty or whitespace-only are skipped with a printed warning.

    Errors raised by PyMuPDF while opening the file or loading a page are
    not caught here and propagate to the caller.
    """
    page_texts = []
    # The context manager closes the document even if a page fails to load;
    # the original left the file handle open.
    with fitz.open(pdf_path) as doc:
        for i in range(start_page, end_page + 1):
            page_text = doc.load_page(i).get_text("text")
            if page_text and page_text.strip():
                page_texts.append(page_text + "\n")
            else:
                print(f"Warning: Page {i+1} has no extractable text. Skipping...")
    # Join once instead of growing a string with += on every page.
    return "".join(page_texts)

# --- Function to call GPT-5 mini to structure notes ---
def gpt5_structure_notes_exact(text_chunk, act_number=1):
    """Send raw OCR text to the "gpt-5-mini" chat model and return its reply.

    The whole of ``text_chunk`` is appended to a fixed instruction prompt and
    sent as a single user message.

    Args:
        text_chunk: Raw text to be structured; interpolated at the end of the
            prompt.
        act_number: Currently unused. The prompt text hard-codes the act and
            scene. Kept so existing callers keep working.

    Returns:
        The text content of the first choice in the model response.

    Raises:
        RuntimeError: If the response carries no text content.
    """
    # NOTE(review): the prompt says "Process only Act 1, Scene 2", while its
    # own example uses Scene 1 and the calling script announces Scenes 1-3.
    # The prompt is left byte-for-byte unchanged here; confirm the intended
    # scope before editing it.
    prompt = f"""
Role: You are an expert editor of pre-1929 Shakespeare Variorum editions. Process only Act 1, Scene 2 with forensic precision.

1. Structural Requirements
text
**ACT 1, SCENE 1**
**=== PLAY TEXT ===**
[content]
**=== SCHOLARLY COMMENTARY ===**
[content]
=== PAGE [number] ===
Preserve:

Original Italian labels (Scena Prima).

All line breaks, indentation, and spacing.

2. OCR Error Correction
Fix these errors (case-sensitive):

OCR Error	Corrected Form	Example
againe → again	"meet again"
raine → rain	"or in rain"
battaile → battle	"battle's lost"
sunne → sun	"set of sun"
foule → foul	"foul is fair"
filthie → filthy	"filthy air"
vpon → upon	"upon the heath"
haue → have	"we have"
Rules:

Never correct period spellings (e.g., "hurley-burley").

Flag uncertain fixes with [OCR?].

3. Line-Number Verification Protocol
For every commentary note:

Find the exact line in Play Text (e.g., 3: → line 3.).

If no match exists:

Add [NO MATCH: Note [X] references "[excerpt]" – No corresponding line in Play Text].

If a partial match exists, note it:

text
[PARTIAL MATCH: Note 12 references "Gray-Malkin" – Closest is line 9]
Cross-check 100% of notes (not random samples).

4. Formatting Rules
Play Text:

text
[line number]. [CHARACTER]: [Dialogue]
*(stage directions)*
Commentary:

text
[line ref]: [SCHOLAR (p.XX)]: [Exact note text]
5. Prohibited Actions
DO NOT:

Modernize syntax/spelling beyond specified OCR fixes.

Summarize, merge, or delete commentary.

Alter original line numbering.

6. Example Output
text
**ACT 1, SCENE 1**
**=== PLAY TEXT ===**
*(Thunder and Lightning. Enter three Witches.)*
1. 1 Witch: When shall we three meet again?
...
9. 1 Witch: I come, Gray-Malkin.

**=== SCHOLARLY COMMENTARY ===**
3: SPALDING (p.102): This first scene is the fag-end...
4: [NO MATCH: Note 4 references "Enter Witches" – No numbered line in Play Text]
12: [PARTIAL MATCH: Note 12 references "Gray-Malkin" – Closest is line 9]
=== PAGE 102 ===
7. Special Instructions
Flag all mismatches – Never guess or force alignments.

Preserve all original punctuation, including italics and brackets.

Input raw OCR text below. Process EXACTLY as specified.


{text_chunk}
"""
    response = openai.chat.completions.create(
        model="gpt-5-mini",
        messages=[{"role": "user", "content": prompt}]
    )
    structured_text = response.choices[0].message.content
    # The content field may be absent (None). Fail here with a clear message
    # instead of letting a later .split() on None raise an opaque
    # AttributeError.
    if structured_text is None:
        raise RuntimeError("Model response contained no text content.")
    return structured_text

# --- Function to write structured text to DOCX ---
def write_to_docx(structured_text, output_path="Act1_Scenes1_to_3.docx"):
    """Write the structured text to a .docx file, one paragraph per line.

    Each line is stripped of surrounding whitespace. Blank lines become empty
    paragraphs. Lines starting with "**ACT" or "**=== " are written as bold,
    centred paragraphs with every "**" marker removed. All other lines are
    written as plain paragraphs.

    Args:
        structured_text: Newline-separated text to write.
        output_path: Destination path of the saved document.
    """
    heading_prefixes = ("**ACT", "**=== ")
    document = Document()

    for raw_line in structured_text.split("\n"):
        content = raw_line.strip()

        if content and content.startswith(heading_prefixes):
            # Heading: drop the markdown-style bold markers and centre it.
            heading = document.add_paragraph()
            heading.add_run(content.replace("**", "")).bold = True
            heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        elif content:
            document.add_paragraph(content)
        else:
            # Keep blank lines as empty paragraphs to preserve spacing.
            document.add_paragraph()

    document.save(output_path)
    print(f"DONE: Structured docx saved at {output_path}")

# --- MAIN PROCESS ---
# Derive the displayed 1-based page range from the configured 0-based indices
# so the message cannot drift from start_page/end_page.
print(f"Extracting PDF text (pages {start_page + 1}-{end_page + 1})...")
pdf_text = extract_pdf_text(pdf_path, start_page, end_page)

# Stop before calling the model if nothing but whitespace was extracted.
if not pdf_text.strip():
    raise ValueError("No extractable text found in PDF. Check the file or page range.")

print("Structuring notes for Scenes 1-3 with GPT-5 mini...")
structured_text = gpt5_structure_notes_exact(pdf_text, act_number=1)

print("Writing to DOCX file...")
write_to_docx(structured_text, "Act1_Scenes1_to_3.docx")


Extracting PDF text (pages 1-78)...
Structuring notes for Scenes 1-3 with GPT-5 mini...
Writing to DOCX file...
DONE: Structured docx saved at Act1_Scenes1_to_3.docx


In [9]:

!pip install PyMuPDF



In [13]:
from pypdf import PdfReader

PDF_PATH = "/content/newvariorumediti10shak.pdf"
reader = PdfReader(PDF_PATH)

# Report the document size, followed by a blank line.
total_pages = len(reader.pages)
print(f"Total pages: {total_pages}\n")

# Sample only the first 10 pages to see which ones yield extractable text.
for i, page in enumerate(reader.pages[:10]):
    text = page.extract_text()
    # A missing or empty extraction counts as length 0.
    text_length = len(text or "")
    print(f"Page {i+1} has text length: {text_length}")


Total pages: 594

Page 1 has text length: 286
Page 2 has text length: 185
Page 3 has text length: 278
Page 4 has text length: 0
Page 5 has text length: 155
Page 6 has text length: 194
Page 7 has text length: 266
Page 8 has text length: 407
Page 9 has text length: 2560
Page 10 has text length: 3164
