In [26]:
import fitz  # PyMuPDF
import json
import os
from IPython.display import display, HTML, clear_output
import pandas as pd

In [33]:
class EnhancedNotebookPDFLabeler:
    def __init__(self, output_file="enhanced_training_data.json"):
        self.output_file = output_file
        self.labeled_data = []
        self.current_index = 0
        self.all_spans = []
        
        # Enhanced label system
        self.label_types = {
            0: "BODY_TEXT",
            1: "TITLE", 
            2: "H1",
            3: "H2",
            4: "H3",
            5: "H4"
        }
        
        # Load existing labels if they exist and are valid
        if os.path.exists(output_file):
            try:
                if os.path.getsize(output_file) > 0:
                    with open(output_file, 'r') as f:
                        self.labeled_data = json.load(f)
                    print(f"✅ Loaded {len(self.labeled_data)} existing labels from {output_file}")
                else:
                    print(f"📝 Found empty {output_file}, starting fresh")
            except (json.JSONDecodeError, FileNotFoundError) as e:
                print(f"⚠️ Could not load {output_file}, starting fresh")
                backup_name = f"{output_file}.backup"
                if os.path.exists(output_file):
                    os.rename(output_file, backup_name)
                    print(f"💾 Corrupted file backed up as {backup_name}")
        else:
            print(f"📝 No existing {output_file} found, starting fresh")
        
        self.existing_texts = {item['text'] for item in self.labeled_data}
    
    def load_pdf_spans(self, pdf_path):
        """Extract all text spans from PDF for labeling"""
        if not os.path.exists(pdf_path):
            print(f"❌ Error: PDF file not found at {pdf_path}")
            return False
        
        doc = fitz.open(pdf_path)
        self.all_spans = []
        
        for page_num, page in enumerate(doc):
            blocks = page.get_text("dict")["blocks"]
            for block in blocks:
                if "lines" in block:
                    for line in block["lines"]:
                        for span in line["spans"]:
                            text = span["text"].strip()
                            
                            # Skip short text or already labeled text
                            if len(text) <= 3 or text in self.existing_texts:
                                continue
                            
                            # Store span info for labeling
                            span_info = {
                                "text": text,
                                "page": page_num + 1,
                                "font_size": span["size"],
                                "is_bold": "Bold" in span["font"],
                                "is_italic": "Italic" in span["font"],
                                "x_pos": span["bbox"][0],
                                "raw_span": span
                            }
                            self.all_spans.append(span_info)
        
        doc.close()
        print(f"📄 Loaded {len(self.all_spans)} text spans from {pdf_path}")
        self.current_index = 0
        return True
    
    def display_current_span(self):
        """Display current span for labeling with enhanced options"""
        if self.current_index >= len(self.all_spans):
            print("🎉 All spans have been processed!")
            return False
        
        span = self.all_spans[self.current_index]
        
        # Create HTML display with enhanced labeling options
        html_content = f"""
        <div style="border: 1px solid #ddd; padding: 15px; margin: 10px 0; border-radius: 5px;">
            <h4>Span {self.current_index + 1} of {len(self.all_spans)}</h4>
            <p><strong>Page:</strong> {span['page']}</p>
            <p><strong>Text:</strong> <em>"{span['text']}"</em></p>
            <div style="background-color: #f5f5f5; padding: 10px; border-radius: 3px;">
                <strong>Features:</strong><br>
                • Font Size: {span['font_size']:.2f}<br>
                • Bold: {span['is_bold']}<br>
                • Italic: {span['is_italic']}<br>
                • X Position: {span['x_pos']:.2f}<br>
                • Text Length: {len(span['text'])}
            </div>
            <div style="background-color: #e8f4fd; padding: 10px; border-radius: 3px; margin-top: 10px;">
                <strong>🏷️ Labeling Options:</strong><br>
                • <strong>0:</strong> BODY_TEXT (paragraphs, form fields, regular content)<br>
                • <strong>1:</strong> TITLE (main document title)<br>
                • <strong>2:</strong> H1 (major section headings)<br>
                • <strong>3:</strong> H2 (subsection headings)<br>
                • <strong>4:</strong> H3 (sub-subsection headings)<br>
                • <strong>5:</strong> H4 (minor headings)
            </div>
        </div>
        """
        
        display(HTML(html_content))
        return True
    
    def label_current_span(self, label):
        """Label the current span with enhanced label system"""
        if self.current_index >= len(self.all_spans):
            print("❌ No more spans to label")
            return
        
        span = self.all_spans[self.current_index]
        
        if label not in self.label_types:
            print(f"❌ Invalid label. Use: {list(self.label_types.keys())}")
            return
        
        # Create enhanced feature vector
        features = [
            span["font_size"],
            int(span["is_bold"]),
            int(span["is_italic"]),
            len(span["text"]),
            span["x_pos"],
            int(span["text"][0].isdigit() if span["text"] else 0),
            int(span["text"].istitle()),
            span["text"].count('.'),
            span["page"]  # Added page number as feature
        ]
        
        # Add to labeled data with enhanced structure
        labeled_item = {
            "text": span["text"],
            "features": features,
            "label": label,
            "label_type": self.label_types[label],
            "page": span["page"],
            "font_size": span["font_size"],
            "is_bold": span["is_bold"]
        }
        
        self.labeled_data.append(labeled_item)
        self.existing_texts.add(span["text"])
        
        # Save progress
        self.save_progress()
        
        label_type = self.label_types[label]
        print(f"✅ Labeled as {label_type}: '{span['text']}'")
        
        # Move to next span
        self.current_index += 1
    
    def skip_current_span(self):
        """Skip the current span and move to next"""
        if self.current_index < len(self.all_spans):
            span = self.all_spans[self.current_index]
            print(f"⏭️ Skipped: '{span['text']}'")
            self.current_index += 1
        else:
            print("❌ No more spans to skip")
    
    def save_progress(self):
        """Save current progress to file"""
        try:
            with open(self.output_file, 'w') as f:
                json.dump(self.labeled_data, f, indent=2)
        except Exception as e:
            print(f"⚠️ Error saving progress: {e}")
    
    def get_stats(self):
        """Get enhanced labeling statistics"""
        if not self.labeled_data:
            print("No labeled data yet.")
            return
        
        # Count by label type
        label_counts = {}
        for item in self.labeled_data:
            label_type = item['label_type']
            label_counts[label_type] = label_counts.get(label_type, 0) + 1
        
        total = len(self.labeled_data)
        
        stats_html = f"""
        <div style="background-color: #e8f5e8; padding: 15px; border-radius: 5px; margin: 10px 0;">
            <h4>📊 Enhanced Labeling Statistics</h4>
            <p><strong>Total Labeled:</strong> {total}</p>
            <p><strong>Progress:</strong> {self.current_index}/{len(self.all_spans)} spans processed</p>
            <div style="margin-top: 10px;">
                <strong>Label Distribution:</strong>
                <ul>
        """
        
        for label_type, count in sorted(label_counts.items()):
            percentage = (count / total) * 100
            stats_html += f"<li><strong>{label_type}:</strong> {count} ({percentage:.1f}%)</li>"
        
        stats_html += """
                </ul>
            </div>
        </div>
        """
        display(HTML(stats_html))

# Initialize the enhanced labeler
labeler = EnhancedNotebookPDFLabeler("enhanced_training_data.json")

📝 No existing enhanced_training_data.json found, starting fresh


In [34]:
pdf_file = "sample-5-page-pdf-a4-size.pdf"  # Change this to your PDF filename
success = labeler.load_pdf_spans(pdf_file)

if success:
    print("🚀 Ready to start labeling!")
    print("\nInstructions:")
    print("• Run the next cell to see the current text span")
    print("• Use labeler.label_current_span(1) for HEADINGS")
    print("• Use labeler.label_current_span(0) for BODY TEXT") 
    print("• Use labeler.skip_current_span() to skip")
    print("• Use labeler.get_stats() to see progress")

📄 Loaded 62 text spans from sample-5-page-pdf-a4-size.pdf
🚀 Ready to start labeling!

Instructions:
• Run the next cell to see the current text span
• Use labeler.label_current_span(1) for HEADINGS
• Use labeler.label_current_span(0) for BODY TEXT
• Use labeler.skip_current_span() to skip
• Use labeler.get_stats() to see progress


In [49]:
# SKIP - Run this cell to skip current span
labeler.skip_current_span()
labeler.display_current_span()  # Show next span automatically


⏭️ Skipped: 'Introduction'


True

In [64]:
# Run this cell to see the current span to label
labeler.display_current_span()


True

In [41]:
# TITLE - Run this for main document title
labeler.label_current_span(1)
labeler.display_current_span()

✅ Labeled as TITLE: 'SmartHome Hub'


True

In [99]:
# H1 HEADING - Run this for major section headings
labeler.label_current_span(2)
labeler.display_current_span()


✅ Labeled as H1: 'Next Steps'


True

In [78]:
# H2 HEADING - Run this for subsection headings  
labeler.label_current_span(3)
labeler.display_current_span()


✅ Labeled as H2: 'Company Market Share Key Differentiator'


True

In [None]:
# H3 HEADING - Run this for sub-subsection headings
labeler.label_current_span(4)
labeler.display_current_span()


In [104]:
# BODY TEXT - Run this for regular content
labeler.label_current_span(0)
labeler.display_current_span()


✅ Labeled as BODY_TEXT: 'Prepare for the official launch event.'
🎉 All spans have been processed!


False

In [73]:
# Check your labeling progress
labeler.get_stats()


In [None]:
# View recent labels in a nice table
if labeler.labeled_data:
    recent_labels = labeler.labeled_data[-10:]  # Last 10 labels
    df = pd.DataFrame([{
        'Text': item['text'][:50] + '...' if len(item['text']) > 50 else item['text'],
        'Label': 'HEADING' if item['label'] == 1 else 'BODY TEXT',
        'Page': item['page']
    } for item in recent_labels])
    
    print("📋 Recent Labels:")
    display(df)


In [None]:
# When done with current PDF, load the next one
next_pdf = "file02.pdf"  # Change to your next PDF filename
success = labeler.load_pdf_spans(next_pdf)

if success:
    print(f"📄 Loaded {next_pdf} for labeling")
    labeler.display_current_span()


In [None]:
# Export your completed training data
print(f"💾 Training data saved to: {labeler.output_file}")
print(f"Total labeled examples: {len(labeler.labeled_data)}")

# Optional: Create a summary DataFrame
if labeler.labeled_data:
    summary_df = pd.DataFrame(labeler.labeled_data)
    print("\n📊 Label Distribution:")
    print(summary_df['label'].value_counts().rename({0: 'Body Text', 1: 'Headings'}))
