In [2]:
# --- CELL 1: Install Required Libraries ---
# Notebook shell command (the leading "!" hands the line to the shell, so this
# only runs inside a notebook). Installs the third-party packages imported in
# the next cell; googletrans is pinned to the 4.0.0-rc1 release.
!pip install -q requests beautifulsoup4 readability-lxml python-docx googletrans==4.0.0-rc1 ipywidgets

# --- CELL 2: Import Libraries ---
import requests
from bs4 import BeautifulSoup
from readability import Document
from docx import Document as DocxDocument
from docx.shared import Pt
from googletrans import Translator
import ipywidgets as widgets
from IPython.display import display, clear_output
import re
import time
from google.colab import files

# Initialize translator
# A single module-level Translator instance is created here; it is the
# `translator` global that translate_sentences() calls for every sentence.
print("Loading Google Translate...")
translator = Translator()

# --- CELL 3: Extract Passage Function ---
def extract_passage(url):
    """
    Extract the main article text from a web page using readability.

    Args:
        url: Address of the page to download.

    Returns:
        dict with the keys:
            "title": article title reported by readability, or
                "Untitled Article" when that title is empty; "Error" on
                failure.
            "passage": cleaned article text; on failure, a message starting
                with "Failed to extract: ".
            "word_count": number of whitespace-separated tokens in the
                passage (0 on failure).
            "success": True when extraction finished, False otherwise.

    Never raises: any exception (network error, HTTP error status, parsing
    problem) is converted into the failure dict so callers only need to
    check "success".
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # requests assumes ISO-8859-1 for text/* responses whose Content-Type
        # header carries no charset, which garbles umlauts and ß on UTF-8
        # pages. In that case (or when no encoding was determined at all)
        # fall back to the encoding detected from the body itself.
        declared_encoding = (response.encoding or '').lower()
        if declared_encoding in ('', 'iso-8859-1'):
            response.encoding = response.apparent_encoding
        html = response.text

        # Use readability to extract main content
        doc = Document(html)
        title = doc.title().strip() or "Untitled Article"
        clean_html = doc.summary()

        soup = BeautifulSoup(clean_html, 'html.parser')
        raw_text = soup.get_text(separator='\n', strip=True)

        # Clean text: collapse runs of newlines first, then any remaining
        # run of two or more whitespace characters into a single space.
        passage = re.sub(r'\n+', '\n', raw_text)
        passage = re.sub(r'\s{2,}', ' ', passage).strip()

        return {
            "title": title,
            "passage": passage,
            "word_count": len(passage.split()),
            "success": True
        }
    except Exception as e:
        # Deliberately broad: this function is the boundary that turns every
        # failure into a result the UI can print.
        return {
            "title": "Error",
            "passage": f"Failed to extract: {str(e)}",
            "word_count": 0,
            "success": False
        }

# --- CELL 4: Split into Sentences Function ---
def split_into_sentences(text):
    """
    Split German text into sentences.

    A sentence boundary is a run of whitespace that comes directly after
    ".", "!" or "?" and directly before a letter (A-Z, a-z, or a German
    umlaut in either case). The punctuation stays attached to the sentence
    it ends. Text after punctuation that starts with a digit, a quote or any
    other character is not split off.

    Args:
        text: The passage to split.

    Returns:
        List of sentences with surrounding whitespace removed. Fragments of
        five characters or fewer (measured after stripping) are dropped.
    """
    # Lookbehind/lookahead make the split consume only the whitespace, so the
    # closing punctuation and the next sentence's first letter are preserved.
    pieces = re.split(r'(?<=[.!?])\s+(?=[A-ZÄÖÜa-zäöü])', text)

    # Strip before measuring: otherwise leading/trailing padding counts
    # toward the length and lets too-short fragments through the filter.
    trimmed = (piece.strip() for piece in pieces)
    return [sentence for sentence in trimmed if len(sentence) > 5]  # Filter short fragments

# --- CELL 5: Translate Sentences Function ---
def translate_sentences(sentences):
    """
    Produce an English rendering for every German sentence, in order.

    Each sentence is sent individually to the module-level `translator`
    (source language 'de', target 'en'). A sentence whose translation raises
    is represented by a "[Translation failed: ...]" marker instead, so the
    result always has one entry per input sentence.

    Args:
        sentences: Iterable of German sentences.

    Returns:
        List of English strings, positionally aligned with the input.
    """
    def _to_english(german):
        # Best effort: one failed request must not abort the whole batch.
        try:
            return translator.translate(german, src='de', dest='en').text
        except Exception as err:
            return f"[Translation failed: {err}]"

    results = []
    for german in sentences:
        results.append(_to_english(german))
        # Pause after every request (successful or not) to throttle calls.
        time.sleep(0.5)
    return results

# --- CELL 6: Create Word Table Function ---
def create_bilingual_table(title, german_sentences, english_sentences, output_filename):
    """
    Create .docx with 2-column table: Left=German, Right=English.
    """
    doc = DocxDocument()

    # Title and metadata
    doc.add_heading(title, 0)
    doc.add_paragraph(f"Source URL: {time.strftime('%Y-%m-%d %H:%M')}")  # Placeholder for URL
    doc.add_page_break()

    # Create table
    table = doc.add_table(rows=0, cols=2, style='Table Grid')
    table.autofit = True
    table.columns[0].width = Pt(300)  # Adjust widths as needed
    table.columns[1].width = Pt(300)

    # Headers
    hdr_cells = table.add_row().cells
    hdr_cells[0].text = "German Sentence"
    hdr_cells[1].text = "English Translation"

    # Bold headers
    for cell in hdr_cells:
        for paragraph in cell.paragraphs:
            for run in paragraph.runs:
                run.bold = True
                run.font.size = Pt(12)

    # Add sentence pairs
    for german, english in zip(german_sentences, english_sentences):
        row_cells = table.add_row().cells
        row_cells[0].text = german
        row_cells[1].text = english

        # Optional: Style English (e.g., italic)
        for paragraph in row_cells[1].paragraphs:
            for run in paragraph.runs:
                run.italic = True
                run.font.size = Pt(11)

    # Save
    doc.save(output_filename)
    return output_filename

# --- CELL 7: Main Workflow Function ---
def process_website_to_bilingual_doc(url):
    """
    Full pipeline: Extract → Split → Translate → Create Table → Download.
    """
    # Step 1: Extract passage
    print("Extracting passage from website...")
    result = extract_passage(url)
    if not result["success"]:
        print(result["passage"])
        return

    passage = result["passage"]
    title = result["title"]
    print(f"Extracted: {result['word_count']} words from '{title}'")

    # Step 2: Split into sentences
    german_sentences = split_into_sentences(passage)
    if not german_sentences:
        print("No sentences found.")
        return
    print(f"Split into {len(german_sentences)} sentences.")

    # Step 3: Translate
    print("Translating sentences...")
    english_sentences = translate_sentences(german_sentences)

    # Step 4: Create and save Word file
    output_filename = f"Bilingual_Table_{re.sub(r'[^\w\-]', '_', title[:50])}.docx"
    create_bilingual_table(title, german_sentences, english_sentences, output_filename)

    print(f"\nSuccess! Created '{output_filename}'")
    print("Table format: Left column = German, Right column = English")
    files.download(output_filename)

# --- CELL 8: Interactive UI ---
# Text box for the page address, pre-filled with a sample German news URL.
url_input = widgets.Text(
    value='https://de.yahoo.com/nachrichten/outlander-geht-finale-runde-115657087.html',
    placeholder='Enter website URL with German text',
    description='URL:',
    layout=widgets.Layout(width='100%')
)

# Button that starts the pipeline; its click handler is attached further down.
process_btn = widgets.Button(
    description='Extract Passage → Translate Sentence-by-Sentence → Create Word Table',
    button_style='success',
    layout=widgets.Layout(width='100%', height='50px')
)

# Output area that on_process_click clears and prints into.
output = widgets.Output()

def on_process_click(b):
    """
    Click handler for the process button.

    Clears the output area, reads the URL from the text box and runs the
    pipeline on it; an empty (or whitespace-only) URL only prints a prompt.

    Args:
        b: The button instance passed by the click callback (unused).
    """
    with output:
        clear_output()
        target = url_input.value.strip()
        if target:
            process_website_to_bilingual_doc(target)
        else:
            print("Please enter a URL.")

# Register the handler so a click on the button runs on_process_click.
process_btn.on_click(on_process_click)

# --- CELL 9: Display UI ---
# Render the URL box, the button and the output area, in that order.
display(url_input, process_btn, output)

Loading Google Translate...


Text(value='https://de.yahoo.com/nachrichten/outlander-geht-finale-runde-115657087.html', description='URL:', …

Button(button_style='success', description='Extract Passage → Translate Sentence-by-Sentence → Create Word Tab…

Output()