In [None]:
# Mount Google Drive when running inside Colab. Outside Colab the
# `google.colab` package is not importable, so skip the mount instead of
# aborting the notebook at its very first cell. Only the import is guarded:
# a genuine mount failure inside Colab still surfaces as an error.
try:
    from google.colab import drive
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
else:
    drive.mount("/content/drive")

# Run `pip install -r requirements.txt` before continuing with the rest of the notebook.

In [None]:
# Installing necessary packages!
# These should ideally be in requirements.txt and installed once for the environment.
# But for a notebook that's meant to be self-contained for easy sharing/running,
# it's common to keep them here.
# !pip install trafilatura sentence-transformers torch pandas pyarrow duckdb scipy -q
# !pip install fireducks

import sys
import os
import warnings

# Silence FutureWarnings raised from huggingface_hub.file_download, which
# commonly show up when using the sentence-transformers library.
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="huggingface_hub.file_download",
)

# Make the `src` package importable by putting the project root at the front
# of the import search path. The location below assumes the project lives in
# Google Drive under 'WebKnoGraph'; edit it if your checkout is elsewhere.
project_root = "/content/drive/My Drive/WebKnoGraph"

# Insert only once so re-running this cell does not pile up duplicates.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Diagnostics: show where imports will be resolved from.
print(
    f"Project root added to sys.path: {project_root}",
    f"Current working directory: {os.getcwd()}",
    f"sys.path: {sys.path}",
    sep="\n",
)

# Google Colab Drive mount (best effort).
# Outside Colab the import fails and the mount is skipped; any other problem
# while mounting is reported but does not stop the cell.
try:
    from google.colab import drive

    # The Drive folder already being present means an earlier cell mounted it.
    if os.path.exists("/content/drive/My Drive"):
        print("Google Drive already mounted.")
    else:
        drive.mount("/content/drive/")
        print("Google Drive mounted successfully.")
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
except Exception as mount_error:
    print(f"Error mounting Google Drive: {mount_error}")

# Import from your refactored backend and shared modules
import gradio as gr
import io
import duckdb

# Specific imports for the Embedding Pipeline
from src.backend.config.embeddings_config import EmbeddingConfig
from src.backend.data.embedding_state_manager import EmbeddingStateManager
from src.backend.data.embeddings_loader import DataLoader
from src.backend.data.embeddings_saver import DataSaver
from src.backend.utils.text_processing import TextExtractor
from src.backend.utils.embedding_generation import EmbeddingGenerator
from src.backend.services.embeddings_service import EmbeddingPipeline
from src.shared.logging_config import (
    ConsoleAndGradioLogger,
)  # Using the updated generic logger

print("All modules imported successfully!")

In [None]:
def run_gradio_interface(input_path: str, output_path: str, batch_size: int):
    """Wire up the embedding pipeline and stream its progress to the Gradio UI.

    This is a generator. While the pipeline runs, every status it reports is
    yielded together with the log text accumulated so far. Once the pipeline
    is exhausted, a final tuple carrying a Markdown summary is yielded.

    Args:
        input_path: Folder containing the input Parquet files.
        output_path: Directory where embedding Parquet files are written; it
            is also scanned afterwards to count the saved embeddings.
        batch_size: Number of pages to process per batch. Coerced to ``int``
            because UI widgets may hand numeric values over as floats.

    Yields:
        tuple[str, str, str]: ``(status, logs, summary_markdown)``. The
        summary is an empty string for every intermediate update.
    """
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(log_stream, logger_name="EmbeddingLogger")

    config = EmbeddingConfig(
        input_path=input_path,
        output_path=output_path,
        batch_size=int(batch_size),  # honour the annotated type
    )

    # Instantiate the collaborators the pipeline is composed of.
    state_manager = EmbeddingStateManager(config.output_path, logger)
    data_loader = DataLoader(config.input_path, logger)
    text_extractor = TextExtractor()
    embedding_generator = EmbeddingGenerator(config.model_name, logger)
    data_saver = DataSaver(config.output_path, logger)

    pipeline = EmbeddingPipeline(
        config,
        logger,
        state_manager,
        data_loader,
        text_extractor,
        embedding_generator,
        data_saver,
    )

    final_status = "Initializing..."
    for status in pipeline.run():
        final_status = status
        # Stream the current status plus the full log so far; no summary yet.
        yield status, log_stream.getvalue(), ""

    # Build the final summary after the pipeline finishes.
    try:
        # DuckDB expects forward slashes in glob patterns, even on Windows.
        output_glob_path = os.path.join(output_path, "*.parquet").replace(os.sep, "/")
        # The path comes straight from a UI text box. Double any single quote
        # so it cannot terminate the SQL string literal (a path such as
        # "it's" would otherwise break, or alter, the query).
        glob_literal = output_glob_path.replace("'", "''")
        total_embeddings = duckdb.query(
            f"SELECT COUNT(URL) FROM read_parquet('{glob_literal}')"
        ).fetchone()[0]
        summary_md = f"### ✅ Pipeline Finished\n\n- **Final Status:** {final_status}\n- **Total embeddings saved:** {total_embeddings}\n- **Output location:** `{output_path}`"
    except Exception as e:
        # Best effort: a failed count must not hide the pipeline's result.
        logger.error(f"Could not generate final summary. Error: {e}")
        summary_md = (
            f"### Pipeline Finished\n\n- Could not generate summary. Error: {e}"
        )

    yield final_status, log_stream.getvalue(), summary_md

In [None]:
# Build the Gradio UI: a configuration column on the left and a wider
# status/results column on the right. Components are laid out in the order
# they are created inside each layout context, so statement order matters.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Resumable Embedding Pipeline")
    gr.Markdown(
        "This tool reads HTML from Parquet files, cleans it, generates embeddings, and saves the results in batches. It can be stopped and resumed at any time."
    )

    with gr.Row():
        # Left column: user-editable settings, pre-filled from the
        # EmbeddingConfig class attributes.
        with gr.Column(scale=1):
            gr.Markdown("## 1. Configuration")
            input_path_box = gr.Textbox(
                label="Input Parquet Folder Path", value=EmbeddingConfig.input_path
            )
            output_path_box = gr.Textbox(
                label="Output Embeddings Directory Path",
                value=EmbeddingConfig.output_path,
            )
            # Batch size is restricted to 10-50 in steps of 10.
            # NOTE(review): confirm EmbeddingConfig.batch_size lies within
            # this range, otherwise the initial slider value may not match it.
            batch_size_slider = gr.Slider(
                minimum=10,
                maximum=50,
                value=EmbeddingConfig.batch_size,
                step=10,
                label="Batch Size",
                info="How many pages to process in memory at a time.",
            )
            start_button = gr.Button(
                "🚀 Start/Resume Embedding Generation", variant="primary"
            )

        # Right column (twice as wide): read-only status, logs and summary.
        with gr.Column(scale=2):
            gr.Markdown("## 2. Status & Results")
            status_output = gr.Textbox(label="Current Status", interactive=False)
            log_output = gr.Textbox(
                label="Detailed Logs", interactive=False, lines=10, max_lines=20
            )
            summary_output = gr.Markdown("---")

    # run_gradio_interface yields (status, logs, summary) tuples; each tuple
    # maps positionally onto the three output components listed below.
    start_button.click(
        fn=run_gradio_interface,
        inputs=[input_path_box, output_path_box, batch_size_slider],
        outputs=[status_output, log_output, summary_output],
    )

In [None]:
# --- Launch the Application ---
if __name__ == "__main__":
    # Step 1: mount Google Drive when running in Colab (best effort).
    # This is kept separate from the launch below so that a missing
    # `google.colab` package (i.e. running outside Colab) or a failed mount
    # does not prevent the Gradio app from starting.
    try:
        from google.colab import drive

        # Skip the mount when Drive is already available.
        if not os.path.exists("/content/drive/My Drive"):
            drive.mount("/content/drive/")
            print("Google Drive mounted successfully.")
        else:
            print("Google Drive already mounted.")
    except ImportError:
        print("Not running in Google Colab environment. Skipping Google Drive mount.")
    except Exception as e:
        print(f"Error mounting Google Drive: {e}")

    # Step 2: start the UI. Launch problems are reported rather than raised
    # so the notebook cell finishes cleanly.
    try:
        demo.launch(debug=True, share=True)
    except Exception as e:
        print("Could not launch Gradio demo in this environment.")
        print(e)