In [None]:
# Mount Google Drive so the project files stored there become visible under
# /content/drive. `force_remount=True` refreshes the mount even when a previous
# session left one behind.
#
# The import is guarded so that "Run All" does not stop at this first cell when
# the notebook is executed outside Google Colab; the setup cell below already
# treats a missing `google.colab` package as a non-fatal condition.
try:
    from google.colab import drive
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
else:
    drive.mount("/content/drive", force_remount=True)

# pip install -r requirements.txt first, before continuing with the rest of the code

In [None]:
# Optional: install the required packages from inside the notebook.
# Ideally these live in requirements.txt and are installed once per environment,
# but keeping the command here makes the notebook self-contained and easy to share.
# Uncomment the next line to install them:
# !pip install trafilatura sentence-transformers torch pandas pyarrow duckdb scipy -q

import sys
import os
import warnings

# Silence the FutureWarning raised from huggingface_hub's download module,
# which sentence-transformers triggers on import/use.
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="huggingface_hub.file_download",
)

# Make the `src` package importable by putting the project root at the front of
# sys.path. The location is hard-coded for a Drive-hosted checkout; change it if
# the project lives somewhere else.
project_root = "/content/drive/My Drive/WebKnoGraph"  # Explicitly set

if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Report the import environment (three lines, one per fact).
print(
    f"Project root added to sys.path: {project_root}",
    f"Current working directory: {os.getcwd()}",
    f"sys.path: {sys.path}",
    sep="\n",
)

# Best-effort Google Drive mount: only mount when the Drive folder is not
# already visible, and never let a failure here abort the cell.
try:
    from google.colab import drive

    if os.path.exists("/content/drive/My Drive"):
        print("Google Drive already mounted.")
    else:
        drive.mount("/content/drive/")
        print("Google Drive mounted successfully.")
except ImportError:
    # google.colab is only available inside Colab runtimes.
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
except Exception as mount_error:
    print(f"Error mounting Google Drive: {mount_error}")

# Import from your refactored backend and shared modules
import gradio as gr
import io
import duckdb
import pandas as pd  # Added pandas import
from tqdm.auto import tqdm  # Added tqdm import for progress bar
import traceback  # Added traceback for error logging

# Specific imports for the Embedding Pipeline
from src.backend.config.embeddings_config import EmbeddingConfig
from src.backend.data.embedding_state_manager import EmbeddingStateManager
from src.backend.data.embeddings_loader import DataLoader
from src.backend.data.embeddings_saver import DataSaver
from src.backend.utils.text_processing import TextExtractor
from src.backend.utils.embedding_generation import EmbeddingGenerator
from src.backend.services.embeddings_service import EmbeddingPipeline
from src.shared.logging_config import (
    ConsoleAndGradioLogger,
)  # Using the updated generic logger

print("All modules imported successfully!")

In [None]:
# File: embeddings_ui.ipynb - Cell 3
def run_gradio_interface(input_path: str, output_path: str, batch_size: int):
    """Wire up the embedding pipeline components and run them for the Gradio UI.

    This is a generator. Every yielded item is a 3-tuple
    ``(status, log_text, summary_markdown)`` matching the three Gradio outputs
    bound to the start button.

    Args:
        input_path: Location of the input Parquet data; forwarded to
            ``EmbeddingConfig`` and, through it, to ``DataLoader``.
        output_path: Directory where embeddings are saved. It is also globbed
            for ``*.parquet`` files to build the final summary.
        batch_size: Number of pages to process per batch. The value is coerced
            to ``int`` because a ``gr.Number`` input may deliver a float (or
            ``None`` when the field is cleared).

    Yields:
        ``(status, logs, "")`` after every pipeline status update, followed by
        a final ``(final_status, logs, summary_md)`` once the pipeline is done.

    Raises:
        Exception: Any critical error raised inside the pipeline run is logged,
            reported as a status line, and then re-raised.
    """
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(log_stream, logger_name="EmbeddingLogger")

    # Normalise the UI value before it reaches the config: reject anything that
    # is not a whole number >= 1 instead of failing somewhere deep in the run.
    try:
        batch_size = int(batch_size)
    except (TypeError, ValueError, OverflowError):
        batch_size = 0
    if batch_size < 1:
        invalid_status = "Status: Invalid batch size. Please enter a whole number of at least 1."
        logger.error(invalid_status)
        yield invalid_status, log_stream.getvalue(), ""
        return

    config = EmbeddingConfig(
        input_path=input_path, output_path=output_path, batch_size=batch_size
    )

    # Instantiate all pipeline components.
    state_manager = EmbeddingStateManager(config.output_path, logger)
    data_loader = DataLoader(config.input_path, logger)
    text_extractor = TextExtractor()
    embedding_generator = EmbeddingGenerator(config.model_name, logger)
    data_saver = DataSaver(config.output_path, logger)

    # Local subclass that overrides `run` with the corrected batch loop.
    # Alternatively, apply these changes directly in
    # src/backend/services/embeddings_service.py.
    class FixedEmbeddingPipeline(EmbeddingPipeline):
        def run(self):
            """
            Orchestrates the embedding generation pipeline.
            Yields status updates for Gradio UI.
            """
            self.logger.info("Starting embedding pipeline...")
            yield "Status: Initializing Pipeline..."

            try:
                processed_urls = self.state_manager.get_processed_urls()
                self.logger.info(
                    f"Found {len(processed_urls)} URLs that have already been processed. They will be skipped."
                )
                yield f"Status: Resuming, skipping {len(processed_urls)} already processed URLs."

                self.logger.info("Querying for new pages to process...")
                data_iterator = self.data_loader.stream_unprocessed_data(
                    processed_urls=processed_urls, batch_size=self.config.batch_size
                )
                yield "Status: Loading new data..."

                total_processed_in_session = 0
                for batch_no, df_batch_arrow in enumerate(data_iterator, start=1):
                    if df_batch_arrow.num_rows == 0:
                        self.logger.info(f"Batch {batch_no} is empty, skipping.")
                        yield f"Status: Processed Batch {batch_no}: Empty."
                        continue

                    self.logger.info(
                        f"Processing Batch {batch_no} ({df_batch_arrow.num_rows} pages)..."
                    )

                    # Convert the PyArrow batch to pandas so columns can be added.
                    df_batch = df_batch_arrow.to_pandas()

                    # Extract clean text from the "Content" column.
                    df_batch["clean_text"] = [
                        self.text_extractor.extract(html_content)
                        for html_content in tqdm(
                            df_batch["Content"],
                            desc="Extracting Text",
                            leave=False,
                            unit="docs",
                        )
                    ]

                    # Keep only rows with real, non-blank text. A plain
                    # `.str.strip().astype(bool)` would keep rows whose text is
                    # missing (None becomes NaN, which is truthy), so the mask
                    # checks the type explicitly.
                    # `.copy()` detaches the filtered frame so the column
                    # assignment below does not write into a slice.
                    original_count = len(df_batch)
                    has_text = df_batch["clean_text"].map(
                        lambda text: isinstance(text, str) and bool(text.strip())
                    )
                    df_batch = df_batch[has_text].copy()
                    filtered_count = original_count - len(df_batch)
                    if filtered_count > 0:
                        self.logger.warning(
                            f"Filtered out {filtered_count} pages with no extractable text in Batch {batch_no}."
                        )

                    if df_batch.empty:
                        self.logger.warning(
                            f"Batch {batch_no} resulted in no extractable text after filtering, skipping."
                        )
                        yield f"Status: Processed Batch {batch_no}: No valid text extracted."
                        continue

                    # Generate embeddings for the surviving rows.
                    self.logger.info(f"Generating Embeddings for Batch {batch_no}...")
                    try:
                        # `generate` is expected to return an array-like with a
                        # `.tolist()` method (one vector per input text).
                        df_batch["embedding"] = self.embedding_generator.generate(
                            df_batch["clean_text"].tolist()
                        ).tolist()
                    except Exception as e:
                        # The logger's error() is not given exc_info; the
                        # traceback is embedded in the message instead.
                        error_message = f"Error generating embeddings for Batch {batch_no}: {e}\n{traceback.format_exc()}"
                        self.logger.error(error_message)
                        # Surface the failure in the UI like the other skip
                        # branches do, so the log pane refreshes immediately.
                        yield f"Status: Batch {batch_no}: embedding generation failed, skipping. Check logs."
                        continue

                    # Persist the batch right away so an interrupted run keeps
                    # everything produced so far.
                    self.data_saver.save_embeddings_batch(df_batch)

                    # NOTE: no explicit state update happens here. According to
                    # the original notes, EmbeddingStateManager has no
                    # `update_processed_urls(new_urls)` method; if one is added
                    # in src/backend/data/embedding_state_manager.py, call it
                    # here with df_batch["URL"].tolist().
                    # Presumably get_processed_urls() derives its state from the
                    # saved output - TODO confirm.
                    total_processed_in_session += len(df_batch)

                    yield f"Status: Processed Batch {batch_no} ({len(df_batch)} embeddings generated). Total in session: {total_processed_in_session}"

                self.logger.info("Embedding pipeline finished successfully.")
                yield "Status: Pipeline Finished Successfully!"

            except Exception as e:
                error_message = (
                    f"A critical pipeline error occurred: {e}\n{traceback.format_exc()}"
                )
                self.logger.error(error_message)
                yield f"Status: Critical Error! Check logs. Error: {e}"
                raise

    pipeline = FixedEmbeddingPipeline(
        config,
        logger,
        state_manager,
        data_loader,
        text_extractor,
        embedding_generator,
        data_saver,
    )

    final_status = "Initializing..."
    for status in pipeline.run():
        final_status = status
        # Stream the current status together with the full log so far.
        yield status, log_stream.getvalue(), ""

    # Build the final summary once the pipeline has finished.
    try:
        # DuckDB expects forward slashes, even on Windows.
        output_glob_path = os.path.join(output_path, "*.parquet").replace(os.sep, "/")
        # Double any single quote so a path containing one cannot break out of
        # the SQL string literal.
        escaped_glob_path = output_glob_path.replace("'", "''")
        total_embeddings = duckdb.query(
            f"SELECT COUNT(URL) FROM read_parquet('{escaped_glob_path}')"
        ).fetchone()[0]
        summary_md = f"### ✅ Pipeline Finished\n\n- **Final Status:** {final_status}\n- **Total embeddings saved:** {total_embeddings}\n- **Output location:** `{output_path}`"
    except Exception as e:
        logger.error(f"Could not generate final summary. Error: {e}")
        summary_md = (
            f"### Pipeline Finished\n\n- Could not generate summary. Error: {e}"
        )

    yield final_status, log_stream.getvalue(), summary_md

In [None]:
# File: embeddings_ui.ipynb - Cell 4
# Build the Gradio interface
# Declarative Gradio layout: a configuration column on the left (paths, batch
# size, start button) and a status/log/summary column on the right. The start
# button streams the tuples yielded by `run_gradio_interface` into the three
# output components.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Resumable Embedding Pipeline")
    gr.Markdown(
        "This tool reads HTML from Parquet files, cleans it, generates embeddings, and saves the results in batches. It can be stopped and resumed at any time."
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## 1. Configuration")
            # Defaults are read from the EmbeddingConfig class attributes.
            input_path_box = gr.Textbox(
                label="Input Parquet Folder Path", value=EmbeddingConfig.input_path
            )
            output_path_box = gr.Textbox(
                label="Output Embeddings Directory Path",
                value=EmbeddingConfig.output_path,
            )
            # precision=0 makes the component hand an int to the callback;
            # without it the value may arrive as a float, which is not a valid
            # batch size.
            # NOTE(review): maximum=5 caps the batch size; confirm that
            # EmbeddingConfig.batch_size falls inside the 1-5 range.
            batch_size_input = gr.Number(
                minimum=1,
                maximum=5,
                value=EmbeddingConfig.batch_size,
                step=1,
                precision=0,
                label="Batch Size",
                info="How many pages to process in memory at a time.",
            )
            start_button = gr.Button(
                "🚀 Start/Resume Embedding Generation", variant="primary"
            )

        with gr.Column(scale=2):
            gr.Markdown("## 2. Status & Results")
            status_output = gr.Textbox(label="Current Status", interactive=False)
            log_output = gr.Textbox(
                label="Detailed Logs", interactive=False, lines=10, max_lines=20
            )
            summary_output = gr.Markdown("---")

    # Output order must match the (status, logs, summary) tuples yielded by
    # run_gradio_interface.
    start_button.click(
        fn=run_gradio_interface,
        inputs=[input_path_box, output_path_box, batch_size_input],
        outputs=[status_output, log_output, summary_output],
    )

In [None]:
# File: embeddings_ui.ipynb - Cell 5
# --- Launch the Application ---
if __name__ == "__main__":
    # Step 1: best-effort Google Drive mount. This is kept separate from the
    # launch so that a missing `google.colab` package (i.e. running outside
    # Colab) or a mount failure does not prevent the UI from starting.
    try:
        from google.colab import drive

        # Only mount when the Drive folder is not already visible; no forced
        # remount is needed here.
        if not os.path.exists("/content/drive/My Drive"):
            drive.mount("/content/drive/")
            print("Google Drive mounted successfully.")
        else:
            print("Google Drive already mounted.")
    except ImportError:
        print("Not running in Google Colab environment. Skipping Google Drive mount.")
    except Exception as e:
        print(f"Error mounting Google Drive: {e}")

    # Step 2: launch the Gradio app. debug=True keeps the cell attached and
    # shows errors; share=True requests a public share link.
    try:
        demo.launch(debug=True, share=True)
    except Exception as e:
        print("Could not launch Gradio demo in this environment.")
        print(e)