In [1]:
!pip install trafilatura sentence-transformers torch pandas pyarrow duckdb scipy



In [1]:
#
# --- Step 1: Install Required Libraries ---
#
!pip install -q gradio trafilatura sentence-transformers torch pandas pyarrow duckdb beautifulsoup4 lxml

#
# --- Step 2: Import Libraries ---
#
import gradio as gr
import duckdb
import pandas as pd
import numpy as np
import os
import re
import io
import logging
import time
import trafilatura
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from abc import ABC, abstractmethod
from dataclasses import dataclass
import warnings

# Suppress a common warning from the sentence-transformers library
warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub.file_download")


#
# --- Step 3: Configuration & Core Interfaces ---
#

@dataclass
class EmbeddingConfig:
    """Holds all configuration settings for the pipeline."""
    input_path: str = "/content/drive/My Drive/master_july_2025/data/crawled_data_parquet/"
    output_path: str = "/content/drive/My Drive/master_july_2025/data/url_embeddings/"
    model_name: str = 'all-MiniLM-L6-v2' # should be changed to other/multilingual content if the content is not in English
    batch_size: int = 10

class ILogger(ABC):
    """Interface for logging messages."""
    @abstractmethod
    def info(self, message: str): pass
    @abstractmethod
    def error(self, message: str): pass
    @abstractmethod
    def exception(self, message: str): pass

class ConsoleAndGradioLogger(ILogger):
    """Logs messages to the console and a Gradio UI component."""
    def __init__(self, log_output_stream: io.StringIO, level=logging.INFO):
        self._logger = logging.getLogger("EmbeddingLogger")
        self._logger.setLevel(level)
        if self._logger.hasHandlers():
            self._logger.handlers.clear()

        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        self._logger.addHandler(console_handler)

        # Gradio handler
        gradio_handler = logging.StreamHandler(log_output_stream)
        gradio_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        self._logger.addHandler(gradio_handler)

    def info(self, message: str): self._logger.info(message)
    def error(self, message: str): self._logger.error(message)
    def exception(self, message: str): self._logger.exception(message)

#
# --- Step 4: Component Classes (Single Responsibility Principle) ---
#

class EmbeddingStateManager:
    """Manages the state of the embedding process, enabling resumes."""
    def __init__(self, output_path: str, logger: ILogger):
        self.output_path = output_path
        self.logger = logger

    def get_processed_urls(self) -> set:
        """Scans the output directory to find URLs that have already been embedded."""
        processed_urls = set()
        if not os.path.exists(self.output_path):
            os.makedirs(self.output_path)
            self.logger.info("Output directory created.")
            return processed_urls

        try:
            output_glob_path = os.path.join(self.output_path, '*.parquet')
            # Use DuckDB for efficient scanning of existing results
            processed_df = duckdb.query(f"SELECT DISTINCT URL FROM read_parquet('{output_glob_path}')").to_df()
            processed_urls = set(processed_df['URL'])
            if processed_urls:
                self.logger.info(f"Found {len(processed_urls)} URLs that have already been processed. They will be skipped.")
        except Exception:
            self.logger.info("No previously processed embeddings found. Starting fresh.")
        return processed_urls

class DataLoader:
    """Responsible for loading unprocessed data in batches."""
    def __init__(self, input_path: str, logger: ILogger):
        self.input_path = input_path
        self.logger = logger
        self.con = duckdb.connect()

    def stream_unprocessed_data(self, processed_urls: set, batch_size: int):
        """A generator that yields batches of new data to be processed."""
        input_glob_path = os.path.join(self.input_path, '**', '*.parquet')
        base_query = f"SELECT URL, Content FROM read_parquet('{input_glob_path}') WHERE Status_Code >= 200 AND Status_Code < 300 AND Content IS NOT NULL AND Content != ''"

        if processed_urls:
            processed_urls_df = pd.DataFrame(list(processed_urls), columns=['URL'])

            # --- THIS IS THE FIX ---
            # We replace the non-standard "LEFT ANTI JOIN" with a standard
            # "LEFT JOIN" and a "WHERE ... IS NULL" check. This achieves the same goal.
            final_query = f"""
                SELECT t1.URL, t1.Content
                FROM ({base_query}) AS t1
                LEFT JOIN processed_urls_df AS t2 ON t1.URL = t2.URL
                WHERE t2.URL IS NULL
            """
            # --- END OF FIX ---
        else:
            final_query = base_query

        self.logger.info("Querying for new pages to process...")
        try:
            # Use fetch_record_batch for memory-efficient iteration
            for batch in self.con.execute(final_query).fetch_record_batch(batch_size):
                yield batch.to_pandas()
        except Exception as e:
            self.logger.error(f"Could not query Parquet files. Please check the input path: {e}")
            return

class TextExtractor:
    """Extracts clean text from raw HTML."""
    def extract(self, html_content: str) -> str:
        if not html_content or not isinstance(html_content, str):
            return ""
        text = trafilatura.extract(html_content, include_comments=False, include_tables=False, deduplicate=True)
        if text:
            text = re.sub(r'\n\s*\n', '\n\n', text)
            return text.strip()
        return ""

class EmbeddingGenerator:
    """Generates embeddings for a list of texts."""
    def __init__(self, model_name: str, logger: ILogger):
        self.logger = logger
        self.logger.info(f"Loading embedding model: {model_name}...")
        self.model = SentenceTransformer(model_name)
        self.logger.info("Model loaded successfully.")

    def generate(self, texts: list[str]) -> np.ndarray:
        self.logger.info(f"Generating embeddings for {len(texts)} texts...")
        return self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

class DataSaver:
    """Saves a batch of embeddings to a Parquet file."""
    def __init__(self, output_path: str, logger: ILogger):
        self.output_path = output_path
        self.logger = logger

    def save_batch(self, df_batch: pd.DataFrame, batch_num: int):
        """Saves a DataFrame of URLs and embeddings to a uniquely named file."""
        batch_filename = f"embeddings_batch_{int(time.time())}_{batch_num}.parquet"
        batch_output_path = os.path.join(self.output_path, batch_filename)
        df_batch.to_parquet(batch_output_path, index=False)
        self.logger.info(f"✅ Saved batch {batch_num} to {batch_filename}")

#
# --- Step 5: The Main Pipeline Orchestrator ---
#

class EmbeddingPipeline:
    """Orchestrates the entire embedding generation process."""
    def __init__(self, config: EmbeddingConfig, logger: ILogger, state_manager: EmbeddingStateManager,
                 data_loader: DataLoader, text_extractor: TextExtractor,
                 embedding_generator: EmbeddingGenerator, data_saver: DataSaver):
        self.config = config
        self.logger = logger
        self.state_manager = state_manager
        self.data_loader = data_loader
        self.text_extractor = text_extractor
        self.embedding_generator = embedding_generator
        self.data_saver = data_saver

    def run(self):
        """A generator that executes the pipeline and yields status updates."""
        try:
            yield "Initializing..."
            processed_urls = self.state_manager.get_processed_urls()

            yield "Loading model and querying data..."
            data_stream = self.data_loader.stream_unprocessed_data(processed_urls, self.config.batch_size)

            batch_num = 1
            processed_in_this_session = False
            for df_batch in data_stream:
                processed_in_this_session = True
                status_msg = f"Processing Batch {batch_num} ({len(df_batch)} pages)..."
                self.logger.info(status_msg)
                yield status_msg

                # Extract Text
                df_batch['clean_text'] = [self.text_extractor.extract(html) for html in tqdm(df_batch['Content'], desc="Extracting Text")]
                df_batch = df_batch[df_batch['clean_text'].str.len() > 100]

                if df_batch.empty:
                    self.logger.info("Batch had no pages with sufficient text after cleaning.")
                    continue

                # Generate Embeddings
                embeddings = self.embedding_generator.generate(df_batch['clean_text'].tolist())

                # Save Batch
                output_df = pd.DataFrame({'URL': df_batch['URL'], 'Embedding': [e.tolist() for e in embeddings]})
                self.data_saver.save_batch(output_df, batch_num)
                batch_num += 1

            if not processed_in_this_session:
                self.logger.info("No new pages to process. The dataset is already up to date.")
                yield "Already up to date."
            else:
                self.logger.info("All new batches processed successfully.")
                yield "Finished"

        except Exception as e:
            self.logger.exception(f"A critical pipeline error occurred: {e}")
            yield f"Error: {e}"


#
# --- Step 6: Gradio UI and Main Execution Logic ---
#

def run_gradio_interface(input_path: str, output_path: str, batch_size: int):
    """Wires up all components and runs the pipeline, yielding UI updates."""
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(log_stream)

    config = EmbeddingConfig(input_path=input_path, output_path=output_path, batch_size=batch_size)

    # Instantiate all our components
    state_manager = EmbeddingStateManager(config.output_path, logger)
    data_loader = DataLoader(config.input_path, logger)
    text_extractor = TextExtractor()
    embedding_generator = EmbeddingGenerator(config.model_name, logger)
    data_saver = DataSaver(config.output_path, logger)

    pipeline = EmbeddingPipeline(config, logger, state_manager, data_loader, text_extractor, embedding_generator, data_saver)

    final_status = "Initializing..."
    for status in pipeline.run():
        final_status = status
        # Yield the current status and the full log content
        yield status, log_stream.getvalue(), ""

    # Generate final summary after the pipeline finishes
    try:
        output_glob_path = os.path.join(output_path, '*.parquet')
        total_embeddings = duckdb.query(f"SELECT COUNT(URL) FROM read_parquet('{output_glob_path}')").fetchone()[0]
        summary_md = f"### ✅ Pipeline Finished\n\n- **Final Status:** {final_status}\n- **Total embeddings saved:** {total_embeddings}\n- **Output location:** `{output_path}`"
    except Exception as e:
        summary_md = f"### Pipeline Finished\n\n- Could not generate summary. Error: {e}"

    yield final_status, log_stream.getvalue(), summary_md


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 Resumable Embedding Pipeline")
    gr.Markdown("This tool reads HTML from Parquet files, cleans it, generates embeddings, and saves the results in batches. It can be stopped and resumed at any time.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## 1. Configuration")
            input_path_box = gr.Textbox(
                label="Input Parquet Folder Path",
                value=EmbeddingConfig.input_path
            )
            output_path_box = gr.Textbox(
                label="Output Embeddings Directory Path",
                value=EmbeddingConfig.output_path
            )
            batch_size_slider = gr.Slider(
                minimum=10, maximum=50, value=EmbeddingConfig.batch_size, step=10,
                label="Batch Size",
                info="How many pages to process in memory at a time."
            )
            start_button = gr.Button("🚀 Start/Resume Embedding Generation", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("## 2. Status & Results")
            status_output = gr.Textbox(label="Current Status", interactive=False)
            log_output = gr.Textbox(label="Detailed Logs", interactive=False, lines=10, max_lines=20)
            summary_output = gr.Markdown("---")

    start_button.click(
        fn=run_gradio_interface,
        inputs=[input_path_box, output_path_box, batch_size_slider],
        outputs=[status_output, log_output, summary_output]
    )

#
# --- Launch the Application ---
#
if __name__ == '__main__':
    try:
        from google.colab import drive
        drive.mount('/content/drive/', force_remount=True)
        demo.launch(debug=True, share=True)
    except Exception as e:
        print("Could not launch Gradio demo in this environment.")
        print(e)

Mounted at /content/drive/
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d7b296b1fb8709aeb8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


2025-06-15 18:48:48,802 - INFO - Loading embedding model: all-MiniLM-L6-v2...
INFO:EmbeddingLogger:Loading embedding model: all-MiniLM-L6-v2...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
2025-06-15 18:48:52,932 - INFO - Model loaded successfully.
INFO:EmbeddingLogger:Model loaded successfully.
2025-06-15 18:48:53,050 - INFO - Found 100 URLs that have already been processed. They will be skipped.
INFO:EmbeddingLogger:Found 100 URLs that have already been processed. They will be skipped.
2025-06-15 18:48:53,057 - INFO - Querying for new pages to process...
INFO:EmbeddingLogger:Querying for new pages to process...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

2025-06-15 18:48:55,989 - INFO - Processing Batch 1 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 1 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.78it/s]
2025-06-15 18:48:57,027 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:48:59,238 - INFO - ✅ Saved batch 1 to embeddings_batch_1750013339_1.parquet
INFO:EmbeddingLogger:✅ Saved batch 1 to embeddings_batch_1750013339_1.parquet
2025-06-15 18:48:59,259 - INFO - Processing Batch 2 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 2 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 12.03it/s]
2025-06-15 18:49:00,105 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:01,460 - INFO - ✅ Saved batch 2 to embeddings_batch_1750013341_2.parquet
INFO:EmbeddingLogger:✅ Saved batch 2 to embeddings_batch_1750013341_2.parquet
2025-06-15 18:49:01,474 - INFO - Processing Batch 3 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 3 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 18.76it/s]
2025-06-15 18:49:02,018 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:02,866 - INFO - ✅ Saved batch 3 to embeddings_batch_1750013342_3.parquet
INFO:EmbeddingLogger:✅ Saved batch 3 to embeddings_batch_1750013342_3.parquet
2025-06-15 18:49:02,879 - INFO - Processing Batch 4 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 4 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.83it/s]
2025-06-15 18:49:03,523 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:04,654 - INFO - ✅ Saved batch 4 to embeddings_batch_1750013344_4.parquet
INFO:EmbeddingLogger:✅ Saved batch 4 to embeddings_batch_1750013344_4.parquet
2025-06-15 18:49:04,680 - INFO - Processing Batch 5 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 5 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.22it/s]
2025-06-15 18:49:05,270 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:06,349 - INFO - ✅ Saved batch 5 to embeddings_batch_1750013346_5.parquet
INFO:EmbeddingLogger:✅ Saved batch 5 to embeddings_batch_1750013346_5.parquet
2025-06-15 18:49:06,365 - INFO - Processing Batch 6 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 6 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.25it/s]
2025-06-15 18:49:06,992 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:08,068 - INFO - ✅ Saved batch 6 to embeddings_batch_1750013348_6.parquet
INFO:EmbeddingLogger:✅ Saved batch 6 to embeddings_batch_1750013348_6.parquet
2025-06-15 18:49:08,083 - INFO - Processing Batch 7 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 7 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.03it/s]
2025-06-15 18:49:08,717 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:09,894 - INFO - ✅ Saved batch 7 to embeddings_batch_1750013349_7.parquet
INFO:EmbeddingLogger:✅ Saved batch 7 to embeddings_batch_1750013349_7.parquet
2025-06-15 18:49:09,915 - INFO - Processing Batch 8 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 8 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  8.51it/s]
2025-06-15 18:49:11,105 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:13,189 - INFO - ✅ Saved batch 8 to embeddings_batch_1750013353_8.parquet
INFO:EmbeddingLogger:✅ Saved batch 8 to embeddings_batch_1750013353_8.parquet
2025-06-15 18:49:13,205 - INFO - Processing Batch 9 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 9 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.82it/s]
2025-06-15 18:49:14,237 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:15,191 - INFO - ✅ Saved batch 9 to embeddings_batch_1750013355_9.parquet
INFO:EmbeddingLogger:✅ Saved batch 9 to embeddings_batch_1750013355_9.parquet
2025-06-15 18:49:15,204 - INFO - Processing Batch 10 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 10 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.45it/s]
2025-06-15 18:49:15,790 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:16,923 - INFO - ✅ Saved batch 10 to embeddings_batch_1750013356_10.parquet
INFO:EmbeddingLogger:✅ Saved batch 10 to embeddings_batch_1750013356_10.parquet
2025-06-15 18:49:16,932 - INFO - Processing Batch 11 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 11 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.55it/s]
2025-06-15 18:49:17,516 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:18,357 - INFO - ✅ Saved batch 11 to embeddings_batch_1750013358_11.parquet
INFO:EmbeddingLogger:✅ Saved batch 11 to embeddings_batch_1750013358_11.parquet
2025-06-15 18:49:18,366 - INFO - Processing Batch 12 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 12 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 18.24it/s]
2025-06-15 18:49:18,926 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:19,970 - INFO - ✅ Saved batch 12 to embeddings_batch_1750013359_12.parquet
INFO:EmbeddingLogger:✅ Saved batch 12 to embeddings_batch_1750013359_12.parquet
2025-06-15 18:49:19,979 - INFO - Processing Batch 13 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 13 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.78it/s]
2025-06-15 18:49:20,550 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:21,531 - INFO - ✅ Saved batch 13 to embeddings_batch_1750013361_13.parquet
INFO:EmbeddingLogger:✅ Saved batch 13 to embeddings_batch_1750013361_13.parquet
2025-06-15 18:49:21,543 - INFO - Processing Batch 14 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 14 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.74it/s]
2025-06-15 18:49:22,191 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:23,142 - INFO - ✅ Saved batch 14 to embeddings_batch_1750013363_14.parquet
INFO:EmbeddingLogger:✅ Saved batch 14 to embeddings_batch_1750013363_14.parquet
2025-06-15 18:49:23,152 - INFO - Processing Batch 15 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 15 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  7.17it/s]
2025-06-15 18:49:24,564 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:26,299 - INFO - ✅ Saved batch 15 to embeddings_batch_1750013366_15.parquet
INFO:EmbeddingLogger:✅ Saved batch 15 to embeddings_batch_1750013366_15.parquet
2025-06-15 18:49:26,311 - INFO - Processing Batch 16 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 16 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 10.02it/s]
2025-06-15 18:49:27,322 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:28,775 - INFO - ✅ Saved batch 16 to embeddings_batch_1750013368_16.parquet
INFO:EmbeddingLogger:✅ Saved batch 16 to embeddings_batch_1750013368_16.parquet
2025-06-15 18:49:28,792 - INFO - Processing Batch 17 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 17 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 13.54it/s]
2025-06-15 18:49:29,548 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:30,687 - INFO - ✅ Saved batch 17 to embeddings_batch_1750013370_17.parquet
INFO:EmbeddingLogger:✅ Saved batch 17 to embeddings_batch_1750013370_17.parquet
2025-06-15 18:49:30,699 - INFO - Processing Batch 18 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 18 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.71it/s]
2025-06-15 18:49:31,392 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:32,258 - INFO - ✅ Saved batch 18 to embeddings_batch_1750013372_18.parquet
INFO:EmbeddingLogger:✅ Saved batch 18 to embeddings_batch_1750013372_18.parquet
2025-06-15 18:49:32,268 - INFO - Processing Batch 19 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 19 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.30it/s]
2025-06-15 18:49:32,892 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:33,977 - INFO - ✅ Saved batch 19 to embeddings_batch_1750013373_19.parquet
INFO:EmbeddingLogger:✅ Saved batch 19 to embeddings_batch_1750013373_19.parquet
2025-06-15 18:49:33,987 - INFO - Processing Batch 20 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 20 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.32it/s]
2025-06-15 18:49:34,614 - INFO - Generating embeddings for 6 texts...
INFO:EmbeddingLogger:Generating embeddings for 6 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:35,848 - INFO - ✅ Saved batch 20 to embeddings_batch_1750013375_20.parquet
INFO:EmbeddingLogger:✅ Saved batch 20 to embeddings_batch_1750013375_20.parquet
2025-06-15 18:49:35,860 - INFO - Processing Batch 21 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 21 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  8.53it/s]
2025-06-15 18:49:37,043 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:38,831 - INFO - ✅ Saved batch 21 to embeddings_batch_1750013378_21.parquet
INFO:EmbeddingLogger:✅ Saved batch 21 to embeddings_batch_1750013378_21.parquet
2025-06-15 18:49:38,843 - INFO - Processing Batch 22 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 22 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 10.11it/s]
2025-06-15 18:49:39,842 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:41,671 - INFO - ✅ Saved batch 22 to embeddings_batch_1750013381_22.parquet
INFO:EmbeddingLogger:✅ Saved batch 22 to embeddings_batch_1750013381_22.parquet
2025-06-15 18:49:41,690 - INFO - Processing Batch 23 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 23 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.47it/s]
2025-06-15 18:49:42,758 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:43,851 - INFO - ✅ Saved batch 23 to embeddings_batch_1750013383_23.parquet
INFO:EmbeddingLogger:✅ Saved batch 23 to embeddings_batch_1750013383_23.parquet
2025-06-15 18:49:43,864 - INFO - Processing Batch 24 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 24 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.00it/s]
2025-06-15 18:49:44,543 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:45,527 - INFO - ✅ Saved batch 24 to embeddings_batch_1750013385_24.parquet
INFO:EmbeddingLogger:✅ Saved batch 24 to embeddings_batch_1750013385_24.parquet
2025-06-15 18:49:45,537 - INFO - Processing Batch 25 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 25 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.82it/s]
2025-06-15 18:49:46,142 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:47,015 - INFO - ✅ Saved batch 25 to embeddings_batch_1750013386_25.parquet
INFO:EmbeddingLogger:✅ Saved batch 25 to embeddings_batch_1750013386_25.parquet
2025-06-15 18:49:47,026 - INFO - Processing Batch 26 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 26 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.58it/s]
2025-06-15 18:49:47,607 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:48,684 - INFO - ✅ Saved batch 26 to embeddings_batch_1750013388_26.parquet
INFO:EmbeddingLogger:✅ Saved batch 26 to embeddings_batch_1750013388_26.parquet
2025-06-15 18:49:48,696 - INFO - Processing Batch 27 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 27 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.85it/s]
2025-06-15 18:49:49,272 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:50,293 - INFO - ✅ Saved batch 27 to embeddings_batch_1750013390_27.parquet
INFO:EmbeddingLogger:✅ Saved batch 27 to embeddings_batch_1750013390_27.parquet
2025-06-15 18:49:50,303 - INFO - Processing Batch 28 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 28 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.62it/s]
2025-06-15 18:49:50,953 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:52,076 - INFO - ✅ Saved batch 28 to embeddings_batch_1750013392_28.parquet
INFO:EmbeddingLogger:✅ Saved batch 28 to embeddings_batch_1750013392_28.parquet
2025-06-15 18:49:52,086 - INFO - Processing Batch 29 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 29 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.68it/s]
2025-06-15 18:49:52,696 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:53,945 - INFO - ✅ Saved batch 29 to embeddings_batch_1750013393_29.parquet
INFO:EmbeddingLogger:✅ Saved batch 29 to embeddings_batch_1750013393_29.parquet
2025-06-15 18:49:53,956 - INFO - Processing Batch 30 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 30 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  8.39it/s]
2025-06-15 18:49:55,159 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:56,973 - INFO - ✅ Saved batch 30 to embeddings_batch_1750013396_30.parquet
INFO:EmbeddingLogger:✅ Saved batch 30 to embeddings_batch_1750013396_30.parquet
2025-06-15 18:49:56,985 - INFO - Processing Batch 31 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 31 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  7.19it/s]
2025-06-15 18:49:58,384 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:49:59,496 - INFO - ✅ Saved batch 31 to embeddings_batch_1750013399_31.parquet
INFO:EmbeddingLogger:✅ Saved batch 31 to embeddings_batch_1750013399_31.parquet
2025-06-15 18:49:59,506 - INFO - Processing Batch 32 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 32 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.01it/s]
2025-06-15 18:50:00,142 - INFO - Generating embeddings for 5 texts...
INFO:EmbeddingLogger:Generating embeddings for 5 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:00,838 - INFO - ✅ Saved batch 32 to embeddings_batch_1750013400_32.parquet
INFO:EmbeddingLogger:✅ Saved batch 32 to embeddings_batch_1750013400_32.parquet
2025-06-15 18:50:00,850 - INFO - Processing Batch 33 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 33 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 13.96it/s]
2025-06-15 18:50:01,579 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:02,707 - INFO - ✅ Saved batch 33 to embeddings_batch_1750013402_33.parquet
INFO:EmbeddingLogger:✅ Saved batch 33 to embeddings_batch_1750013402_33.parquet
2025-06-15 18:50:02,719 - INFO - Processing Batch 34 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 34 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 13.58it/s]
2025-06-15 18:50:03,472 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:04,835 - INFO - ✅ Saved batch 34 to embeddings_batch_1750013404_34.parquet
INFO:EmbeddingLogger:✅ Saved batch 34 to embeddings_batch_1750013404_34.parquet
2025-06-15 18:50:04,845 - INFO - Processing Batch 35 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 35 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.19it/s]
2025-06-15 18:50:05,479 - INFO - Generating embeddings for 5 texts...
INFO:EmbeddingLogger:Generating embeddings for 5 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:06,147 - INFO - ✅ Saved batch 35 to embeddings_batch_1750013406_35.parquet
INFO:EmbeddingLogger:✅ Saved batch 35 to embeddings_batch_1750013406_35.parquet
2025-06-15 18:50:06,157 - INFO - Processing Batch 36 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 36 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.72it/s]
2025-06-15 18:50:06,849 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:08,379 - INFO - ✅ Saved batch 36 to embeddings_batch_1750013408_36.parquet
INFO:EmbeddingLogger:✅ Saved batch 36 to embeddings_batch_1750013408_36.parquet
2025-06-15 18:50:08,392 - INFO - Processing Batch 37 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 37 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 10.25it/s]
2025-06-15 18:50:09,385 - INFO - Generating embeddings for 5 texts...
INFO:EmbeddingLogger:Generating embeddings for 5 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:10,399 - INFO - ✅ Saved batch 37 to embeddings_batch_1750013410_37.parquet
INFO:EmbeddingLogger:✅ Saved batch 37 to embeddings_batch_1750013410_37.parquet
2025-06-15 18:50:10,411 - INFO - Processing Batch 38 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 38 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.06it/s]
2025-06-15 18:50:11,527 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:12,785 - INFO - ✅ Saved batch 38 to embeddings_batch_1750013412_38.parquet
INFO:EmbeddingLogger:✅ Saved batch 38 to embeddings_batch_1750013412_38.parquet
2025-06-15 18:50:12,798 - INFO - Processing Batch 39 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 39 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 18.70it/s]
2025-06-15 18:50:13,344 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:14,352 - INFO - ✅ Saved batch 39 to embeddings_batch_1750013414_39.parquet
INFO:EmbeddingLogger:✅ Saved batch 39 to embeddings_batch_1750013414_39.parquet
2025-06-15 18:50:14,363 - INFO - Processing Batch 40 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 40 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 13.02it/s]
2025-06-15 18:50:15,148 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:16,557 - INFO - ✅ Saved batch 40 to embeddings_batch_1750013416_40.parquet
INFO:EmbeddingLogger:✅ Saved batch 40 to embeddings_batch_1750013416_40.parquet
2025-06-15 18:50:16,572 - INFO - Processing Batch 41 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 41 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.48it/s]
2025-06-15 18:50:17,230 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:18,335 - INFO - ✅ Saved batch 41 to embeddings_batch_1750013418_41.parquet
INFO:EmbeddingLogger:✅ Saved batch 41 to embeddings_batch_1750013418_41.parquet
2025-06-15 18:50:18,345 - INFO - Processing Batch 42 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 42 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.86it/s]
2025-06-15 18:50:18,916 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:19,907 - INFO - ✅ Saved batch 42 to embeddings_batch_1750013419_42.parquet
INFO:EmbeddingLogger:✅ Saved batch 42 to embeddings_batch_1750013419_42.parquet
2025-06-15 18:50:19,920 - INFO - Processing Batch 43 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 43 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.36it/s]
2025-06-15 18:50:20,627 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:21,663 - INFO - ✅ Saved batch 43 to embeddings_batch_1750013421_43.parquet
INFO:EmbeddingLogger:✅ Saved batch 43 to embeddings_batch_1750013421_43.parquet
2025-06-15 18:50:21,676 - INFO - Processing Batch 44 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 44 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.20it/s]
2025-06-15 18:50:22,389 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:24,094 - INFO - ✅ Saved batch 44 to embeddings_batch_1750013424_44.parquet
INFO:EmbeddingLogger:✅ Saved batch 44 to embeddings_batch_1750013424_44.parquet
2025-06-15 18:50:24,105 - INFO - Processing Batch 45 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 45 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.28it/s]
2025-06-15 18:50:25,198 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:27,125 - INFO - ✅ Saved batch 45 to embeddings_batch_1750013427_45.parquet
INFO:EmbeddingLogger:✅ Saved batch 45 to embeddings_batch_1750013427_45.parquet
2025-06-15 18:50:27,138 - INFO - Processing Batch 46 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 46 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.96it/s]
2025-06-15 18:50:27,780 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:28,787 - INFO - ✅ Saved batch 46 to embeddings_batch_1750013428_46.parquet
INFO:EmbeddingLogger:✅ Saved batch 46 to embeddings_batch_1750013428_46.parquet
2025-06-15 18:50:28,802 - INFO - Processing Batch 47 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 47 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.09it/s]
2025-06-15 18:50:29,478 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:30,482 - INFO - ✅ Saved batch 47 to embeddings_batch_1750013430_47.parquet
INFO:EmbeddingLogger:✅ Saved batch 47 to embeddings_batch_1750013430_47.parquet
2025-06-15 18:50:30,493 - INFO - Processing Batch 48 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 48 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.11it/s]
2025-06-15 18:50:31,134 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:32,010 - INFO - ✅ Saved batch 48 to embeddings_batch_1750013431_48.parquet
INFO:EmbeddingLogger:✅ Saved batch 48 to embeddings_batch_1750013431_48.parquet
2025-06-15 18:50:32,020 - INFO - Processing Batch 49 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 49 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.90it/s]
2025-06-15 18:50:32,630 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:33,722 - INFO - ✅ Saved batch 49 to embeddings_batch_1750013433_49.parquet
INFO:EmbeddingLogger:✅ Saved batch 49 to embeddings_batch_1750013433_49.parquet
2025-06-15 18:50:33,734 - INFO - Processing Batch 50 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 50 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.67it/s]
2025-06-15 18:50:34,348 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:35,468 - INFO - ✅ Saved batch 50 to embeddings_batch_1750013435_50.parquet
INFO:EmbeddingLogger:✅ Saved batch 50 to embeddings_batch_1750013435_50.parquet
2025-06-15 18:50:35,480 - INFO - Processing Batch 51 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 51 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.89it/s]
2025-06-15 18:50:36,167 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:37,084 - INFO - ✅ Saved batch 51 to embeddings_batch_1750013437_51.parquet
INFO:EmbeddingLogger:✅ Saved batch 51 to embeddings_batch_1750013437_51.parquet
2025-06-15 18:50:37,095 - INFO - Processing Batch 52 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 52 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 10.20it/s]
2025-06-15 18:50:38,090 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:39,821 - INFO - ✅ Saved batch 52 to embeddings_batch_1750013439_52.parquet
INFO:EmbeddingLogger:✅ Saved batch 52 to embeddings_batch_1750013439_52.parquet
2025-06-15 18:50:39,831 - INFO - Processing Batch 53 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 53 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.27it/s]
2025-06-15 18:50:40,919 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:42,467 - INFO - ✅ Saved batch 53 to embeddings_batch_1750013442_53.parquet
INFO:EmbeddingLogger:✅ Saved batch 53 to embeddings_batch_1750013442_53.parquet
2025-06-15 18:50:42,477 - INFO - Processing Batch 54 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 54 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.56it/s]
2025-06-15 18:50:43,055 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:44,042 - INFO - ✅ Saved batch 54 to embeddings_batch_1750013444_54.parquet
INFO:EmbeddingLogger:✅ Saved batch 54 to embeddings_batch_1750013444_54.parquet
2025-06-15 18:50:44,050 - INFO - Processing Batch 55 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 55 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.29it/s]
2025-06-15 18:50:44,715 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:45,983 - INFO - ✅ Saved batch 55 to embeddings_batch_1750013445_55.parquet
INFO:EmbeddingLogger:✅ Saved batch 55 to embeddings_batch_1750013445_55.parquet
2025-06-15 18:50:45,996 - INFO - Processing Batch 56 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 56 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.10it/s]
2025-06-15 18:50:46,597 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:47,856 - INFO - ✅ Saved batch 56 to embeddings_batch_1750013447_56.parquet
INFO:EmbeddingLogger:✅ Saved batch 56 to embeddings_batch_1750013447_56.parquet
2025-06-15 18:50:47,868 - INFO - Processing Batch 57 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 57 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.98it/s]
2025-06-15 18:50:48,471 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:49,847 - INFO - ✅ Saved batch 57 to embeddings_batch_1750013449_57.parquet
INFO:EmbeddingLogger:✅ Saved batch 57 to embeddings_batch_1750013449_57.parquet
2025-06-15 18:50:49,856 - INFO - Processing Batch 58 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 58 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.03it/s]
2025-06-15 18:50:50,491 - INFO - Generating embeddings for 5 texts...
INFO:EmbeddingLogger:Generating embeddings for 5 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:51,203 - INFO - ✅ Saved batch 58 to embeddings_batch_1750013451_58.parquet
INFO:EmbeddingLogger:✅ Saved batch 58 to embeddings_batch_1750013451_58.parquet
2025-06-15 18:50:51,212 - INFO - Processing Batch 59 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 59 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.17it/s]
2025-06-15 18:50:51,927 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:53,697 - INFO - ✅ Saved batch 59 to embeddings_batch_1750013453_59.parquet
INFO:EmbeddingLogger:✅ Saved batch 59 to embeddings_batch_1750013453_59.parquet
2025-06-15 18:50:53,712 - INFO - Processing Batch 60 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 60 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.92it/s]
2025-06-15 18:50:54,730 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:56,407 - INFO - ✅ Saved batch 60 to embeddings_batch_1750013456_60.parquet
INFO:EmbeddingLogger:✅ Saved batch 60 to embeddings_batch_1750013456_60.parquet
2025-06-15 18:50:56,420 - INFO - Processing Batch 61 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 61 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.24it/s]
2025-06-15 18:50:57,013 - INFO - Generating embeddings for 4 texts...
INFO:EmbeddingLogger:Generating embeddings for 4 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:57,600 - INFO - ✅ Saved batch 61 to embeddings_batch_1750013457_61.parquet
INFO:EmbeddingLogger:✅ Saved batch 61 to embeddings_batch_1750013457_61.parquet
2025-06-15 18:50:57,611 - INFO - Processing Batch 62 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 62 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.41it/s]
2025-06-15 18:50:58,197 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:50:59,169 - INFO - ✅ Saved batch 62 to embeddings_batch_1750013459_62.parquet
INFO:EmbeddingLogger:✅ Saved batch 62 to embeddings_batch_1750013459_62.parquet
2025-06-15 18:50:59,177 - INFO - Processing Batch 63 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 63 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.85it/s]
2025-06-15 18:50:59,867 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:01,019 - INFO - ✅ Saved batch 63 to embeddings_batch_1750013460_63.parquet
INFO:EmbeddingLogger:✅ Saved batch 63 to embeddings_batch_1750013460_63.parquet
2025-06-15 18:51:01,028 - INFO - Processing Batch 64 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 64 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.53it/s]
2025-06-15 18:51:01,686 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:02,705 - INFO - ✅ Saved batch 64 to embeddings_batch_1750013462_64.parquet
INFO:EmbeddingLogger:✅ Saved batch 64 to embeddings_batch_1750013462_64.parquet
2025-06-15 18:51:02,717 - INFO - Processing Batch 65 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 65 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.57it/s]
2025-06-15 18:51:03,330 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:04,391 - INFO - ✅ Saved batch 65 to embeddings_batch_1750013464_65.parquet
INFO:EmbeddingLogger:✅ Saved batch 65 to embeddings_batch_1750013464_65.parquet
2025-06-15 18:51:04,403 - INFO - Processing Batch 66 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 66 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.46it/s]
2025-06-15 18:51:04,988 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:05,828 - INFO - ✅ Saved batch 66 to embeddings_batch_1750013465_66.parquet
INFO:EmbeddingLogger:✅ Saved batch 66 to embeddings_batch_1750013465_66.parquet
2025-06-15 18:51:05,836 - INFO - Processing Batch 67 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 67 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.02it/s]
2025-06-15 18:51:06,469 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:08,245 - INFO - ✅ Saved batch 67 to embeddings_batch_1750013468_67.parquet
INFO:EmbeddingLogger:✅ Saved batch 67 to embeddings_batch_1750013468_67.parquet
2025-06-15 18:51:08,255 - INFO - Processing Batch 68 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 68 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.72it/s]
2025-06-15 18:51:09,297 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:10,856 - INFO - ✅ Saved batch 68 to embeddings_batch_1750013470_68.parquet
INFO:EmbeddingLogger:✅ Saved batch 68 to embeddings_batch_1750013470_68.parquet
2025-06-15 18:51:10,865 - INFO - Processing Batch 69 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 69 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 16.09it/s]
2025-06-15 18:51:11,505 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:12,508 - INFO - ✅ Saved batch 69 to embeddings_batch_1750013472_69.parquet
INFO:EmbeddingLogger:✅ Saved batch 69 to embeddings_batch_1750013472_69.parquet
2025-06-15 18:51:12,519 - INFO - Processing Batch 70 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 70 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.87it/s]
2025-06-15 18:51:13,204 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:14,565 - INFO - ✅ Saved batch 70 to embeddings_batch_1750013474_70.parquet
INFO:EmbeddingLogger:✅ Saved batch 70 to embeddings_batch_1750013474_70.parquet
2025-06-15 18:51:14,576 - INFO - Processing Batch 71 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 71 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.04it/s]
2025-06-15 18:51:15,305 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:16,383 - INFO - ✅ Saved batch 71 to embeddings_batch_1750013476_71.parquet
INFO:EmbeddingLogger:✅ Saved batch 71 to embeddings_batch_1750013476_71.parquet
2025-06-15 18:51:16,397 - INFO - Processing Batch 72 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 72 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 13.20it/s]
2025-06-15 18:51:17,167 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:18,260 - INFO - ✅ Saved batch 72 to embeddings_batch_1750013478_72.parquet
INFO:EmbeddingLogger:✅ Saved batch 72 to embeddings_batch_1750013478_72.parquet
2025-06-15 18:51:18,273 - INFO - Processing Batch 73 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 73 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 17.10it/s]
2025-06-15 18:51:18,866 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:19,879 - INFO - ✅ Saved batch 73 to embeddings_batch_1750013479_73.parquet
INFO:EmbeddingLogger:✅ Saved batch 73 to embeddings_batch_1750013479_73.parquet
2025-06-15 18:51:19,889 - INFO - Processing Batch 74 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 74 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.21it/s]
2025-06-15 18:51:20,604 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:22,273 - INFO - ✅ Saved batch 74 to embeddings_batch_1750013482_74.parquet
INFO:EmbeddingLogger:✅ Saved batch 74 to embeddings_batch_1750013482_74.parquet
2025-06-15 18:51:22,287 - INFO - Processing Batch 75 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 75 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.17it/s]
2025-06-15 18:51:23,397 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:24,831 - INFO - ✅ Saved batch 75 to embeddings_batch_1750013484_75.parquet
INFO:EmbeddingLogger:✅ Saved batch 75 to embeddings_batch_1750013484_75.parquet
2025-06-15 18:51:24,845 - INFO - Processing Batch 76 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 76 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.86it/s]
2025-06-15 18:51:25,873 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:27,198 - INFO - ✅ Saved batch 76 to embeddings_batch_1750013487_76.parquet
INFO:EmbeddingLogger:✅ Saved batch 76 to embeddings_batch_1750013487_76.parquet
2025-06-15 18:51:27,211 - INFO - Processing Batch 77 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 77 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.80it/s]
2025-06-15 18:51:27,855 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:28,779 - INFO - ✅ Saved batch 77 to embeddings_batch_1750013488_77.parquet
INFO:EmbeddingLogger:✅ Saved batch 77 to embeddings_batch_1750013488_77.parquet
2025-06-15 18:51:28,790 - INFO - Processing Batch 78 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 78 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.05it/s]
2025-06-15 18:51:29,513 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:30,690 - INFO - ✅ Saved batch 78 to embeddings_batch_1750013490_78.parquet
INFO:EmbeddingLogger:✅ Saved batch 78 to embeddings_batch_1750013490_78.parquet
2025-06-15 18:51:30,705 - INFO - Processing Batch 79 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 79 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.05it/s]
2025-06-15 18:51:31,430 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:32,578 - INFO - ✅ Saved batch 79 to embeddings_batch_1750013492_79.parquet
INFO:EmbeddingLogger:✅ Saved batch 79 to embeddings_batch_1750013492_79.parquet
2025-06-15 18:51:32,588 - INFO - Processing Batch 80 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 80 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.83it/s]
2025-06-15 18:51:33,230 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:34,108 - INFO - ✅ Saved batch 80 to embeddings_batch_1750013494_80.parquet
INFO:EmbeddingLogger:✅ Saved batch 80 to embeddings_batch_1750013494_80.parquet
2025-06-15 18:51:34,119 - INFO - Processing Batch 81 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 81 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 11.99it/s]
2025-06-15 18:51:34,966 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:36,524 - INFO - ✅ Saved batch 81 to embeddings_batch_1750013496_81.parquet
INFO:EmbeddingLogger:✅ Saved batch 81 to embeddings_batch_1750013496_81.parquet
2025-06-15 18:51:36,541 - INFO - Processing Batch 82 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 82 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:01<00:00,  9.32it/s]
2025-06-15 18:51:37,625 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:39,791 - INFO - ✅ Saved batch 82 to embeddings_batch_1750013499_82.parquet
INFO:EmbeddingLogger:✅ Saved batch 82 to embeddings_batch_1750013499_82.parquet
2025-06-15 18:51:39,808 - INFO - Processing Batch 83 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 83 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 11.74it/s]
2025-06-15 18:51:40,671 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:41,859 - INFO - ✅ Saved batch 83 to embeddings_batch_1750013501_83.parquet
INFO:EmbeddingLogger:✅ Saved batch 83 to embeddings_batch_1750013501_83.parquet
2025-06-15 18:51:41,871 - INFO - Processing Batch 84 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 84 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.51it/s]
2025-06-15 18:51:42,573 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:43,609 - INFO - ✅ Saved batch 84 to embeddings_batch_1750013503_84.parquet
INFO:EmbeddingLogger:✅ Saved batch 84 to embeddings_batch_1750013503_84.parquet
2025-06-15 18:51:43,620 - INFO - Processing Batch 85 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 85 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.89it/s]
2025-06-15 18:51:44,263 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:45,402 - INFO - ✅ Saved batch 85 to embeddings_batch_1750013505_85.parquet
INFO:EmbeddingLogger:✅ Saved batch 85 to embeddings_batch_1750013505_85.parquet
2025-06-15 18:51:45,422 - INFO - Processing Batch 86 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 86 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.79it/s]
2025-06-15 18:51:46,069 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:47,395 - INFO - ✅ Saved batch 86 to embeddings_batch_1750013507_86.parquet
INFO:EmbeddingLogger:✅ Saved batch 86 to embeddings_batch_1750013507_86.parquet
2025-06-15 18:51:47,408 - INFO - Processing Batch 87 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 87 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.96it/s]
2025-06-15 18:51:48,089 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:49,222 - INFO - ✅ Saved batch 87 to embeddings_batch_1750013509_87.parquet
INFO:EmbeddingLogger:✅ Saved batch 87 to embeddings_batch_1750013509_87.parquet
2025-06-15 18:51:49,233 - INFO - Processing Batch 88 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 88 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 15.34it/s]
2025-06-15 18:51:49,897 - INFO - Generating embeddings for 9 texts...
INFO:EmbeddingLogger:Generating embeddings for 9 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:51,588 - INFO - ✅ Saved batch 88 to embeddings_batch_1750013511_88.parquet
INFO:EmbeddingLogger:✅ Saved batch 88 to embeddings_batch_1750013511_88.parquet
2025-06-15 18:51:51,597 - INFO - Processing Batch 89 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 89 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 10.01it/s]
2025-06-15 18:51:52,605 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:53,994 - INFO - ✅ Saved batch 89 to embeddings_batch_1750013513_89.parquet
INFO:EmbeddingLogger:✅ Saved batch 89 to embeddings_batch_1750013513_89.parquet
2025-06-15 18:51:54,009 - INFO - Processing Batch 90 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 90 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 10.43it/s]
2025-06-15 18:51:54,988 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:56,084 - INFO - ✅ Saved batch 90 to embeddings_batch_1750013516_90.parquet
INFO:EmbeddingLogger:✅ Saved batch 90 to embeddings_batch_1750013516_90.parquet
2025-06-15 18:51:56,097 - INFO - Processing Batch 91 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 91 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 13.79it/s]
2025-06-15 18:51:56,832 - INFO - Generating embeddings for 6 texts...
INFO:EmbeddingLogger:Generating embeddings for 6 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:57,708 - INFO - ✅ Saved batch 91 to embeddings_batch_1750013517_91.parquet
INFO:EmbeddingLogger:✅ Saved batch 91 to embeddings_batch_1750013517_91.parquet
2025-06-15 18:51:57,722 - INFO - Processing Batch 92 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 92 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.05it/s]
2025-06-15 18:51:58,446 - INFO - Generating embeddings for 10 texts...
INFO:EmbeddingLogger:Generating embeddings for 10 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:51:59,856 - INFO - ✅ Saved batch 92 to embeddings_batch_1750013519_92.parquet
INFO:EmbeddingLogger:✅ Saved batch 92 to embeddings_batch_1750013519_92.parquet
2025-06-15 18:51:59,868 - INFO - Processing Batch 93 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 93 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 14.97it/s]
2025-06-15 18:52:00,552 - INFO - Generating embeddings for 8 texts...
INFO:EmbeddingLogger:Generating embeddings for 8 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:52:01,629 - INFO - ✅ Saved batch 93 to embeddings_batch_1750013521_93.parquet
INFO:EmbeddingLogger:✅ Saved batch 93 to embeddings_batch_1750013521_93.parquet
2025-06-15 18:52:01,639 - INFO - Processing Batch 94 (10 pages)...
INFO:EmbeddingLogger:Processing Batch 94 (10 pages)...
Extracting Text: 100%|██████████| 10/10 [00:00<00:00, 13.23it/s]
2025-06-15 18:52:02,405 - INFO - Generating embeddings for 7 texts...
INFO:EmbeddingLogger:Generating embeddings for 7 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:52:03,421 - INFO - ✅ Saved batch 94 to embeddings_batch_1750013523_94.parquet
INFO:EmbeddingLogger:✅ Saved batch 94 to embeddings_batch_1750013523_94.parquet
2025-06-15 18:52:03,433 - INFO - Processing Batch 95 (1 pages)...
INFO:EmbeddingLogger:Processing Batch 95 (1 pages)...
Extracting Text: 100%|██████████| 1/1 [00:00<00:00, 15.44it/s]
2025-06-15 18:52:03,509 - INFO - Generating embeddings for 1 texts...
INFO:EmbeddingLogger:Generating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-06-15 18:52:03,731 - INFO - ✅ Saved batch 95 to embeddings_batch_1750013523_95.parquet
INFO:EmbeddingLogger:✅ Saved batch 95 to embeddings_batch_1750013523_95.parquet
2025-06-15 18:52:03,737 - INFO - All new batches processed successfully.
INFO:EmbeddingLogger:All new batches processed successfully.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://d7b296b1fb8709aeb8.gradio.live
