In [None]:
# Mount Google Drive inside the Colab runtime so that paths under
# /content/drive become available to the following cells.
from google.colab import drive

drive.mount("/content/drive")

# Prerequisite: run `pip install -r requirements.txt` before continuing with the rest of the code.

In [None]:
# Installing necessary packages!
# These should ideally be in requirements.txt and installed once for the environment.
# But for a notebook that's meant to be self-contained for easy sharing/running,
# it's common to keep them here.
# !pip install pandas pyarrow duckdb gradio requests beautifulsoup4 lxml tqdm -q
# !pip install fireducks

import sys
import os

# --- Project location ---------------------------------------------------
# Folder on the mounted Google Drive that holds the WebKnoGraph project.
# It has to be the directory that directly *contains* the 'src' package,
# i.e. the expected layout is /content/drive/My Drive/WebKnoGraph/src/...
# Adjust this value if the project lives in a different Drive subfolder.
google_drive_base_path = "/content/drive/My Drive/WebKnoGraph/"

# Absolute, normalised form of the path above; this is what goes on sys.path.
project_root = os.path.abspath(google_drive_base_path)

# Put the project root at the front of the module search path (once) so that
# imports such as 'from src.backend...' resolve against the Drive checkout.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Diagnostics that help when an import below cannot be resolved.
print(f"Project root added to sys.path: {project_root}")
print(f"Current working directory: {os.getcwd()}")
print(f"sys.path: {sys.path}")


# --- Google Drive mount ---------------------------------------------------
# Mount Drive only when running inside Colab and only if it is not mounted yet.
try:
    from google.colab import drive

    # The 'My Drive' folder is used as the marker for an existing mount.
    if os.path.exists("/content/drive/My Drive"):
        print("Google Drive already mounted.")
    else:
        drive.mount("/content/drive/")
        print("Google Drive mounted successfully.")
except ImportError:
    # google.colab is not importable: we are not in a Colab runtime.
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
except Exception as mount_error:
    # Any other failure while mounting is reported but does not stop the cell.
    print(f"Error mounting Google Drive: {mount_error}")

# Import from your refactored backend and shared modules
import gradio as gr
import io
import duckdb
from urllib.parse import urlparse

# Project-local imports.
# These resolve against project_root/src/..., which was put on sys.path above.
# If any of them fails, guidance is printed and the error is re-raised so that
# execution stops in this cell (the real cause) instead of failing later with
# an unrelated NameError when the crawler classes are first used.
try:
    from src.backend.config.crawler_config import CrawlerConfig
    from src.backend.utils.strategies import (
        VisitedUrlManager,
        BFSCrawlingStrategy,
        DFSCrawlingStrategy,
    )
    from src.backend.data.repositories import CrawlStateRepository
    from src.backend.utils.http import HttpClient
    from src.backend.utils.url import UrlFilter, LinkExtractor
    from src.backend.services.crawler_service import WebCrawler
    from src.shared.logging_config import ConsoleAndGradioLogger

    print("All modules imported successfully!")
except ImportError as e:
    # ImportError also covers ModuleNotFoundError (its subclass), so both a
    # missing module and a missing name inside a module get the same guidance.
    print(f"Module import failed: {e}")
    print(
        "Please ensure your project structure under Google Drive is correct and matches the import paths."
    )
    print(f"Expected project root: {project_root}")
    print(
        "Check if 'src' directory exists directly under the project root and contains the necessary subdirectories (backend, shared)."
    )
    # Nothing below can work without these names: fail here, loudly.
    raise

In [None]:
def _parquet_glob_sql_literal(parquet_dir: str) -> str:
    """Return the recursive ``*.parquet`` glob for ``parquet_dir`` as SQL-literal-safe text.

    The result is meant to be placed between single quotes inside a SQL
    statement (``read_parquet('<result>')``). Path separators are normalised
    to ``/`` and every single quote is doubled so that a directory name
    containing ``'`` cannot terminate the literal early.
    """
    glob_path = os.path.join(parquet_dir, "**", "*.parquet")
    if os.sep != "/":
        glob_path = glob_path.replace(os.sep, "/")
    # Standard SQL escaping for a quote inside a '...' string literal.
    return glob_path.replace("'", "''")


def run_gradio_crawler_interface(
    initial_start_url: str,
    allowed_path_segment: str,
    crawling_strategy_type: str,
    state_db_path_input: str,
    parquet_path_input: str,
    max_pages_to_crawl: int,
):
    """Run one crawl session and stream progress to the Gradio UI.

    This is a generator. Every yielded item is a 4-tuple
    ``(status message, log text, save-events markdown, summary markdown)``;
    the summary element stays empty until the final yield.

    Args:
        initial_start_url: URL the crawl starts from; its network location
            becomes the crawl's base domain.
        allowed_path_segment: Passed to ``CrawlerConfig`` and ``UrlFilter``.
        crawling_strategy_type: ``"BFS"`` selects ``BFSCrawlingStrategy``;
            any other value selects ``DFSCrawlingStrategy``.
        state_db_path_input: Path handed to ``CrawlStateRepository``.
        parquet_path_input: Directory of the Parquet dataset; created if missing.
        max_pages_to_crawl: Page limit passed to ``CrawlerConfig``.

    Yields:
        Progress tuples as described above. Any exception raised while
        setting up or crawling is logged and reported as a final
        ``"Crawl Failed!"`` tuple instead of propagating.
    """
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(log_stream)

    try:
        base_domain = urlparse(initial_start_url).netloc
        if not base_domain:
            # No netloc (e.g. the scheme is missing): refuse to start.
            raise ValueError("Invalid Initial Start URL.")

        config = CrawlerConfig(
            initial_start_url=initial_start_url,
            allowed_path_segment=allowed_path_segment,
            state_db_path=state_db_path_input,
            parquet_path=parquet_path_input,
            max_pages_to_crawl=max_pages_to_crawl,
            base_domain=base_domain,
        )
        os.makedirs(config.parquet_path, exist_ok=True)
        yield (
            "Initializing...",
            log_stream.getvalue(),
            "### Save Events Log\n\n- Waiting for first save event...",
            "",
        )

        state_repository = CrawlStateRepository(config.state_db_path, logger)
        visited_manager = VisitedUrlManager()

        # Best effort: seed the visited set from URLs already stored in the
        # Parquet dataset. A failure here is only logged as a warning.
        logger.info("Rebuilding visited set from existing Parquet data...")
        try:
            parquet_glob_path = _parquet_glob_sql_literal(config.parquet_path)
            visited_urls_df = duckdb.query(
                f"SELECT DISTINCT URL FROM read_parquet('{parquet_glob_path}')"
            ).to_df()
            for url in visited_urls_df["URL"]:
                visited_manager.add(url)
            logger.info(f"Rebuilt visited set with {visited_manager.size()} URLs.")
        except Exception as e:
            logger.warning(
                f"Could not rebuild visited set from Parquet (may be a new crawl or empty directory): {e}"
            )

        strategy_class = (
            BFSCrawlingStrategy
            if crawling_strategy_type == "BFS"
            else DFSCrawlingStrategy
        )
        crawling_strategy = strategy_class(visited_manager, logger)

        # Resume from the persisted frontier, dropping entries whose first
        # element (the URL) is already in the visited set.
        loaded_frontier = state_repository.load_frontier()
        unvisited_frontier = [
            info for info in loaded_frontier if not visited_manager.contains(info[0])
        ]

        if unvisited_frontier:
            crawling_strategy.prime_with_frontier(unvisited_frontier)
            logger.info(f"Primed frontier with {len(unvisited_frontier)} URLs from DB.")
        elif not visited_manager.contains(config.initial_start_url):
            crawling_strategy.add_links([(config.initial_start_url, 0)])
            logger.info(f"Added initial URL {config.initial_start_url} to frontier.")
        else:
            logger.info(
                f"Initial URL {config.initial_start_url} already visited. No new URLs to start with from DB or initial URL."
            )

        http_client = HttpClient(config, logger)
        url_filter = UrlFilter(config.allowed_path_segment, config.base_domain)
        link_extractor = LinkExtractor(url_filter, config.allowed_query_params)

        crawler = WebCrawler(
            config,
            crawling_strategy,
            state_repository,
            http_client,
            url_filter,
            link_extractor,
            logger,
        )

        final_status = ""
        save_events_log = ["### Save Events Log"]

        # Each crawl event is a mapping; "status" and "save_event" are read
        # from it and the UI is refreshed after every event.
        for event in crawler.crawl():
            status_msg = event.get("status")
            save_event = event.get("save_event")

            final_status = status_msg
            if save_event:
                save_events_log.append(f"- {save_event}")
            yield status_msg, log_stream.getvalue(), "\n".join(save_events_log), ""

        logger.info("Generating final summary from Parquet data...")
        final_save_events = "\n".join(save_events_log)
        summary_md = f"## Crawl Session Finished\n\n- **Status**: {final_status}\n- **Crawled Data Location**: `{config.parquet_path}`"
        try:
            parquet_glob_path = _parquet_glob_sql_literal(config.parquet_path)
            summary_df = duckdb.query(
                f"SELECT CASE WHEN Status_Code >= 200 AND Status_Code < 300 THEN 'Success (Content Saved)' WHEN Status_Code >= 300 AND Status_Code < 400 THEN 'Redirect' ELSE 'Error / Other' END AS Category, COUNT(*) as Total FROM read_parquet('{parquet_glob_path}') GROUP BY Category ORDER BY Total DESC"
            ).to_df()
            total_urls = summary_df["Total"].sum()
            summary_md += f"\n- **Total URLs in Parquet Dataset**: {total_urls}\n\n### Crawl Summary by Category\n\n"
            summary_md += summary_df.to_markdown(index=False)
        except Exception as e:
            # The summary is optional; the crawl result itself is still reported.
            logger.error(f"Could not generate summary from Parquet: {e}")
            summary_md += "\n\n**Could not generate summary from Parquet data.**"

        yield final_status, log_stream.getvalue(), final_save_events, summary_md

    except Exception as e:
        logger.exception(f"A critical error occurred: {e}")
        yield "Crawl Failed!", log_stream.getvalue(), "", f"**Error:** {e}"

In [None]:
# Gradio front end for the crawler.
# Left column: crawl configuration (pre-filled from CrawlerConfig's class-level values).
# Right column: start button, status line, live logs, save-event log and final summary.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🕸️ Memory-Optimized Parquet Web Crawler")
    gr.Markdown(
        "This crawler saves data to a partitioned Parquet dataset and uses SQLite only to manage the crawl state."
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Crawler Configuration")
            initial_url_input = gr.Textbox(
                label="Initial Start URL", value=CrawlerConfig.initial_start_url
            )
            allowed_path_input = gr.Textbox(
                label="Allowed Path Segment", value=CrawlerConfig.allowed_path_segment
            )
            crawling_strategy_radio = gr.Radio(
                choices=["BFS", "DFS"], label="Crawling Strategy", value="BFS"
            )
            gr.Markdown(
                """
                - **BFS (Breadth-First Search)**: Ideal for performing a wide crawl across a website, exploring all links at the current depth level before moving to the next.
                - **DFS (Depth-First Search)**: More suitable for targeting specific folders or branches of a website, exploring as deeply as possible along one path before backtracking.
                """
            )
            # precision=0 makes Gradio round the value to an integer before it
            # reaches the handler, whose max_pages_to_crawl parameter is an int.
            max_pages_input = gr.Number(
                label="Maximum Pages to Crawl (per session)",
                value=CrawlerConfig.max_pages_to_crawl,
                minimum=1,
                step=100,
                precision=0,
            )
            gr.Markdown("### Storage Paths")
            state_db_path_input = gr.Textbox(
                label="Crawl State DB Path (SQLite)", value=CrawlerConfig.state_db_path
            )
            parquet_path_input = gr.Textbox(
                label="Crawled Data Path (Parquet)", value=CrawlerConfig.parquet_path
            )
        with gr.Column(scale=2):
            gr.Markdown("## Actions and Status")
            start_button = gr.Button("🚀 Start Crawl", variant="primary")
            status_message_output = gr.Textbox(
                label="Status Message", interactive=False
            )
            logs_output = gr.Textbox(
                label="Crawler Logs", interactive=False, lines=15, max_lines=20
            )
            with gr.Row():
                save_events_output = gr.Markdown("### Save Events Log")
                summary_output = gr.Markdown("---")

    # The order of `inputs` must match the parameter order of
    # run_gradio_crawler_interface, and the order of `outputs` must match the
    # 4-tuples it yields (status, logs, save events, summary).
    start_button.click(
        fn=run_gradio_crawler_interface,
        inputs=[
            initial_url_input,
            allowed_path_input,
            crawling_strategy_radio,
            state_db_path_input,
            parquet_path_input,
            max_pages_input,
        ],
        outputs=[
            status_message_output,
            logs_output,
            save_events_output,
            summary_output,
        ],
    )

# debug=True keeps the cell attached to the running app so errors are shown in the cell output.
demo.launch(debug=True)