In [None]:
# Mount Google Drive so the project files under /content/drive are reachable.
# The `google.colab` package only exists inside Colab; skip the mount elsewhere
# instead of failing the very first cell (later cells already tolerate this).
try:
    from google.colab import drive
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

# Prerequisite: install the dependencies first (pip install -r requirements.txt).

In [None]:
# File: notebooks/link_crawler_ui.ipynb - Cell 1

# Installing necessary packages
# !pip install gradio pandas requests tqdm beautifulsoup4 lxml -q
# !pip install fireducks

import sys
import os
import warnings

# Ignore FutureWarning messages attributed to modules matching
# "huggingface_hub.file_download" so they do not clutter the cell output.
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="huggingface_hub.file_download",
)

# Location of the project checkout on the mounted Drive. It is placed at the
# front of sys.path so the `src.*` packages imported below can be resolved.
project_root = "/content/drive/My Drive/WebKnoGraph"

if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Report the resulting import environment (three lines of output).
print(
    f"Project root added to sys.path: {project_root}",
    f"Current working directory: {os.getcwd()}",
    f"sys.path: {sys.path}",
    sep="\n",
)

# Best-effort Google Drive mount: only attempted when the Colab package is
# importable and the Drive folder is not present yet. Failures are reported,
# never raised.
try:
    from google.colab import drive

    if os.path.exists("/content/drive/My Drive"):
        print("Google Drive already mounted.")
    else:
        drive.mount("/content/drive/")
        print("Google Drive mounted successfully.")
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
except Exception as mount_error:
    print(f"Error mounting Google Drive: {mount_error}")


# Import from your refactored backend and shared modules
import gradio as gr
import io
import fireducks.pandas as pd # Using fireducks.pandas as specified
from urllib.parse import urlparse

# Specific imports for the Link Graph Extractor
from src.backend.config.link_crawler_config import LinkCrawlerConfig # Using the new config
from src.backend.data.link_graph_repository import LinkGraphStateManager # Using the new state manager
from src.backend.utils.strategies import VisitedUrlManager, BFSCrawlingStrategy, DFSCrawlingStrategy # Reusing strategies
from src.backend.utils.http import HttpClient # Reusing HttpClient
from src.backend.utils.link_url import LinkUrlFilter, LinkExtractorForEdges # Using the new URL components
from src.backend.services.link_crawler_service import EdgeCrawler # Using the new orchestrator
from src.shared.logging_config import ConsoleAndGradioLogger # Using the generic logger

print("All modules imported successfully!")

In [None]:
# File: notebooks/link_crawler_ui.ipynb - Cell 2

def run_edge_crawler_interface(initial_start_url: str, crawling_scope_path: str, saving_scope_path: str,
                               crawling_strategy_type: str, state_db_path_input: str,
                               edge_list_path_input: str, max_pages_to_crawl: int):
    """Run one link-graph crawl session and stream its progress to the UI.

    This is a generator. Every item it yields is a
    ``(status_message, log_text, summary_markdown)`` tuple, matching the three
    output components bound to the start button.

    Args:
        initial_start_url: URL the crawl starts from. Its network location is
            used as the base domain; surrounding whitespace is ignored.
        crawling_scope_path: Forwarded to the config and used by the URL filter.
        saving_scope_path: Forwarded to the config.
        crawling_strategy_type: ``'BFS'`` selects ``BFSCrawlingStrategy``; any
            other value selects ``DFSCrawlingStrategy``.
        state_db_path_input: Path given to the state manager for crawl state.
        edge_list_path_input: Path of the edge-list CSV. Its parent directory
            is created when the path has one.
        max_pages_to_crawl: Page budget for the session. The UI number field
            may deliver a float, so the value is converted to ``int``.

    Yields:
        ``("Initializing...", logs, "")`` once the config is built, then one
        tuple per status message produced by ``crawler.crawl()``, and finally
        the last status together with a Markdown summary. On any exception a
        single ``("Crawl Failed!", logs, "**Error:** ...")`` tuple is yielded
        instead of raising.
    """
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(log_stream, logger_name="EdgeCrawlerLogger")

    try:
        # Pasted URLs frequently carry stray spaces or newlines; trim them so
        # the parsed domain and the stored start URL are clean.
        initial_start_url = (initial_start_url or "").strip()
        base_domain = urlparse(initial_start_url).netloc
        if not base_domain:
            raise ValueError("Invalid Initial Start URL.")

        # An emptied number field arrives as None, and a number field may
        # deliver a float; normalise to the int the signature promises.
        if max_pages_to_crawl is None:
            raise ValueError("Maximum Pages to Process must be set.")
        max_pages_to_crawl = int(max_pages_to_crawl)

        config = LinkCrawlerConfig(
            initial_start_url=initial_start_url,
            crawling_scope_path=crawling_scope_path,
            saving_scope_path=saving_scope_path,
            state_db_path=state_db_path_input,
            edge_list_path=edge_list_path_input,
            max_pages_to_crawl=max_pages_to_crawl,
            base_domain=base_domain
        )

        # Ensure the output directory exists. A bare file name has an empty
        # dirname, and os.makedirs("") raises, so only create real directories.
        output_dir = os.path.dirname(config.edge_list_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        yield "Initializing...", log_stream.getvalue(), ""

        state_manager = LinkGraphStateManager(config.state_db_path, config.edge_list_path, logger)
        visited_manager = VisitedUrlManager()

        # Resume support: URLs already present in the edge list are treated as
        # visited so they are not crawled again.
        logger.info("Rebuilding visited set from existing edge list CSV...")
        rebuilt_visited_urls = state_manager.load_visited_from_edges()
        for url in rebuilt_visited_urls:
            visited_manager.add(url)
        if rebuilt_visited_urls:
            logger.info(f"Rebuilt visited set with {visited_manager.size()} URLs from CSV.")
        else:
            logger.warning("No previously processed edges found. Starting with a fresh visited set.")

        if crawling_strategy_type == 'BFS':
            crawling_strategy = BFSCrawlingStrategy(visited_manager, logger)
        else:
            crawling_strategy = DFSCrawlingStrategy(visited_manager, logger)

        # Seed the frontier: prefer unvisited entries saved in the state DB,
        # otherwise fall back to the start URL if it has not been visited yet.
        loaded_frontier = state_manager.load_frontier()
        unvisited_frontier = [info for info in loaded_frontier if not visited_manager.contains(info[0])]

        if unvisited_frontier:
            crawling_strategy.prime_with_frontier(unvisited_frontier)
            logger.info(f"Primed frontier with {len(unvisited_frontier)} URLs from DB.")
        elif not visited_manager.contains(config.initial_start_url):
            crawling_strategy.add_links([(config.initial_start_url, 0)])
            logger.info(f"Added initial URL {config.initial_start_url} to frontier.")
        else:
            logger.info(f"Initial URL {config.initial_start_url} already visited. No new URLs to start with from DB or initial URL.")

        url_filter = LinkUrlFilter(config.crawling_scope_path, config.base_domain)
        link_extractor = LinkExtractorForEdges(url_filter)
        # The same config object is handed to HttpClient, which reads its
        # request settings from it.
        crawler = EdgeCrawler(config, crawling_strategy, state_manager, HttpClient(config, logger), link_extractor, logger)

        # Relay every status update to the UI together with the logs so far.
        final_status = ""
        for status_msg in crawler.crawl():
            final_status = status_msg
            yield status_msg, log_stream.getvalue(), ""

        logger.info("Generating final summary from CSV file...")
        summary_md = f"## Crawl Session Finished\n\n- **Status**: {final_status}\n- **Edge List Location**: `{config.edge_list_path}`"
        try:
            if os.path.exists(config.edge_list_path):
                edge_df = pd.read_csv(config.edge_list_path)
                num_edges = len(edge_df)
                # Nodes are the distinct URLs appearing on either side of an edge.
                num_nodes = len(pd.concat([edge_df['FROM'], edge_df['TO']]).unique())
                summary_md += f"\n- **Total Unique Pages (Nodes):** {num_nodes}\n- **Total Links (Edges):** {num_edges}"
        except Exception as e:
            # A broken or unreadable CSV must not hide the crawl result itself.
            logger.error(f"Could not generate summary from CSV: {e}")
            summary_md += "\n\n**Could not generate summary from CSV file.**"

        yield final_status, log_stream.getvalue(), summary_md

    except Exception as e:
        logger.exception(f"A critical error occurred: {e}")
        yield "Crawl Failed!", log_stream.getvalue(), f"**Error:** {e}"

In [None]:
# File: notebooks/link_crawler_ui.ipynb - Cell 3

# Gradio UI: configuration inputs in the left column, live status, logs and the
# final summary in the right column. Field defaults come from LinkCrawlerConfig.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🕸️ Link Graph Extractor")
    gr.Markdown("This tool crawls a website to produce a simple `FROM, TO` list of all hyperlinks, saved as a CSV file.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Configuration")
            initial_url_input = gr.Textbox(
                label="Initial Start URL",
                value=LinkCrawlerConfig.initial_start_url,
            )
            max_pages_input = gr.Number(
                label="Maximum Pages to Process",
                value=LinkCrawlerConfig.max_pages_to_crawl,
                minimum=1,
                step=100,
                # The handler's page budget is annotated as int; precision=0
                # makes the component round and deliver an int, not a float.
                precision=0,
            )
            crawling_strategy_radio = gr.Radio(
                choices=['BFS', 'DFS'],
                label="Crawling Strategy",
                value='BFS',
            )

            gr.Markdown("### 📜 Scopes")
            crawling_scope_path_input = gr.Textbox(
                label="Crawling Scope Path",
                value=LinkCrawlerConfig.crawling_scope_path,
                info="The 'playground'. Set to '/' to explore the entire site.",
            )
            saving_scope_path_input = gr.Textbox(
                label="Saving Scope Path",
                value=LinkCrawlerConfig.saving_scope_path,
                info="The 'rulebook'. Only save links where FROM and TO are in this path.",
            )

            gr.Markdown("### 💾 Storage Paths")
            state_db_path_input = gr.Textbox(
                label="Crawl State DB Path (SQLite)",
                value=LinkCrawlerConfig.state_db_path,
            )
            edge_list_path_input = gr.Textbox(
                label="Output Edge List Path (CSV)",
                value=LinkCrawlerConfig.edge_list_path,
            )

            start_button = gr.Button("🚀 Start Extraction", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("## Status & Results")
            status_message_output = gr.Textbox(label="Status Message", interactive=False)
            logs_output = gr.Textbox(label="Crawler Logs", interactive=False, lines=15, max_lines=20)
            summary_output = gr.Markdown("---")

    # The input order must match the positional parameters of
    # run_edge_crawler_interface; the three outputs receive each yielded tuple.
    start_button.click(
        fn=run_edge_crawler_interface,
        inputs=[
            initial_url_input,
            crawling_scope_path_input,
            saving_scope_path_input,
            crawling_strategy_radio,
            state_db_path_input,
            edge_list_path_input,
            max_pages_input
        ],
        outputs=[status_message_output, logs_output, summary_output]
    )

In [None]:
# File: notebooks/link_crawler_ui.ipynb - Cell 4

# --- Launch the Application ---
if __name__ == '__main__':
    # Step 1: best-effort Google Drive mount. It is kept separate from the
    # launch so that a missing `google.colab` package (any non-Colab
    # environment) or a mount failure is reported as such and does not
    # prevent the UI from starting.
    try:
        from google.colab import drive
    except ImportError:
        print("Not running in Google Colab environment. Skipping Google Drive mount.")
    else:
        try:
            # Mount only when the Drive folder is not present yet; no forced
            # remount is attempted.
            if not os.path.exists("/content/drive/My Drive"):
                drive.mount("/content/drive/")
                print("Google Drive mounted successfully.")
            else:
                print("Google Drive already mounted.")
        except Exception as e:
            print(f"Error mounting Google Drive: {e}")

    # Step 2: start the Gradio app. share=True asks Gradio for a public share
    # link; debug=True keeps the cell attached so errors surface in its output.
    try:
        demo.launch(debug=True, share=True)
    except Exception as e:
        print(f"Could not launch Gradio demo in this environment: {e}")