In [None]:
# Installing necessary packages
!pip install gradio pandas requests tqdm beautifulsoup4 lxml -q
!pip install fireducks

In [None]:
#
#
# --- Imports and Setup ---
#
#

import gradio as gr
import sqlite3

# import pandas as pd
import fireducks.pandas as pd

import requests
import time
import logging
import random
import re
import io
import os
import gc
from urllib.parse import urlparse, urljoin
from collections import deque
from abc import ABC, abstractmethod
from tqdm import tqdm
from dataclasses import dataclass, field
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

#
#
# --- Google Colab Drive Mount ---
#
#

# Mount Google Drive so the default state/CSV paths under /content/drive
# (see CrawlerConfig) are reachable. Failure here is reported, never fatal.
try:
    from google.colab import drive

    drive.mount("/content/drive/")
    print("Google Drive mounted successfully.")
except ImportError:
    # google.colab could not be imported, so this is not a Colab runtime.
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
except Exception as e:
    # Any other error raised while mounting is printed and execution continues.
    print(f"Error mounting Google Drive: {e}")

#
#
# --- Configuration ---
#
#


@dataclass
class CrawlerConfig:
    """Settings for one crawl session; every field has a usable default."""

    # SQLite file holding the pending crawl frontier between sessions.
    state_db_path: str = (
        "/content/drive/My Drive/master_july_2025/data/"
        "link_crawler_state.db"
    )
    # CSV file the FROM/TO edge rows are appended to.
    edge_list_path: str = (
        "/content/drive/My Drive/master_july_2025/data/"
        "link_graph_edges.csv"
    )
    # Pause (passed to time.sleep) before every HTTP request.
    min_request_delay: float = 1.0
    # Upper bound on pages processed in a single session.
    max_pages_to_crawl: int = 1000
    # Number of buffered edges that triggers a flush to disk.
    save_interval_edges: int = 250
    # Retry budget handed to the HTTP adapter.
    max_retries_request: int = 3
    # Redirect hops after which a URL is skipped.
    max_redirects: int = 2
    # Timeout value passed to each request.
    request_timeout: int = 15
    # First URL queued when there is no saved frontier to resume from.
    initial_start_url: str = "https://kalicube.com/"
    # Path prefix a URL must have to be followed; '/' covers the whole site.
    crawling_scope_path: str = "/"
    # Path prefix both ends of a link must have for the edge to be recorded.
    saving_scope_path: str = "/blog/"
    # Pool of User-Agent strings; one is picked at random per request.
    user_agents: list[str] = field(
        default_factory=lambda: [
            (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/91.0.4472.124 Safari/537.36"
            ),
        ]
    )
    # Host (netloc) links must match; filled in from the start URL at runtime.
    base_domain: str = ""


#
#
# --- Logging and Core Interfaces ---
#
#
class ILogger(ABC):
    """Minimal logging interface the crawler components depend on."""

    @abstractmethod
    def info(self, message: str):
        """Record an informational message."""
        pass

    def warning(self, message: str):
        """Record a warning.

        The crawler calls ``logger.warning(...)``, so the interface has to
        offer it. It is deliberately not abstract: implementations written
        against the original three-method interface stay instantiable and
        inherit this fallback, which forwards to ``info`` with a marker.
        Implementations with a real warning level should override it.
        """
        self.info(f"[WARNING] {message}")

    @abstractmethod
    def error(self, message: str):
        """Record an error message."""
        pass

    @abstractmethod
    def exception(self, message: str):
        """Record an error message from within an exception handler."""
        pass


class GradioLogHandler(logging.Handler):
    """Logging handler that appends formatted records to a text stream."""

    def __init__(self, log_output_stream: io.StringIO):
        super().__init__()
        line_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        self.setFormatter(line_format)
        self.log_output_stream = log_output_stream

    def emit(self, record):
        """Write one formatted record, newline-terminated, then flush."""
        stream = self.log_output_stream
        stream.write(f"{self.format(record)}\n")
        stream.flush()


class ConsoleAndGradioLogger(ILogger):
    """Logger that writes each message to the console and to a text stream.

    The text stream is what the UI reads to display the running log.
    """

    def __init__(self, log_output_stream: io.StringIO, level=logging.INFO):
        self._logger = logging.getLogger("EdgeCrawlerLogger")
        self._logger.setLevel(level)
        # getLogger returns the same named logger on every call, so handlers
        # attached by an earlier instance are dropped before adding new ones.
        if self._logger.hasHandlers():
            self._logger.handlers.clear()
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        )
        self._logger.addHandler(console_handler)
        gradio_handler = GradioLogHandler(log_output_stream)
        self._logger.addHandler(gradio_handler)

    def info(self, message: str):
        """Log *message* at INFO level."""
        self._logger.info(message)

    def warning(self, message: str):
        """Log *message* at WARNING level.

        The crawler and the UI entry point call ``logger.warning``; without
        this method those calls raised AttributeError.
        """
        self._logger.warning(message)

    def error(self, message: str):
        """Log *message* at ERROR level."""
        self._logger.error(message)

    def exception(self, message: str):
        """Log *message* at ERROR level together with the active traceback."""
        self._logger.exception(message)


class VisitedUrlManager:
    """Set-backed registry of URLs that have already been seen."""

    def __init__(self):
        self.visited: set[str] = set()

    def add(self, url: str):
        """Remember *url*; adding it again has no further effect."""
        self.visited.add(url)

    def contains(self, url: str) -> bool:
        """Return True when *url* was added before."""
        return url in self.visited

    def size(self) -> int:
        """Return how many distinct URLs are stored."""
        return len(self.visited)


class CrawlingStrategy(ABC):
    """Base class for frontier orderings (which URL gets processed next).

    Subclasses provide the container; this class handles de-duplication
    against the shared visited registry.
    """

    def __init__(self, visited_manager: VisitedUrlManager, logger: ILogger):
        self.visited = visited_manager
        self.logger = logger

    def add_links(self, links_info: list[tuple[str, int]]):
        """Queue every ``(url, redirect_count)`` entry not seen before.

        Each URL is marked visited as soon as it is accepted, so a URL that
        appears more than once in the same batch is queued only once (the
        first occurrence wins).
        """
        new_links = []
        for link_info in links_info:
            link_url = link_info[0]
            if self.visited.contains(link_url):
                continue
            self.visited.add(link_url)
            new_links.append(link_info)
        self._add_to_collection(new_links)

    def _add_to_collection(self, links):
        """Store already de-duplicated entries in the subclass container."""
        raise NotImplementedError

    def get_next(self) -> tuple[str, int]:
        """Remove and return the next ``(url, redirect_count)`` entry."""
        raise NotImplementedError

    def has_next(self) -> bool:
        """Return True while entries remain to be processed."""
        raise NotImplementedError

    def prime_with_frontier(self, frontier_urls_info: list[tuple[str, int]]):
        """Load entries restored from a previous session."""
        raise NotImplementedError

    def get_queue(self) -> list:
        """Return the pending entries as a list."""
        raise NotImplementedError


class BFSCrawlingStrategy(CrawlingStrategy):
    """Breadth-first ordering: entries leave in the order they were queued."""

    def __init__(self, visited_manager: VisitedUrlManager, logger: ILogger):
        super().__init__(visited_manager, logger)
        self.queue = deque()

    def _add_to_collection(self, links):
        self.queue.extend(links)

    def get_next(self) -> tuple[str, int]:
        """Pop the oldest pending entry."""
        return self.queue.popleft()

    def has_next(self) -> bool:
        return len(self.queue) > 0

    def prime_with_frontier(self, frontier_urls_info: list[tuple[str, int]]):
        """Queue entries restored from a previous session.

        Restored URLs are also registered as visited; otherwise a link to the
        same URL found later would pass the de-duplication check in
        ``add_links`` and the page would be queued and fetched a second time.
        """
        restored = list(frontier_urls_info)
        for restored_url, _ in restored:
            self.visited.add(restored_url)
        self.queue.extend(restored)

    def get_queue(self) -> list:
        """Return a list copy of the pending entries, oldest first."""
        return list(self.queue)


class DFSCrawlingStrategy(CrawlingStrategy):
    """Depth-first ordering: the most recently queued entry leaves first."""

    def __init__(self, visited_manager: VisitedUrlManager, logger: ILogger):
        super().__init__(visited_manager, logger)
        self.stack = []

    def _add_to_collection(self, links):
        self.stack.extend(links)

    def get_next(self) -> tuple[str, int]:
        """Pop the newest pending entry."""
        return self.stack.pop()

    def has_next(self) -> bool:
        return len(self.stack) > 0

    def prime_with_frontier(self, frontier_urls_info: list[tuple[str, int]]):
        """Push entries restored from a previous session.

        Restored URLs are also registered as visited; otherwise a link to the
        same URL found later would pass the de-duplication check in
        ``add_links`` and the page would be pushed and fetched a second time.
        """
        restored = list(frontier_urls_info)
        for restored_url, _ in restored:
            self.visited.add(restored_url)
        self.stack.extend(restored)

    def get_queue(self) -> list:
        """Return a list copy of the pending entries, bottom of stack first."""
        return list(self.stack)


class StateManager:
    """Persists the crawl frontier in a SQLite table so a crawl can resume.

    Database errors are logged through the injected logger and never raised.
    """

    def __init__(self, db_path: str, logger: ILogger):
        self.db_path = db_path
        self.logger = logger
        db_dir = os.path.dirname(self.db_path)
        # A bare filename has no directory part; makedirs("") would fail.
        if db_dir:
            os.makedirs(db_dir, exist_ok=True)
        self._execute_query(
            "CREATE TABLE IF NOT EXISTS crawl_frontier (URL TEXT UNIQUE, Redirects INTEGER)"
        )

    def _execute_query(self, query: str, params=None, fetch=False):
        """Run one statement on a fresh connection.

        Returns the fetched rows when *fetch* is true, otherwise None.
        Returns None as well when SQLite reports an error (after logging it).
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                # `with conn` only commits or rolls back the transaction; it
                # does not close the connection, hence the explicit finally.
                with conn:
                    cursor = conn.execute(query, params or [])
                    if fetch:
                        return cursor.fetchall()
            finally:
                conn.close()
        except sqlite3.Error as e:
            self.logger.error(f"StateManager DB error: {e}")
        return None

    def save_frontier(self, frontier_urls_info: list[tuple[str, int]]):
        """Replace the stored frontier with *frontier_urls_info*.

        The delete and the inserts share one transaction, so a failed insert
        rolls back and leaves the previously stored frontier in place.
        Duplicate URLs are ignored (first occurrence kept).
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                with conn:
                    conn.execute("DELETE FROM crawl_frontier")
                    if frontier_urls_info:
                        conn.executemany(
                            "INSERT OR IGNORE INTO crawl_frontier (URL, Redirects) VALUES (?, ?)",
                            frontier_urls_info,
                        )
            finally:
                conn.close()
        except sqlite3.Error as e:
            self.logger.error(f"Error saving frontier: {e}")

    def load_frontier(self) -> list[tuple[str, int]]:
        """Return the stored ``(url, redirects)`` rows, or [] if none/unreadable."""
        rows = self._execute_query(
            "SELECT URL, Redirects FROM crawl_frontier", fetch=True
        )
        return rows or []


class HttpClient:
    """Fetches pages through a retrying session without following redirects."""

    def __init__(self, config: CrawlerConfig, logger: ILogger):
        self.config = config
        self.logger = logger
        self.session = self._create_requests_session()

    def _create_requests_session(self):
        """Build a session whose http/https adapters retry on 5xx responses."""
        retrying_adapter = HTTPAdapter(
            max_retries=Retry(
                total=self.config.max_retries_request,
                backoff_factor=1,
                status_forcelist=[500, 502, 503, 504],
            )
        )
        new_session = requests.Session()
        for prefix in ("http://", "https://"):
            new_session.mount(prefix, retrying_adapter)
        return new_session

    def fetch(self, url: str) -> tuple[int, str, str | None]:
        """Request *url* once and describe the outcome.

        Returns ``(status, body, redirect_target)``:
        - 3xx: ``(status, "", absolute Location)``;
        - request failure: ``(-2, "Request Error: ...", None)``;
        - anything else: ``(status, response text, None)``.
        """
        chosen_agent = random.choice(self.config.user_agents)
        try:
            # Politeness delay before every request.
            time.sleep(self.config.min_request_delay)
            response = self.session.get(
                url,
                headers={"User-Agent": chosen_agent},
                timeout=self.config.request_timeout,
                allow_redirects=False,
            )
            status = response.status_code
            is_redirect = 300 <= status < 400
            if not is_redirect:
                return status, response.text, None
            target = urljoin(url, response.headers.get("Location"))
            return status, "", target
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request error for {url}: {e}")
            return -2, f"Request Error: {e}", None


class UrlFilter:
    """Decides whether a URL belongs to the crawl (host, path scope, type)."""

    def __init__(self, crawling_scope_path: str, base_domain: str):
        self.crawling_scope_path = crawling_scope_path
        self.base_domain = base_domain
        # Paths ending in one of these extensions are treated as non-pages.
        self.file_extension_pattern = re.compile(
            r"\.(pdf|jpg|jpeg|png|gif|zip|rar|mp3|mp4|svg|xml|css|js|webp|ico)$",
            re.IGNORECASE,
        )

    def is_valid_for_crawling(self, url: str) -> bool:
        """Return True for http(s) URLs on the base domain, inside the
        crawling scope path, whose path has no excluded file extension.

        Any error while inspecting the URL counts as "not valid".
        """
        try:
            parts = urlparse(url)
            if parts.scheme not in ("http", "https"):
                return False
            if parts.netloc != self.base_domain:
                return False
            if not parts.path.startswith(self.crawling_scope_path):
                return False
            return self.file_extension_pattern.search(parts.path) is None
        except Exception:
            return False


class LinkExtractor:
    """Pulls normalized, in-scope link targets out of an HTML document."""

    def __init__(self, url_filter: UrlFilter):
        self.url_filter = url_filter

    def normalize_url(self, url: str) -> str:
        """Strip ALL query parameters and fragments from a URL.

        A URL that names a host but has an empty path (``https://host``) is
        returned with the root path (``https://host/``), so both spellings of
        the site root map to one URL and pass a path-prefix scope check.

        Raises ValueError when *url* cannot be parsed.
        """
        parts = urlparse(url)
        path = parts.path
        if parts.netloc and not path:
            path = "/"
        return parts._replace(path=path, query="", fragment="").geturl()

    def extract_links(self, base_url: str, html_content: str) -> set[str]:
        """Return the set of normalized links in *html_content* that the
        URL filter accepts, resolved against *base_url*."""
        links = set()
        soup = BeautifulSoup(html_content, "lxml")
        for a_tag in soup.find_all("a", href=True):
            try:
                absolute_link = urljoin(base_url, a_tag["href"])
                normalized_link = self.normalize_url(absolute_link)
            except ValueError:
                # A malformed href (e.g. an unterminated IPv6 literal) cannot
                # be parsed; skip that one link instead of failing the page.
                continue
            if self.url_filter.is_valid_for_crawling(normalized_link):
                links.add(normalized_link)
        return links


class EdgeCrawler:
    """Walks the frontier, records in-scope FROM/TO edges and persists them.

    Edges are buffered in memory and appended to the CSV in batches; the
    frontier is written to the state store at the same moments.
    """

    def __init__(
        self,
        config: CrawlerConfig,
        crawling_strategy: CrawlingStrategy,
        state_manager: StateManager,
        http_client: HttpClient,
        link_extractor: LinkExtractor,
        logger: ILogger,
    ):
        self.config = config
        self.crawling_strategy = crawling_strategy
        self.state_manager = state_manager
        self.http_client = http_client
        self.link_extractor = link_extractor
        self.logger = logger
        self.edge_buffer = []
        self.pages_processed_session = 0

    def _process_page_for_edges(self, from_url: str, num_redirects: int):
        """Fetch one URL, buffer its in-scope edges and queue what it links to.

        A 2xx response with a body has its links extracted: every link is
        offered to the crawling strategy, but an edge is buffered only when
        both the page path and the link path start with the saving scope.
        A redirect target is normalized, filtered and queued with the
        redirect counter incremented. Anything else is only logged.
        """
        if num_redirects >= self.config.max_redirects:
            self.logger.warning(f"Max redirects for {from_url}. Skipping.")
            return

        status, content, redirect_url = self.http_client.fetch(from_url)
        self.logger.info(f"Processed {from_url} [{status}]")

        if 200 <= status < 300 and content:
            linked_urls = self.link_extractor.extract_links(from_url, content)
            saving_scope = self.config.saving_scope_path

            if urlparse(from_url).path.startswith(saving_scope):
                self.edge_buffer.extend(
                    {"FROM": from_url, "TO": to_url}
                    for to_url in linked_urls
                    if urlparse(to_url).path.startswith(saving_scope)
                )

            # Links found on a page start a fresh redirect chain.
            self.crawling_strategy.add_links([(link, 0) for link in linked_urls])

        elif redirect_url:
            normalized_redirect = self.link_extractor.normalize_url(redirect_url)
            if self.link_extractor.url_filter.is_valid_for_crawling(
                normalized_redirect
            ):
                self.crawling_strategy.add_links(
                    [(normalized_redirect, num_redirects + 1)]
                )

    def _save_edges_to_csv(self):
        """Append the buffered edges to the CSV and empty the buffer."""
        if not self.edge_buffer:
            return

        output_path = self.config.edge_list_path
        # A missing file and an existing-but-empty file both need the header;
        # a header-less CSV cannot be read back by column name.
        write_header = (
            not os.path.exists(output_path) or os.path.getsize(output_path) == 0
        )

        df = pd.DataFrame(self.edge_buffer, columns=["FROM", "TO"])
        df.to_csv(output_path, mode="a", header=write_header, index=False)

        self.logger.info(
            f"✅ Saved a batch of {len(self.edge_buffer)} edges to {output_path}"
        )
        self.edge_buffer = []
        gc.collect()

    def _persist_state(self):
        """Flush buffered edges and store the current frontier."""
        self._save_edges_to_csv()
        self.state_manager.save_frontier(self.crawling_strategy.get_queue())

    def crawl(self):
        """Process pages until the page limit or an empty frontier.

        Generator: yields a progress string after every page and a final
        summary string after the last save.
        """
        pbar = tqdm(total=self.config.max_pages_to_crawl, desc="Processing Pages")
        try:
            while self.pages_processed_session < self.config.max_pages_to_crawl:
                if not self.crawling_strategy.has_next():
                    self.logger.info("Frontier is empty. Stopping crawl.")
                    break

                url_data = self.crawling_strategy.get_next()
                self._process_page_for_edges(url_data[0], url_data[1])
                self.pages_processed_session += 1
                pbar.update(1)

                if len(self.edge_buffer) >= self.config.save_interval_edges:
                    self._persist_state()

                yield f"Processed {self.pages_processed_session}/{self.config.max_pages_to_crawl} pages."
        finally:
            # Runs on normal completion, on an error inside the loop and when
            # the consumer closes the generator early, so the progress bar is
            # always released and buffered edges/frontier are not lost.
            pbar.close()
            self.logger.info("Crawl finished. Performing final save...")
            self._persist_state()

        yield f"Crawl finished. Processed {self.pages_processed_session} pages in this session."


#
#
# --- Main Application Logic ---
#
#


def run_edge_crawler_interface(
    initial_start_url: str,
    crawling_scope_path: str,
    saving_scope_path: str,
    crawling_strategy_type: str,
    state_db_path_input: str,
    edge_list_path_input: str,
    max_pages_to_crawl: int,
):
    """Run one crawl session for the UI.

    Generator: every yield is a ``(status message, log text, summary
    markdown)`` tuple matching the three UI outputs. Any exception is logged
    and reported through a final "Crawl Failed!" tuple instead of propagating.

    ``crawling_strategy_type`` selects BFS when it equals ``"BFS"``; any
    other value selects DFS.
    """
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(log_stream)

    try:
        base_domain = urlparse(initial_start_url).netloc
        if not base_domain:
            raise ValueError("Invalid Initial Start URL.")

        config = CrawlerConfig(
            initial_start_url=initial_start_url,
            crawling_scope_path=crawling_scope_path,
            saving_scope_path=saving_scope_path,
            state_db_path=state_db_path_input,
            edge_list_path=edge_list_path_input,
            # The UI number field may hand over a float (e.g. 1000.0); the
            # page limit is used as a count and shown in status messages.
            max_pages_to_crawl=int(max_pages_to_crawl),
            base_domain=base_domain,
        )
        # A bare filename has no directory part; makedirs("") would raise.
        edge_list_dir = os.path.dirname(config.edge_list_path)
        if edge_list_dir:
            os.makedirs(edge_list_dir, exist_ok=True)
        yield "Initializing...", log_stream.getvalue(), ""

        state_manager = StateManager(config.state_db_path, logger)
        visited_manager = VisitedUrlManager()

        # URLs that appear as an edge source were definitely fetched before.
        # URLs that appear only as a target may merely have been discovered.
        crawled_urls = set()

        logger.info("Rebuilding visited set from existing edge list CSV...")
        try:
            if os.path.exists(config.edge_list_path):
                edge_df = pd.read_csv(config.edge_list_path, low_memory=False)
                crawled_urls = set(edge_df["FROM"].unique())
                all_urls_in_graph = set(
                    pd.concat([edge_df["FROM"], edge_df["TO"]]).unique()
                )
                for url in all_urls_in_graph:
                    visited_manager.add(url)
                logger.info(
                    f"Rebuilt visited set with {visited_manager.size()} URLs from CSV."
                )
        except Exception as e:
            logger.warning(
                f"Could not rebuild visited set from CSV (may be a new crawl): {e}"
            )

        if crawling_strategy_type == "BFS":
            crawling_strategy = BFSCrawlingStrategy(visited_manager, logger)
        else:
            crawling_strategy = DFSCrawlingStrategy(visited_manager, logger)

        loaded_frontier = state_manager.load_frontier()
        # Only drop frontier entries that were already fetched. Filtering
        # against the whole visited set would also drop pages that were merely
        # linked to (present as TO in the CSV) and are still waiting in the
        # frontier, so they would never be crawled after a resume.
        pending_frontier = [
            info for info in loaded_frontier if info[0] not in crawled_urls
        ]

        if pending_frontier:
            # Register restored URLs as seen so links to them found during
            # this session are not queued a second time.
            for pending_url, _ in pending_frontier:
                visited_manager.add(pending_url)
            crawling_strategy.prime_with_frontier(pending_frontier)
        elif not visited_manager.contains(config.initial_start_url):
            crawling_strategy.add_links([(config.initial_start_url, 0)])

        url_filter = UrlFilter(config.crawling_scope_path, config.base_domain)
        link_extractor = LinkExtractor(url_filter)
        crawler = EdgeCrawler(
            config,
            crawling_strategy,
            state_manager,
            HttpClient(config, logger),
            link_extractor,
            logger,
        )

        final_status = ""
        for status_msg in crawler.crawl():
            final_status = status_msg
            yield status_msg, log_stream.getvalue(), ""

        logger.info("Generating final summary from CSV file...")
        summary_md = f"## Crawl Session Finished\n\n- **Status**: {final_status}\n- **Edge List Location**: `{config.edge_list_path}`"
        try:
            if os.path.exists(config.edge_list_path):
                edge_df = pd.read_csv(config.edge_list_path)
                num_edges = len(edge_df)
                num_nodes = len(pd.concat([edge_df["FROM"], edge_df["TO"]]).unique())
                summary_md += f"\n- **Total Unique Pages (Nodes):** {num_nodes}\n- **Total Links (Edges):** {num_edges}"
        except Exception as e:
            logger.error(f"Could not generate summary from CSV: {e}")
            summary_md += "\n\n**Could not generate summary from CSV file.**"

        yield final_status, log_stream.getvalue(), summary_md

    except Exception as e:
        logger.exception(f"A critical error occurred: {e}")
        yield "Crawl Failed!", log_stream.getvalue(), f"**Error:** {e}"


#
#
# --- Final Gradio UI ---
#
#
# Layout: a configuration column on the left and a status/results column on
# the right. Input defaults are read from the CrawlerConfig class attributes.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🕸️ Link Graph Extractor")
    gr.Markdown(
        "This tool crawls a website to produce a simple `FROM, TO` list of all hyperlinks, saved as a CSV file."
    )

    with gr.Row():
        # Left column: every crawl parameter plus the start button.
        with gr.Column(scale=1):
            gr.Markdown("## Configuration")
            initial_url_input = gr.Textbox(
                label="Initial Start URL", value=CrawlerConfig.initial_start_url
            )
            max_pages_input = gr.Number(
                label="Maximum Pages to Process",
                value=CrawlerConfig.max_pages_to_crawl,
                minimum=1,
                step=100,
            )
            crawling_strategy_radio = gr.Radio(
                choices=["BFS", "DFS"], label="Crawling Strategy", value="BFS"
            )

            gr.Markdown("### 📜 Scopes")
            crawling_scope_path_input = gr.Textbox(
                label="Crawling Scope Path",
                value=CrawlerConfig.crawling_scope_path,
                info="The 'playground'. Set to '/' to explore the entire site.",
            )
            saving_scope_path_input = gr.Textbox(
                label="Saving Scope Path",
                value=CrawlerConfig.saving_scope_path,
                info="The 'rulebook'. Only save links where FROM and TO are in this path.",
            )

            gr.Markdown("### 💾 Storage Paths")
            state_db_path_input = gr.Textbox(
                label="Crawl State DB Path (SQLite)", value=CrawlerConfig.state_db_path
            )
            edge_list_path_input = gr.Textbox(
                label="Output Edge List Path (CSV)", value=CrawlerConfig.edge_list_path
            )

            start_button = gr.Button("🚀 Start Extraction", variant="primary")

        # Right column: the three outputs filled by run_edge_crawler_interface.
        with gr.Column(scale=2):
            gr.Markdown("## Status & Results")
            status_message_output = gr.Textbox(
                label="Status Message", interactive=False
            )
            logs_output = gr.Textbox(
                label="Crawler Logs", interactive=False, lines=15, max_lines=20
            )
            summary_output = gr.Markdown("---")

    # The input order below must match the parameter order of
    # run_edge_crawler_interface; the outputs match the tuples it yields.
    start_button.click(
        fn=run_edge_crawler_interface,
        inputs=[
            initial_url_input,
            crawling_scope_path_input,
            saving_scope_path_input,
            crawling_strategy_radio,
            state_db_path_input,
            edge_list_path_input,
            max_pages_input,
        ],
        outputs=[status_message_output, logs_output, summary_output],
    )

# Start the app when this cell/script runs.
demo.launch(debug=True)