In [None]:
# Installing necessary packages

!pip install selenium webdriver_manager
!pip install tqdm
!pip install polars
!pip install torch
!pip install gradio
from urllib.parse import urlparse, parse_qs

Collecting selenium
  Downloading selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.13.2 (from selenium)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.33.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Standard library
import html  # For decoding entity-encoded href values
import io # For capturing logs
import logging
import os # Added for path manipulation and existence checks
import random
import re
import shutil # Added for file copying
import sqlite3
import time
from abc import ABC, abstractmethod
from collections import deque
from dataclasses import dataclass, field
from urllib.parse import urlparse, urljoin, parse_qs

# Third-party
import gradio as gr
import pandas as pd
import requests
from tqdm import tqdm

# --- Google Colab Specific Setup (REQUIRED for Google Drive access) ---
# Mounts Google Drive at /content/drive/ (the default CrawlerConfig.db_path lives under it).
# IMPORTANT: run this cell before anything that reads or writes the database.
# Colab shows a popup asking for authentication when the drive is mounted.
try:
    from google.colab import drive
    drive.mount('/content/drive/')
    print("Google Drive mounted successfully.")
except ImportError:
    # google.colab could not be imported: report it and continue without a mounted drive.
    print("Not running in Google Colab environment or google.colab not found. Skipping Google Drive mount.")
except Exception as e:
    # Any other failure is reported instead of raised, so the rest of the cell still runs.
    print(f"Error mounting Google Drive: {e}. Ensure you run this code in a Colab notebook and follow the authentication steps.")

# Retry support for the requests session (automatic retries on transient failures and retryable status codes)
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


# --- Configuration Constants (Encapsulated in a dataclass) ---
def _default_user_agents() -> list[str]:
    """Build a fresh list of the User-Agent strings the HTTP client picks from."""
    return [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/52.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
    ]


@dataclass
class CrawlerConfig:
    """All tunable crawler settings in one place.

    Every field has a default, so ``CrawlerConfig()`` is usable as-is and
    individual values can be overridden by keyword.
    """

    # --- Storage ---
    # SQLite file; the default points into the mounted Google Drive.
    db_path: str = "/content/drive/My Drive/master_july_2025/data/db_data.db"
    table_name: str = 'crawled_data'

    # --- Politeness: bounds (in seconds) for the delay between requests ---
    min_request_delay: float = 1.0
    max_request_delay: float = 30.0

    # --- Crawl limits and persistence cadence ---
    # Upper bound on the number of pages the crawl attempts.
    max_pages_to_crawl: int = 700
    # Number of buffered rows that triggers a write to the database.
    save_interval_pages: int = 10
    # Page interval at which a database snapshot is copied to /tmp for download.
    download_interval_pages: int = 100

    # --- HTTP behaviour ---
    # Retries for a single HTTP request.
    max_retries_request: int = 3
    # Redirect hops followed for a single URL.
    max_redirects: int = 2
    # Per-request timeout, in seconds.
    request_timeout: int = 15

    # --- Scope ---
    # Path segment a URL must contain to be crawled.
    allowed_path_segment: str = "/blog/"
    # Default start URL (can be overridden in the UI).
    initial_start_url: str = 'https://example-website.com/'
    # User-Agent strings; each instance gets its own copy of the list.
    user_agents: list[str] = field(default_factory=_default_user_agents)
    # Host the crawl is restricted to.
    base_domain: str = ""

# --- Abstract Base Classes for SOLID Principles ---

class ILogger(ABC):
    """Logging interface the crawler components are written against.

    Concrete loggers must implement all five level methods; each takes the
    message text to record.
    """

    @abstractmethod
    def debug(self, message: str):
        """Record ``message`` at debug level."""

    @abstractmethod
    def info(self, message: str):
        """Record ``message`` at info level."""

    @abstractmethod
    def warning(self, message: str):
        """Record ``message`` at warning level."""

    @abstractmethod
    def error(self, message: str):
        """Record ``message`` at error level."""

    @abstractmethod
    def exception(self, message: str):
        """Record ``message`` through the logger's exception method."""

class GradioLogHandler(logging.Handler):
    """Logging handler that appends formatted records to an in-memory text stream.

    The stream is kept on ``log_output_stream`` so the accumulated log text can
    be read back later.
    """

    # Timestamp, level and message, separated by " - ".
    _LINE_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'

    def __init__(self, log_output_stream: io.StringIO):
        """Attach the handler to ``log_output_stream`` and install the default line format."""
        super().__init__()
        self.log_output_stream = log_output_stream
        self.setFormatter(logging.Formatter(self._LINE_FORMAT))

    def emit(self, record):
        """Write the formatted ``record`` as one line, then flush the stream."""
        sink = self.log_output_stream
        sink.write(f"{self.format(record)}\n")
        sink.flush()

class ConsoleAndGradioLogger(ILogger):
    """``ILogger`` implementation that logs to the console and to a text stream.

    It configures the shared ``"CrawlerLogger"`` logger. Handlers left over from
    an earlier instance are removed first, then a console handler and a
    ``GradioLogHandler`` are attached, in that order.
    """

    def __init__(self, log_output_stream: io.StringIO, level=logging.DEBUG):
        """Configure the logger.

        Args:
            log_output_stream: Stream that receives a copy of every log line.
            level: Minimum level the logger lets through.
        """
        crawler_logger = logging.getLogger("CrawlerLogger")
        crawler_logger.setLevel(level)

        # Start from a clean slate: drop whatever handlers are already attached.
        for stale_handler in list(crawler_logger.handlers):
            crawler_logger.removeHandler(stale_handler)

        to_console = logging.StreamHandler()
        to_console.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

        # The stream handler is attached last, after the console handler.
        for handler in (to_console, GradioLogHandler(log_output_stream)):
            crawler_logger.addHandler(handler)

        self._logger = crawler_logger

    def debug(self, message: str):
        """Forward ``message`` at debug level."""
        self._logger.debug(message)

    def info(self, message: str):
        """Forward ``message`` at info level."""
        self._logger.info(message)

    def warning(self, message: str):
        """Forward ``message`` at warning level."""
        self._logger.warning(message)

    def error(self, message: str):
        """Forward ``message`` at error level."""
        self._logger.error(message)

    def exception(self, message: str):
        """Forward ``message`` through ``Logger.exception``."""
        self._logger.exception(message)

class VisitedUrlManager(ABC):
    """Interface for the record of URLs the crawler has already seen."""

    @abstractmethod
    def add(self, url: str):
        """Record ``url`` as visited."""

    @abstractmethod
    def contains(self, url: str) -> bool:
        """Tell whether ``url`` has been recorded."""

    @abstractmethod
    def clear(self):
        """Forget every recorded URL."""

    @abstractmethod
    def size(self) -> int:
        """Report how many URLs are recorded."""

class InMemoryVisitedUrlManager(VisitedUrlManager):
    """Visited-URL record backed by a plain ``set`` held in ``self.visited``."""

    def __init__(self):
        """Start with nothing visited."""
        self.visited = set()

    def add(self, url: str):
        """Record ``url``; adding the same URL again has no further effect."""
        self.visited.add(url)

    def contains(self, url: str) -> bool:
        """Return ``True`` when ``url`` has been recorded."""
        return url in self.visited

    def clear(self):
        """Empty the record."""
        self.visited.clear()

    def size(self) -> int:
        """Return the number of recorded URLs."""
        return len(self.visited)

class CrawlingStrategy(ABC):
    """Base class for the frontier that decides which URL is crawled next.

    Entries are ``(url, redirect_count)`` pairs. The constructor records the
    start URL in the visited manager; subclasses provide the container and
    its ordering.
    """

    def __init__(self, start_url_info: tuple[str, int], visited_manager: VisitedUrlManager, logger: ILogger):
        """Store the collaborators and mark the start URL as visited.

        Args:
            start_url_info: ``(url, redirect_count)`` pair for the first URL.
            visited_manager: Shared record of URLs already seen.
            logger: Logger for diagnostics.
        """
        self.visited = visited_manager
        self.logger = logger

        start_url = start_url_info[0]
        if self.visited.contains(start_url):
            # Already known to the visited manager; nothing to record.
            return
        self.visited.add(start_url)
        self.logger.debug(f"Added initial URL {start_url} to visited manager.")

    @abstractmethod
    def add_links(self, links_info: list[tuple[str, int]]):
        """Offer newly discovered ``(url, redirect_count)`` pairs to the frontier."""

    @abstractmethod
    def get_next(self) -> tuple[str, int]:
        """Remove and return the next ``(url, redirect_count)`` pair."""

    @abstractmethod
    def has_next(self) -> bool:
        """Tell whether any entry is waiting."""

    @abstractmethod
    def count(self) -> int:
        """Report how many entries are waiting."""

class BFSCrawlingStrategy(CrawlingStrategy):
    """Breadth-first frontier: entries leave in the order they were added (FIFO)."""

    def __init__(self, start_url_info: tuple[str, int], visited_manager: VisitedUrlManager, logger: ILogger):
        """Seed the queue with the start entry."""
        super().__init__(start_url_info, visited_manager, logger)
        self.queue = deque([start_url_info])
        self.logger.debug(f"BFS strategy initialized with queue: {self.queue}")

    def add_links(self, links_info: list[tuple[str, int]]):
        """Append every not-yet-visited entry to the back of the queue.

        Each accepted URL is marked visited immediately, so it cannot be
        queued a second time.
        """
        fresh_entries = []
        for entry in links_info:
            url, redirect_count = entry
            if self.visited.contains(url):
                continue
            self.visited.add(url)
            fresh_entries.append((url, redirect_count))
        # Enqueue in one step once the whole batch has been examined.
        self.queue.extend(fresh_entries)
        self.logger.debug(f"Added {len(fresh_entries)} new links to BFS queue.")

    def get_next(self) -> tuple[str, int]:
        """Pop the oldest entry.

        Raises:
            IndexError: If the queue is empty.
        """
        if self.has_next():
            entry = self.queue.popleft()
            self.logger.debug(f"Getting next from BFS: {entry[0]}")
            return entry
        raise IndexError("Queue is empty, cannot get next URL.")

    def has_next(self) -> bool:
        """Return ``True`` while the queue holds at least one entry."""
        return bool(self.queue)

    def count(self) -> int:
        """Return the number of queued entries."""
        return len(self.queue)

class DFSCrawlingStrategy(CrawlingStrategy):
    """Depth-first frontier: the most recently added entry leaves first (LIFO)."""

    def __init__(self, start_url_info: tuple[str, int], visited_manager: VisitedUrlManager, logger: ILogger):
        """Seed the stack with the start entry."""
        super().__init__(start_url_info, visited_manager, logger)
        self.stack = [start_url_info]
        self.logger.debug(f"DFS strategy initialized with stack: {self.stack}")

    def add_links(self, links_info: list[tuple[str, int]]):
        """Push every not-yet-visited entry onto the stack.

        Each accepted URL is marked visited immediately, so it cannot be
        pushed a second time.
        """
        fresh_entries = []
        for entry in links_info:
            url, redirect_count = entry
            if self.visited.contains(url):
                continue
            self.visited.add(url)
            fresh_entries.append((url, redirect_count))
        # Push in one step once the whole batch has been examined.
        self.stack.extend(fresh_entries)
        self.logger.debug(f"Added {len(fresh_entries)} new links to DFS stack.")

    def get_next(self) -> tuple[str, int]:
        """Pop the newest entry.

        Raises:
            IndexError: If the stack is empty.
        """
        if self.has_next():
            entry = self.stack.pop()
            self.logger.debug(f"Getting next from DFS: {entry[0]}")
            return entry
        raise IndexError("Stack is empty, cannot get next URL.")

    def has_next(self) -> bool:
        """Return ``True`` while the stack holds at least one entry."""
        return bool(self.stack)

    def count(self) -> int:
        """Return the number of stacked entries."""
        return len(self.stack)

class DatabaseManager:
    """SQLite storage for crawl results.

    Every operation opens its own connection to ``db_path`` and closes it
    before returning, so no connection is held between calls.

    Attributes:
        db_path: Filesystem path of the SQLite database file.
        table_name: Table that receives the crawled rows.
        logger: Logger used for diagnostics and error reporting.
        base_domain: Host used by ``get_all_urls`` to filter stored URLs.
            An empty string disables the filter.
    """

    def __init__(self, db_path: str, table_name: str, logger: ILogger, base_domain: str = ""):
        """Store the settings and make sure the database directory exists.

        Raises:
            OSError: If the directory holding ``db_path`` cannot be created.
        """
        self.db_path = db_path
        self.table_name = table_name
        self.logger = logger
        self.base_domain = base_domain # Store base_domain for filtering
        self._ensure_db_directory_exists()

    def _ensure_db_directory_exists(self):
        """Create the parent directory of ``db_path`` when it is missing."""
        db_dir = os.path.dirname(self.db_path)
        if db_dir and not os.path.exists(db_dir):
            try:
                os.makedirs(db_dir, exist_ok=True)
                self.logger.info(f"Created database directory: {db_dir}")
            except OSError as e:
                self.logger.error(f"Error creating database directory {db_dir}: {e}")
                raise

    def _run_query(self, query_to_execute: str, fetch_results: bool = False) -> list | None:
        """Execute one statement on a fresh connection, letting errors propagate.

        Returns:
            The fetched rows when ``fetch_results`` is true, otherwise ``None``
            after committing.

        Raises:
            sqlite3.Error: If connecting or executing fails. The transaction is
                rolled back before the error is re-raised.
        """
        db_connection = sqlite3.connect(self.db_path)
        try:
            cursor = db_connection.cursor()
            self.logger.debug(f"Executing SQL query: {query_to_execute}")
            cursor.execute(query_to_execute)
            if fetch_results:
                return cursor.fetchall()
            db_connection.commit()
            return None
        except sqlite3.Error:
            db_connection.rollback()
            raise
        finally:
            db_connection.close()

    def _execute_query(self, query_to_execute: str, fetch_results: bool = False) -> list | None:
        """Execute one statement, logging database errors instead of raising.

        Returns:
            The fetched rows when ``fetch_results`` is true. ``None`` for a
            successful statement without results and also on any database
            error, so callers that must tell success from failure should use
            ``_run_query`` instead.
        """
        try:
            return self._run_query(query_to_execute, fetch_results)
        except sqlite3.Error as e:
            self.logger.error(f"Database error in _execute_query for '{self.db_path}': {e}")
            return None

    def ensure_table_exists(self):
        """Create the results table if it is missing and log the real outcome."""
        query = f"CREATE TABLE IF NOT EXISTS {self.table_name} (URL TEXT UNIQUE, Status_Code INTEGER, Content TEXT)"
        try:
            # _execute_query returns None on success and on failure alike, so the
            # raising variant is the only way to know whether the statement worked.
            self._run_query(query)
        except sqlite3.Error as e:
            self.logger.error(f"Failed to ensure table '{self.table_name}' exists: {e}")
            return
        self.logger.info(f"Table '{self.table_name}' ensured to exist in '{self.db_path}'.")

    @staticmethod
    def _insert_or_replace(pd_table, conn, keys, data_iter) -> int:
        """Insertion callback for ``DataFrame.to_sql`` that upserts rows.

        pandas calls this with its table wrapper, the object it executes
        statements on, the column names and an iterator of row tuples.
        ``INSERT OR REPLACE`` makes a row whose URL is already stored overwrite
        the old row instead of failing the whole batch on the UNIQUE constraint.

        Returns:
            The number of rows handed to the database.
        """
        def quote(identifier) -> str:
            # Standard SQL identifier quoting: wrap in double quotes, double any inside.
            return '"' + str(identifier).replace('"', '""') + '"'

        column_list = ", ".join(quote(key) for key in keys)
        placeholders = ", ".join("?" for _ in keys)
        statement = f"INSERT OR REPLACE INTO {quote(pd_table.name)} ({column_list}) VALUES ({placeholders})"
        rows = list(data_iter)
        conn.executemany(statement, rows)
        return len(rows)

    def write_dataframe(self, df: pd.DataFrame, chunksize: int | None = None) -> bool:
        """Write the rows of ``df`` to the table, replacing rows that already exist.

        Args:
            df: Rows to store; column names must match the table's columns.
            chunksize: Optional number of rows pandas writes per batch.

        Returns:
            ``True`` when the rows were committed, ``False`` when an error was
            logged and the transaction rolled back.
        """
        conn = None
        try:
            conn = sqlite3.connect(self.db_path)
            df.to_sql(
                name=self.table_name,
                con=conn,
                if_exists='append',
                index=False,
                chunksize=chunksize,
                method=self._insert_or_replace,
            )
            conn.commit()
            self.logger.info(f"Successfully wrote {len(df)} rows to table '{self.table_name}'.")
            return True
        except sqlite3.Error as e:
            self.logger.error(f"Database error during write_dataframe for '{self.db_path}': {e}")
            if conn: conn.rollback()
            return False
        except Exception as e:
            self.logger.error(f"An unexpected error occurred during write_dataframe for '{self.db_path}': {e}")
            if conn: conn.rollback()
            return False
        finally:
            if conn: conn.close()

    def count_rows(self) -> int:
        """Return the number of rows in the table, or 0 when it cannot be read."""
        query = f"SELECT COUNT(*) FROM {self.table_name}"
        result = self._execute_query(query, fetch_results=True)
        if result and len(result) > 0 and len(result[0]) > 0:
            row_count = result[0][0]
            self.logger.info(f"Counted {row_count} rows in table '{self.table_name}'.")
            return row_count
        else:
            self.logger.error(f"Could not retrieve row count for table '{self.table_name}'. Result: {result}")
            return 0

    def get_all_urls(self) -> tuple[set[str], str | None]:
        """
        Fetches all URLs from the table in insertion order, keeps those whose
        host equals base_domain (all of them when base_domain is empty), and
        returns them as a set together with the last kept URL.

        Returns ``(set(), None)`` when the table is empty or cannot be read.
        """
        # ORDER BY rowid makes "last" mean the most recently written row
        # instead of whatever order SQLite happens to return.
        query = f"SELECT URL FROM {self.table_name} ORDER BY rowid"
        all_urls_data = self._execute_query(query, fetch_results=True)

        filtered_url_strings = []
        last_valid_url = None

        if all_urls_data:
            for url_tuple in all_urls_data:
                url_string = url_tuple[0]
                # Apply the base_domain filter when loading from DB; an empty
                # base_domain means "no filter" rather than "match nothing".
                parsed_url = urlparse(url_string)
                if not self.base_domain or parsed_url.netloc == self.base_domain:
                    filtered_url_strings.append(url_string)
                    last_valid_url = url_string # Keep track of the last valid URL encountered

            self.logger.info(f"Fetched {len(all_urls_data)} URLs from DB. Loaded {len(filtered_url_strings)} URLs for current domain '{self.base_domain}'. Last valid URL: {last_valid_url}")
            return set(filtered_url_strings), last_valid_url
        else:
            self.logger.info("No URLs found in the database.")
            return set(), None

class HttpClient:
    """Rate-limited HTTP fetcher built on a ``requests`` session with retries.

    A delay is slept before every request and adapted afterwards: shortened
    after fast responses, lengthened after slow ones, and always kept within
    the configured minimum and maximum.
    """

    def __init__(self, config: CrawlerConfig, logger: ILogger):
        """Create the session and start at the minimum request delay."""
        self.config = config
        self.logger = logger
        self.session = self._create_requests_session()
        self.current_delay = self.config.min_request_delay

    def _create_requests_session(self):
        """Build a session whose http/https adapters retry failed requests."""
        retry_strategy = Retry(
            total=self.config.max_retries_request,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "PUT", "DELETE", "OPTIONS", "TRACE"]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session = requests.Session()
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        # No timeout is set here: requests ignores a ``timeout`` attribute on the
        # session, so the timeout is passed with each request in ``fetch``.
        self.logger.debug("Requests session created with retry strategy.")
        return session

    def adjust_delay(self, response_time: float):
        """Adapt the inter-request delay to the last response time.

        Under 2 seconds the delay shrinks by 10%, over 5 seconds it grows by
        10%, and in between it is left alone. The result is clamped to the
        configured minimum and maximum.
        """
        if response_time < 2:
            self.current_delay = max(self.config.min_request_delay, self.current_delay * 0.9)
        elif response_time > 5:
            self.current_delay = min(self.config.max_request_delay, self.current_delay * 1.1)
        self.logger.debug(f"Adjusted delay to: {self.current_delay:.2f}s (Response time: {response_time:.2f}s)")

    def fetch(self, url: str) -> tuple[int, str, str | None, float]:
        """Fetch ``url`` once, without following redirects.

        Returns:
            ``(status_code, content, redirect_url, response_time)``.
            ``content`` is the body for 2xx responses and empty otherwise.
            ``redirect_url`` is the absolute ``Location`` target for 3xx
            responses, else ``None``. Failures are reported through a negative
            status code with the reason in ``content``: -1 timeout, -2 request
            error, -3 unexpected error, -4 retries exhausted.
        """
        headers = {'User-Agent': random.choice(self.config.user_agents)}
        response_status_code = 0
        response_content = ""
        redirect_url = None
        response_time = 0.0
        try:
            time.sleep(self.current_delay)
            start_time = time.time()
            # The timeout must be given per request; without it a stalled
            # server would block the crawl indefinitely.
            response = self.session.get(
                url,
                headers=headers,
                allow_redirects=False,
                timeout=self.config.request_timeout,
            )
            response_time = time.time() - start_time
            self.adjust_delay(response_time)
            response_status_code = response.status_code
            self.logger.debug(f"Request to {url} completed in {response_time:.2f}s with status {response_status_code}")
            if 200 <= response_status_code < 300:
                response_content = response.text
            elif 300 <= response_status_code < 400:
                redirect_url = response.headers.get('Location')
                # A Location without a scheme is resolved against the requested URL.
                if redirect_url and not urlparse(redirect_url).scheme:
                    redirect_url = urljoin(url, redirect_url)
                self.logger.info(f"Redirect from {url} to: {redirect_url} (Status: {response_status_code})")
            else:
                self.logger.warning(f"Failed to fetch {url} (Status: {response_status_code})")
            return response_status_code, response_content, redirect_url, response_time
        except requests.exceptions.RetryError as e:
            self.logger.error(f"Max retries exceeded for {url}: {e}")
            return -4, f"Max retries exceeded: {e}", None, 0.0
        except requests.exceptions.Timeout:
            self.logger.error(f"Timeout occurred for {url}.")
            return -1, "Timeout", None, 0.0
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Persistent request error for {url}: {e}")
            return -2, f"Persistent Request Error: {e}", None, 0.0
        except Exception as e:
            self.logger.error(f"An unexpected error occurred processing {url}: {e}")
            return -3, f"Unexpected Error: {e}", None, 0.0

class UrlFilter:
    """Decides whether a URL belongs to the crawl.

    A URL is accepted only when it is http(s), is hosted on ``base_domain``,
    contains ``allowed_path_segment`` in its path, and is not a binary
    download, a user profile page, a fragment link, a Twitter link or a link
    carrying ``utm_*`` tracking parameters.
    """

    def __init__(self, allowed_path_segment: str, base_domain: str, logger: ILogger):
        """Store the scope settings and compile the exclusion patterns."""
        self.allowed_path_segment = allowed_path_segment
        self.base_domain = base_domain
        self.logger = logger
        # Paths ending in one of these extensions are skipped.
        self.file_extension_pattern = re.compile(r'\.(jpg|jpeg|png|gif|pdf|doc|xls|zip|rar|mp3|mp4)$', re.IGNORECASE)
        # Paths ending in /user/<name> (optionally with a trailing slash) are skipped.
        self.user_profile_pattern = re.compile(r'/user/[a-zA-Z0-9_-]+/?$')

    def is_valid(self, url: str) -> bool:
        """Return ``True`` when ``url`` passes every filter rule.

        A URL that cannot be parsed is rejected instead of raising.
        """
        try:
            parsed_url = urlparse(url)
        except ValueError as e:
            # urlparse rejects some malformed authorities (e.g. an unbalanced "[").
            self.logger.debug(f"Filtered out unparsable URL: {url} (Reason: {e})")
            return False

        # keep_blank_values=True so "?utm_source=" and "?utm_source" still count
        # as tracking parameters; by default parse_qs drops keys with empty values.
        query_params = parse_qs(parsed_url.query, keep_blank_values=True)
        has_utm_params = any(param.startswith('utm_') for param in query_params)
        is_valid_check = (
            parsed_url.scheme in ('http', 'https') and
            parsed_url.netloc == self.base_domain and
            not self.file_extension_pattern.search(parsed_url.path) and
            not self.user_profile_pattern.search(parsed_url.path) and
            not parsed_url.fragment and
            'https://twitter.com' not in url and
            self.allowed_path_segment in parsed_url.path and
            not has_utm_params
        )
        if not is_valid_check:
            self.logger.debug(f"Filtered out invalid URL: {url} (Reason: does not match base domain '{self.base_domain}', or other filter rules)")
        return is_valid_check

class LinkExtractor:
    """Pulls crawlable links out of an HTML page.

    ``href`` attribute values are found with a regular expression, decoded,
    resolved against the page URL and kept only when the URL filter accepts
    them.
    """

    # href="..." or href='...': the closing quote must match the opening one,
    # so an apostrophe inside a double-quoted value does not cut it short.
    _HREF_PATTERN = re.compile(r'href\s*=\s*(["\'])(.*?)\1', re.IGNORECASE)
    # Character references that end in ";". Only these are decoded, so a raw
    # "&section=2" inside a URL is left untouched.
    _ENTITY_PATTERN = re.compile(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')

    def __init__(self, url_filter: UrlFilter, logger: ILogger):
        """Store the filter that decides which links are kept and the logger."""
        self.url_filter = url_filter
        self.logger = logger

    @classmethod
    def _decode_entities(cls, value: str) -> str:
        """Decode semicolon-terminated character references (e.g. ``&amp;``) in ``value``."""
        return cls._ENTITY_PATTERN.sub(lambda match: html.unescape(match.group(0)), value)

    def extract_links(self, base_url: str, html_content: str) -> list[str]:
        """Return the accepted absolute links of a page, in document order.

        Args:
            base_url: URL of the page, used to resolve relative links.
            html_content: Raw HTML of the page; ``None`` or empty yields ``[]``.

        Returns:
            Each accepted URL once, ordered by first appearance in the page.
        """
        accepted_links = []
        seen_links = set()
        for match in self._HREF_PATTERN.finditer(html_content or ""):
            raw_link = self._decode_entities(match.group(2)).strip()
            try:
                absolute_link = urljoin(base_url, raw_link)
            except ValueError as e:
                # urljoin raises on some malformed hrefs; skip the link, keep the page.
                self.logger.debug(f"Skipping malformed link {raw_link!r} on {base_url}: {e}")
                continue
            if absolute_link in seen_links:
                continue
            seen_links.add(absolute_link)
            if self.url_filter.is_valid(absolute_link):
                accepted_links.append(absolute_link)
        self.logger.debug(f"Extracted {len(accepted_links)} valid links from {base_url}")
        return accepted_links

class WebCrawler:
    """Runs the crawl loop and persists results in batches.

    URLs come from the crawling strategy, are fetched through the HTTP client,
    and every outcome is buffered and written to the database once the buffer
    reaches ``config.save_interval_pages`` rows. ``crawl`` is a generator that
    yields ``(status_message, log_text)`` pairs so a UI can show progress.
    """

    def __init__(self, config: CrawlerConfig, crawling_strategy: CrawlingStrategy, db_manager: DatabaseManager, http_client: HttpClient, url_filter: UrlFilter, link_extractor: LinkExtractor, logger: ILogger):
        """Store the collaborators and pick up the page count already in the database."""
        self.config = config
        self.crawling_strategy = crawling_strategy
        self.db_manager = db_manager
        self.http_client = http_client
        self.url_filter = url_filter
        self.link_extractor = link_extractor
        self.logger = logger
        self.data_buffer = []
        self.pages_crawled = 0
        self._initialize_crawled_count()
        # Page count at the last checkpoint; checkpoints are taken whenever the
        # count crosses a multiple of config.download_interval_pages.
        self._last_checkpoint_pages = self.pages_crawled

    def _initialize_crawled_count(self):
        """Start ``pages_crawled`` from the number of rows already stored."""
        initial_db_rows_count = self.db_manager.count_rows()
        if initial_db_rows_count >= 0:
            self.pages_crawled = initial_db_rows_count
            self.logger.info(f"Initialized pages crawled from DB: {self.pages_crawled}")
        else:
            self.logger.warning("Could not retrieve initial row count from DB. Starting pages_crawled from 0.")
            self.pages_crawled = 0

    def _log_text(self) -> str:
        """Return the log text captured so far, or ``""`` when none is available.

        Looks for a handler exposing ``log_output_stream`` on the wrapped
        logger, newest handler first.
        """
        inner_logger = getattr(self.logger, "_logger", None)
        for handler in reversed(getattr(inner_logger, "handlers", None) or []):
            stream = getattr(handler, "log_output_stream", None)
            if stream is not None:
                return stream.getvalue()
        return ""

    def _process_url(self, url_info: tuple[str, int]) -> list[tuple[str, int]]:
        """Fetch one URL, buffer its outcome and return the follow-up entries.

        Returns:
            ``(link, 0)`` pairs for every accepted link of a 2xx page, a single
            ``(redirect_url, redirects + 1)`` pair for a redirect, and an empty
            list otherwise (including when the redirect limit is reached).
        """
        url, num_redirects = url_info
        if num_redirects >= self.config.max_redirects:
            self.logger.warning(f"Reached max redirects ({self.config.max_redirects}). Skipping: {url}")
            self.data_buffer.append({'URL': url, 'Status_Code': 999, 'Content': "Max redirects reached"})
            return []
        status_code, content, redirect_url, _ = self.http_client.fetch(url)
        is_success = 200 <= status_code < 300
        self.data_buffer.append({'URL': url, 'Status_Code': status_code, 'Content': content if is_success else ""})
        if is_success:
            return [(link, 0) for link in self.link_extractor.extract_links(url, content)]
        if 300 <= status_code < 400 and redirect_url:
            return [(redirect_url, num_redirects + 1)]
        return []

    def _checkpoint_due(self, current_pages_crawled: int) -> bool:
        """Tell whether the page count crossed a checkpoint boundary since the last one."""
        interval = self.config.download_interval_pages
        if interval <= 0 or current_pages_crawled <= 0:
            return False
        # Compare interval "buckets" rather than testing for an exact multiple:
        # saves rarely land exactly on a multiple (resumed crawls, redirects).
        return current_pages_crawled // interval > self._last_checkpoint_pages // interval

    def _save_buffer(self, current_pages_crawled: int = 0):
        """Write the buffered rows and, when due, copy a database checkpoint to /tmp.

        Returns:
            ``(checkpoint_path, label)`` when a checkpoint file was written,
            otherwise ``None``. The buffer is kept when the write fails.
        """
        if not self.data_buffer:
            return None
        df = pd.DataFrame(self.data_buffer)
        if not self.db_manager.write_dataframe(df, chunksize=self.config.save_interval_pages):
            self.logger.error("Failed to save data to DB. Data buffer retained to prevent loss.")
            return None
        self.data_buffer = []

        if not self._checkpoint_due(current_pages_crawled):
            return None
        checkpoint_filename = f"db_data_checkpoint_{current_pages_crawled}.db"
        checkpoint_path = os.path.join("/tmp", checkpoint_filename)
        try:
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            shutil.copyfile(self.config.db_path, checkpoint_path)
        except Exception as e:
            self.logger.error(f"Failed to save database checkpoint {checkpoint_path}: {e}")
            return None
        self._last_checkpoint_pages = current_pages_crawled
        self.logger.info(f"Database checkpoint saved to: {checkpoint_path}")
        return (checkpoint_path, f"Checkpoint DB ({current_pages_crawled} URLs)")

    def crawl(self, gr_progress: gr.Progress = None, download_checkpoint_callback=None):
        """Crawl until the page limit is reached or the frontier is empty.

        Args:
            gr_progress: Optional progress reporter, called as
                ``gr_progress(fraction, description)`` after each counted page.
            download_checkpoint_callback: Optional callable that receives
                ``(checkpoint_path, label)`` whenever a checkpoint is written.

        Yields:
            ``(status_message, log_text)`` pairs describing progress.
        """
        pbar_initial = self.pages_crawled
        pbar_total = max(self.config.max_pages_to_crawl, pbar_initial + 1)
        # The progress reporter is only ever called; no context-manager hooks
        # are invoked on it.
        self.logger.info("Starting crawl...")
        yield "Starting crawl...", self._log_text()
        if self.pages_crawled > 0:
            self.logger.info(f"Resuming crawl. Starting with {self.pages_crawled} pages already in DB.")
            yield f"Resuming crawl from {self.pages_crawled} pages...", self._log_text()

        with tqdm(total=pbar_total, initial=pbar_initial, desc="Crawling progress", unit="page", position=0, leave=True) as pbar:
            while self.pages_crawled < self.config.max_pages_to_crawl and self.crawling_strategy.has_next():
                current_url_data = self.crawling_strategy.get_next()
                current_url_string = current_url_data[0]
                self.logger.debug(f"Attempting to process: {current_url_string} (Queue/Stack size: {self.crawling_strategy.count()})")
                yield f"Processing URL: {current_url_string}", self._log_text()
                new_links_with_redirect_info = self._process_url(current_url_data)
                self.crawling_strategy.add_links(new_links_with_redirect_info)

                # Only first-hop URLs count as pages; redirect hops do not.
                if current_url_data[1] == 0:
                    self.pages_crawled += 1
                    pbar.update(1)
                    if gr_progress:
                        gr_progress(self.pages_crawled / pbar_total, f"Crawled: {self.pages_crawled}/{self.config.max_pages_to_crawl}")

                if len(self.data_buffer) >= self.config.save_interval_pages:
                    self.logger.info("Saving data to database...")
                    yield "Saving data to database...", self._log_text()
                    download_event_info = self._save_buffer(self.pages_crawled)
                    if download_event_info and download_checkpoint_callback:
                        download_checkpoint_callback(download_event_info)
                        self.logger.info(f"Checkpoint DB available: {download_event_info[1]}")
                        yield f"Checkpoint DB available: {download_event_info[1]}", self._log_text()
                    yield "Data saved. Continuing crawl...", self._log_text()

        if self.data_buffer:
            self.logger.info("Crawling finished. Saving remaining data...")
            yield "Final save of remaining data...", self._log_text()
            final_download_event_info = self._save_buffer(self.pages_crawled)
            if final_download_event_info and download_checkpoint_callback:
                download_checkpoint_callback(final_download_event_info)
                self.logger.info(f"Final Checkpoint DB available: {final_download_event_info[1]}")
                yield f"Final Checkpoint DB available: {final_download_event_info[1]}", self._log_text()

        final_status = f"Crawl finished. Total pages processed: {self.pages_crawled}. Remaining queue/stack size: {self.crawling_strategy.count()}."
        self.logger.info(final_status)
        yield final_status, self._log_text()

def run_gradio_crawler_interface(initial_start_url: str, allowed_path_segment: str, crawling_strategy_type: str, db_path_input: str, max_pages_to_crawl: int, gr_progress: gr.Progress = gr.Progress()):
    """Run one crawl on behalf of the Gradio UI, streaming progress as a generator.

    Args:
        initial_start_url: Absolute URL to start from; its netloc becomes the
            base domain handed to the config, URL filter and database manager.
        allowed_path_segment: Path segment forwarded to ``CrawlerConfig`` / ``UrlFilter``.
        crawling_strategy_type: ``'BFS'`` or ``'DFS'``; any other value logs a
            warning and falls back to BFS.
        db_path_input: Path of the SQLite database file.
        max_pages_to_crawl: Page budget. Coerced with ``int()`` because a
            ``gr.Number`` input may deliver a float (or ``None`` when cleared).
        gr_progress: Gradio progress tracker, forwarded to ``WebCrawler.crawl``.
            The default must be a ``gr.Progress()`` instance: Gradio only injects
            a tracker for parameters declared that way, and the click handler
            wires just the five input components.

    Yields:
        ``(status_message, log_text, markdown)`` tuples. ``markdown`` is empty
        until the final yield, which carries the crawl summary and the list of
        downloadable files. On failure a single "Crawl failed..." tuple is
        yielded instead.
    """
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(log_stream, level=logging.DEBUG)

    # --- Validate inputs before building any crawler component ---
    try:
        parsed_initial_url = urlparse(initial_start_url)
        if not parsed_initial_url.netloc:
            raise ValueError("Initial Start URL must be a valid absolute URL (e.g., https://example.com).")
        base_domain = parsed_initial_url.netloc
    except ValueError as e:
        logger.error(f"Invalid Initial Start URL: {e}")
        yield f"Crawl failed: Invalid Initial Start URL. Error: {e}", log_stream.getvalue(), ""
        return
    except Exception as e:
        logger.error(f"An unexpected error occurred parsing Initial Start URL: {e}")
        yield f"Crawl failed: Error parsing Initial Start URL. Error: {e}", log_stream.getvalue(), ""
        return

    try:
        # gr.Number can hand over a float, or None when the field is empty.
        max_pages_to_crawl = int(max_pages_to_crawl)
    except (TypeError, ValueError, OverflowError) as e:
        logger.error(f"Invalid Maximum Pages to Crawl: {e}")
        yield f"Crawl failed: Maximum Pages to Crawl must be a whole number. Error: {e}", log_stream.getvalue(), ""
        return

    config = CrawlerConfig(
        initial_start_url=initial_start_url,
        allowed_path_segment=allowed_path_segment,
        db_path=db_path_input,
        max_pages_to_crawl=max_pages_to_crawl,
        base_domain=base_domain,
    )

    # (file_path, display_name) pairs collected while the crawl runs.
    all_downloadable_files = []

    def download_checkpoint_callback(file_info: tuple[str, str]):
        """Record a checkpoint file reported by the crawler."""
        all_downloadable_files.append(file_info)

    yield "Initializing crawler components...", log_stream.getvalue(), ""

    try:
        # Pass base_domain to DatabaseManager
        db_manager = DatabaseManager(config.db_path, config.table_name, logger, base_domain=config.base_domain)
        db_manager.ensure_table_exists()
        yield "Database and table ensured to exist.", log_stream.getvalue(), ""

        # Seed the visited set from the database so a re-run resumes instead of restarting.
        existing_visited_urls, last_crawled_url = db_manager.get_all_urls()
        visited_manager = InMemoryVisitedUrlManager()
        for url in existing_visited_urls:
            visited_manager.add(url)
        logger.info(f"Loaded {len(existing_visited_urls)} existing URLs into visited manager.")

        # Prefer the last crawled URL (if any) over the configured start URL; depth restarts at 0.
        strategy_initial_url = last_crawled_url if last_crawled_url else config.initial_start_url
        strategy_initial_url_info = (strategy_initial_url, 0)
        logger.info(f"Effective start URL for strategy: {strategy_initial_url_info[0]}")

        if crawling_strategy_type == 'BFS':
            crawling_strategy_instance = BFSCrawlingStrategy(strategy_initial_url_info, visited_manager, logger)
        elif crawling_strategy_type == 'DFS':
            crawling_strategy_instance = DFSCrawlingStrategy(strategy_initial_url_info, visited_manager, logger)
        else:
            logger.warning("Invalid crawling strategy selected. Defaulting to BFS.")
            crawling_strategy_instance = BFSCrawlingStrategy(strategy_initial_url_info, visited_manager, logger)

        yield "Crawling strategy initialized.", log_stream.getvalue(), ""

        http_client = HttpClient(config, logger)
        url_filter = UrlFilter(config.allowed_path_segment, config.base_domain, logger)
        link_extractor = LinkExtractor(url_filter, logger)

        crawler = WebCrawler(
            config=config,
            crawling_strategy=crawling_strategy_instance,
            db_manager=db_manager,
            http_client=http_client,
            url_filter=url_filter,
            link_extractor=link_extractor,
            logger=logger,
        )

        # Relay every crawler update to the UI; the markdown pane stays empty until the end.
        for status_msg, current_logs in crawler.crawl(gr_progress=gr_progress, download_checkpoint_callback=download_checkpoint_callback):
            yield status_msg, current_logs, ""

        final_status = f"Crawl finished. Total pages processed: {crawler.pages_crawled}. Remaining queue/stack size: {crawler.crawling_strategy.count()}."
        logger.info(final_status)

        final_summary_and_downloads_markdown = ""
        try:
            if not os.path.exists(config.db_path):
                final_summary_and_downloads_markdown += f"**Warning:** Database file not found at: `{config.db_path}`. No summary available.\n\n"
            else:
                final_db_entry = (config.db_path, "Final Database (.db)")
                if final_db_entry not in all_downloadable_files:
                    all_downloadable_files.append(final_db_entry)

                # Use the same base_domain as the crawl's manager so the
                # "for this domain" figure below is computed consistently.
                db_manager_summary = DatabaseManager(
                    config.db_path,
                    config.table_name,
                    ConsoleAndGradioLogger(io.StringIO()),
                    base_domain=config.base_domain,
                )
                total_rows = db_manager_summary.count_rows()
                unique_urls, _ = db_manager_summary.get_all_urls()

                final_summary_and_downloads_markdown += "## Final Crawl Summary\n"
                final_summary_and_downloads_markdown += f"Total unique URLs in DB (for this domain): **{len(unique_urls)}**\n"
                final_summary_and_downloads_markdown += f"Total entries in DB (including redirects/errors): **{total_rows}**\n\n"

                # table_name comes from CrawlerConfig, not from user input, so it
                # is interpolated directly (identifiers cannot be bound as parameters).
                with sqlite3.connect(config.db_path) as conn:
                    df_crawled_data = pd.read_sql(f"SELECT * FROM {config.table_name}", conn)

                # Filter DataFrame for CSV download to only include current domain for cleanliness
                df_crawled_data_filtered = df_crawled_data[df_crawled_data['URL'].apply(lambda x: urlparse(x).netloc == config.base_domain)]

                csv_filename = "crawled_data_summary.csv"
                csv_filepath = os.path.join("/tmp", csv_filename)
                df_crawled_data_filtered.to_csv(csv_filepath, index=False, encoding='utf-8')
                csv_entry = (csv_filepath, "All Crawled Data (CSV)")
                if csv_entry not in all_downloadable_files:
                    all_downloadable_files.append(csv_entry)

        except Exception as e:
            # A summary failure must not hide a successful crawl: report it in the markdown pane.
            error_msg = f"An error occurred while preparing summary or download data: {e}"
            logger.exception(error_msg)
            final_summary_and_downloads_markdown += f"**Error:** {error_msg}\n\n"

        final_summary_and_downloads_markdown += "## Download Available Data\n"
        if not all_downloadable_files:
            final_summary_and_downloads_markdown += "No downloadable files available.\n"
        else:
            for file_path, display_name in all_downloadable_files:
                final_summary_and_downloads_markdown += (
                    f"- **{display_name}:** `{file_path}` "
                    f"(Right-click on the path in the logs and select 'Download' to save, or navigate to it in your Google Drive 'Files' section.)\n"
                )

        yield final_status, log_stream.getvalue(), final_summary_and_downloads_markdown

    except Exception as e:
        error_msg = f"An unhandled error occurred during crawler setup or execution: {e}"
        logger.exception(error_msg)
        yield "Crawl failed!", log_stream.getvalue(), ""
    finally:
        # Detach handlers from the shared "CrawlerLogger" logger after the run.
        # NOTE(review): the isinstance guard compares handlers against the logger
        # wrapper class; confirm whether it can ever match a logging.Handler.
        for handler in logging.getLogger("CrawlerLogger").handlers[:]:
            if isinstance(handler, ConsoleAndGradioLogger):
                continue
            logging.getLogger("CrawlerLogger").removeHandler(handler)

# --- Gradio UI: configuration form on the left, actions/status on the right ---

# Build the default configuration once and reuse it for every pre-filled field.
_ui_default_config = CrawlerConfig()

with gr.Blocks() as demo:
    gr.Markdown("# 🕸️ Advanced Configurable Web Crawler")
    gr.Markdown("Configure your crawling parameters below. The crawler follows Breadth-First Search (BFS) or Depth-First Search (DFS) and persists data to an SQLite database.")
    gr.Markdown("---")

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Crawler Configuration")
            initial_url_input = gr.Textbox(
                label="Initial Start URL",
                value=_ui_default_config.initial_start_url,
                placeholder="e.g., https://example.com/",
                info="The **absolute URL** where the crawler will begin. E.g., `https://example.com/`."
            )
            allowed_path_input = gr.Textbox(
                label="Allowed Path Segment",
                value=_ui_default_config.allowed_path_segment,
                placeholder="e.g., /my-website-section/",
                info="Only URLs that contain this path segment **AND** belong to the same domain as the Initial Start URL will be crawled. Ensure it starts and ends with a '/' if it's a directory (e.g., `/blog/`)."
            )
            crawling_strategy_radio = gr.Radio(
                choices=['BFS', 'DFS'],
                label="Crawling Strategy",
                value='BFS',
                info=(
                    "**BFS (Breadth-First Search):** Explores all pages at the current 'depth' level before moving to the next. "
                    "Think of it as expanding outwards in concentric circles. **Good for wide, shallow crawls** "
                    "(e.g., discovering many pages across the entire site quickly). "
                    "\n\n"
                    "**DFS (Depth-First Search):** Explores as far as possible down one 'branch' or path before backtracking. "
                    "Think of it as diving deep into a specific folder or section. **Good for narrow, deep crawls** "
                    "(e.g., exploring all pages within a particular folder or sub-section of a site)."
                )
            )
            db_path_input_field = gr.Textbox(
                label="Database Path",
                value=_ui_default_config.db_path,
                placeholder="e.g., /content/drive/My Drive/my_crawler_data.db",
                info="**REQUIRED for Google Colab:** This must be a path within your **mounted Google Drive** (e.g., `/content/drive/My Drive/your_folder/database.db`). Ensure Google Drive is mounted before running."
            )
            max_pages_input = gr.Number(
                label="Maximum Pages to Crawl",
                value=_ui_default_config.max_pages_to_crawl,
                minimum=1,
                step=100,
                precision=0,  # deliver an integer page budget rather than a float
                info="The total number of unique pages the crawler will attempt to process before stopping. This includes pages fetched successfully, those resulting in errors, or redirects."
            )

        with gr.Column():
            gr.Markdown("## Actions and Status")
            start_button = gr.Button("🚀 Start Crawl", variant="primary")
            gr.Markdown("---")

            status_message_output = gr.Textbox(label="Status Message", interactive=False)
            logs_output = gr.Textbox(label="Crawler Logs", interactive=False, lines=20)
            download_markdown_output = gr.Markdown("---") # Placeholder for download links

    # The handler is a generator: each yielded 3-tuple updates the three outputs
    # below in order. Inputs are passed positionally in the order listed here.
    start_button.click(
        fn=run_gradio_crawler_interface,
        inputs=[
            initial_url_input,
            allowed_path_input,
            crawling_strategy_radio,
            db_path_input_field,
            max_pages_input
        ],
        outputs=[
            status_message_output,
            logs_output,
            download_markdown_output # Output for download links
        ],
        show_progress='full' # Show full progress indicator
    )

# debug=True keeps the cell running so errors and logs stay visible in the notebook.
demo.launch(debug=True)


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Google Drive mounted successfully.




It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://0c3987e30d7aa7cc30.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
DEBUG:CrawlerLogger:Filtered out invalid URL: https://smartclick.agency/about-us/ (Reason: does not match base domain 'smartclick.agency', or other filter rules)
2025-06-12 20:48:03,516 - DEBUG - Filtered out invalid URL: https://smartclick.agency/case-studies/ (Reason: does not match base domain 'smartclick.agency', or other filter rules)
DEBUG:CrawlerLogger:Filtered out invalid URL: https://smartclick.agency/case-studies/ (Reason: does not match base domain 'smartclick.agency', or other filter rules)
2025-06-12 20:48:03,518 - DEBUG - Filtered out invalid URL: https://smartclick.agency/careers/ (Reason: does not match base domain 'smartclick.agency', or other filter rules)
DEBUG:CrawlerLogger:Filtered out invalid URL: https://smartclick.agency/careers/ (Reason: does not match base domain 'smartclick.agency', or other filter rules)
2025-06-12 20:48:03,520 - DEBUG - Filtered out invalid URL: https://smartclick.agency/servi