# Always pefrorm Runtime -> Restart session

In [1]:
# Installing necessary packages
!pip install pandas pyarrow duckdb gradio requests beautifulsoup4 lxml tqdm
from urllib.parse import urlparse, parse_qs



In [1]:
#
#
# --- Imports and Setup ---
#
#

import gradio as gr
import sqlite3
import pandas as pd
import requests
import time
import logging
import random
import re
import io
import os
import gc
import duckdb
from datetime import datetime
from urllib.parse import urlparse, urljoin, parse_qs
from collections import deque
from abc import ABC, abstractmethod
from tqdm import tqdm
from dataclasses import dataclass, field
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

#
#
# --- Google Colab Drive Mount ---
#
#

try:
    from google.colab import drive
    drive.mount('/content/drive/')
    print("Google Drive mounted successfully.")
except ImportError:
    print("Not running in Google Colab environment. Skipping Google Drive mount.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")

#
#
# --- Configuration ---
#
#

@dataclass
class CrawlerConfig:
    state_db_path: str = "/content/drive/My Drive/master_july_2025/data/crawler_state.db"
    parquet_path: str = "/content/drive/My Drive/master_july_2025/data/crawled_data_parquet/"
    min_request_delay: float = 1.0
    max_request_delay: float = 30.0
    max_pages_to_crawl: int = 700
    save_interval_pages: int = 10
    max_retries_request: int = 3
    max_redirects: int = 2
    request_timeout: int = 15
    allowed_path_segment: str = "/section"
    initial_start_url: str = 'https://example-url.com/section'
    user_agents: list[str] = field(default_factory=lambda: [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    ])
    base_domain: str = ""

#
#
# --- Logging and Core Interfaces ---
#
#
class ILogger(ABC):
    @abstractmethod
    def debug(self, message: str): pass
    @abstractmethod
    def info(self, message: str): pass
    @abstractmethod
    def warning(self, message: str): pass
    @abstractmethod
    def error(self, message: str): pass
    @abstractmethod
    def exception(self, message: str): pass

class GradioLogHandler(logging.Handler):
    def __init__(self, log_output_stream: io.StringIO):
        super().__init__()
        self.log_output_stream = log_output_stream
        self.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

    def emit(self, record):
        log_entry = self.format(record)
        self.log_output_stream.write(log_entry + '\n')
        self.log_output_stream.flush()

class ConsoleAndGradioLogger(ILogger):
    def __init__(self, log_output_stream: io.StringIO, level=logging.INFO):
        self._logger = logging.getLogger("CrawlerLogger")
        self._logger.setLevel(level)
        if self._logger.hasHandlers():
            self._logger.handlers.clear()
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        self._logger.addHandler(console_handler)
        gradio_handler = GradioLogHandler(log_output_stream)
        self._logger.addHandler(gradio_handler)

    def debug(self, message: str): self._logger.debug(message)
    def info(self, message: str): self._logger.info(message)
    def warning(self, message: str): self._logger.warning(message)
    def error(self, message: str): self._logger.error(message)
    def exception(self, message: str): self._logger.exception(message)

class VisitedUrlManager:
    def __init__(self): self.visited = set()
    def add(self, url: str): self.visited.add(url)
    def contains(self, url: str) -> bool: return url in self.visited
    def size(self) -> int: return len(self.visited)

class CrawlingStrategy(ABC):
    def __init__(self, visited_manager: VisitedUrlManager, logger: ILogger):
        self.visited = visited_manager
        self.logger = logger
    @abstractmethod
    def add_links(self, links_info: list[tuple[str, int]]): pass
    @abstractmethod
    def get_next(self) -> tuple[str, int]: pass
    @abstractmethod
    def has_next(self) -> bool: pass
    @abstractmethod
    def prime_with_frontier(self, frontier_urls_info: list[tuple[str, int]]): pass

class BFSCrawlingStrategy(CrawlingStrategy):
    def __init__(self, visited_manager: VisitedUrlManager, logger: ILogger):
        super().__init__(visited_manager, logger)
        self.queue = deque()
    def add_links(self, links_info: list[tuple[str, int]]):
        new_links = [link_info for link_info in links_info if not self.visited.contains(link_info[0])]
        for link_url, _ in new_links: self.visited.add(link_url)
        self.queue.extend(new_links)
    def get_next(self) -> tuple[str, int]: return self.queue.popleft()
    def has_next(self) -> bool: return len(self.queue) > 0
    def prime_with_frontier(self, frontier_urls_info: list[tuple[str, int]]): self.queue.extend(frontier_urls_info)

class DFSCrawlingStrategy(CrawlingStrategy):
    def __init__(self, visited_manager: VisitedUrlManager, logger: ILogger):
        super().__init__(visited_manager, logger)
        self.stack = []
    def add_links(self, links_info: list[tuple[str, int]]):
        new_links = [link_info for link_info in links_info if not self.visited.contains(link_info[0])]
        for link_url, _ in new_links: self.visited.add(link_url)
        self.stack.extend(new_links)
    def get_next(self) -> tuple[str, int]: return self.stack.pop()
    def has_next(self) -> bool: return len(self.stack) > 0
    def prime_with_frontier(self, frontier_urls_info: list[tuple[str, int]]): self.stack.extend(frontier_urls_info)


class StateManager:
    def __init__(self, db_path: str, logger: ILogger):
        self.db_path = db_path
        self.logger = logger
        db_dir = os.path.dirname(self.db_path)
        if db_dir: os.makedirs(db_dir, exist_ok=True)
        self.ensure_frontier_table_exists()

    def _execute_query(self, query: str, params=None, fetch=False):
        try:
            with sqlite3.connect(self.db_path) as conn:
                cursor = conn.cursor()
                cursor.execute(query, params or [])
                if fetch: return cursor.fetchall()
                conn.commit()
        except sqlite3.Error as e:
            self.logger.error(f"StateManager DB error: {e}")
            return None

    def ensure_frontier_table_exists(self):
        self._execute_query("CREATE TABLE IF NOT EXISTS crawl_frontier (URL TEXT UNIQUE, Redirects INTEGER)")
        self.logger.info("Crawl frontier table ensured to exist.")

    def save_frontier(self, frontier_urls_info: list[tuple[str, int]]):
        self._execute_query("DELETE FROM crawl_frontier")
        if not frontier_urls_info: return
        try:
            with sqlite3.connect(self.db_path) as conn:
                conn.executemany("INSERT OR IGNORE INTO crawl_frontier (URL, Redirects) VALUES (?, ?)", frontier_urls_info)
                conn.commit()
            self.logger.info(f"Saved {len(frontier_urls_info)} URLs to frontier.")
        except sqlite3.Error as e:
            self.logger.error(f"Error saving frontier: {e}")

    def load_frontier(self) -> list[tuple[str, int]]:
        result = self._execute_query("SELECT URL, Redirects FROM crawl_frontier", fetch=True)
        return result or []

class HttpClient:
    def __init__(self, config: CrawlerConfig, logger: ILogger):
        self.config = config
        self.logger = logger
        self.session = self._create_requests_session()
        self.current_delay = self.config.min_request_delay

    def _create_requests_session(self):
        retry = Retry(total=self.config.max_retries_request, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retry)
        session = requests.Session()
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        return session

    def fetch(self, url: str) -> tuple[int, str, str | None]:
        headers = {'User-Agent': random.choice(self.config.user_agents)}
        try:
            time.sleep(self.current_delay)
            response = self.session.get(url, headers=headers, timeout=self.config.request_timeout, allow_redirects=False)
            if 300 <= response.status_code < 400:
                redirect_url = response.headers.get('Location')
                return response.status_code, "", urljoin(url, redirect_url)
            return response.status_code, response.text, None
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request error for {url}: {e}")
            return -2, f"Request Error: {e}", None

class UrlFilter:
    def __init__(self, allowed_path_segment: str, base_domain: str):
        self.allowed_path_segment = allowed_path_segment
        self.base_domain = base_domain
        self.file_extension_pattern = re.compile(r'\.(jpg|jpeg|png|gif|pdf|doc|xls|zip|rar|mp3|mp4)$', re.I)

    def is_valid(self, url: str) -> bool:
        try:
            parsed_url = urlparse(url)
            return (parsed_url.scheme in ('http', 'https') and
                    parsed_url.netloc == self.base_domain and
                    self.allowed_path_segment in parsed_url.path and
                    not self.file_extension_pattern.search(parsed_url.path))
        except Exception:
            return False

class LinkExtractor:
    def __init__(self, url_filter: UrlFilter):
        self.url_filter = url_filter

    def extract_links(self, base_url: str, html_content: str) -> list[str]:
        links = set()
        soup = BeautifulSoup(html_content, 'lxml')
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            absolute_link = urljoin(base_url, href)
            parsed_link = urlparse(absolute_link)
            normalized_link = parsed_link._replace(fragment="").geturl()
            if self.url_filter.is_valid(normalized_link):
                links.add(normalized_link)
        return list(links)

class WebCrawler:
    def __init__(self, config: CrawlerConfig, crawling_strategy: CrawlingStrategy, state_manager: StateManager,
                 http_client: HttpClient, url_filter: UrlFilter, link_extractor: LinkExtractor, logger: ILogger):
        self.config = config
        self.crawling_strategy = crawling_strategy
        self.state_manager = state_manager
        self.http_client = http_client
        self.url_filter = url_filter
        self.link_extractor = link_extractor
        self.logger = logger
        self.data_buffer = []
        self.pages_crawled_session = 0

    def _process_url(self, url_info: tuple[str, int]):
        url, num_redirects = url_info
        if num_redirects >= self.config.max_redirects:
            self.data_buffer.append({'URL': url, 'Status_Code': 999, 'Content': "Max redirects reached"})
            return

        status, content, redirect_url = self.http_client.fetch(url)
        self.logger.info(f"Fetched {url} [{status}]")
        self.data_buffer.append({'URL': url, 'Status_Code': status, 'Content': content if 200 <= status < 300 else ""})

        if 200 <= status < 300:
            extracted = self.link_extractor.extract_links(url, content)
            self.crawling_strategy.add_links([(link, 0) for link in extracted])
            del content
        elif redirect_url and self.url_filter.is_valid(redirect_url):
            self.crawling_strategy.add_links([(redirect_url, num_redirects + 1)])

    def _save_buffer_to_parquet(self) -> str | None:
        if not self.data_buffer:
            return None

        num_records = len(self.data_buffer)
        df = pd.DataFrame(self.data_buffer)
        today = datetime.now().date()
        df['crawl_date'] = today

        try:
            partition_path = os.path.join(self.config.parquet_path, f'crawl_date={today}')
            df.to_parquet(path=self.config.parquet_path, engine='pyarrow', compression='snappy', partition_cols=['crawl_date'])
            log_message = f"✅ Saved a batch of **{num_records}** pages to partition `{partition_path}`"
            self.logger.info(log_message)
            self.data_buffer = []
            gc.collect()
            return log_message
        except Exception as e:
            self.logger.error(f"Failed to save to Parquet: {e}")
            return None

    def crawl(self):
        pbar = tqdm(total=self.config.max_pages_to_crawl, desc="Crawling Progress")
        while self.pages_crawled_session < self.config.max_pages_to_crawl:
            if not self.crawling_strategy.has_next():
                self.logger.info("Frontier is empty. Stopping crawl.")
                break

            url_data = self.crawling_strategy.get_next()
            self._process_url(url_data)
            self.pages_crawled_session += 1
            pbar.update(1)

            save_event_msg = None
            if len(self.data_buffer) >= self.config.save_interval_pages:
                save_event_msg = self._save_buffer_to_parquet()
                if hasattr(self.crawling_strategy, 'queue'):
                    self.state_manager.save_frontier(list(self.crawling_strategy.queue))
                elif hasattr(self.crawling_strategy, 'stack'):
                    self.state_manager.save_frontier(list(self.crawling_strategy.stack))

            # THE FIX: Yield a single dictionary, which is robust.
            yield {
                "status": f"Crawled {self.pages_crawled_session}/{self.config.max_pages_to_crawl} pages.",
                "save_event": save_event_msg
            }

        pbar.close()
        self.logger.info("Crawl finished. Performing final save...")
        final_save_msg = self._save_buffer_to_parquet()
        if hasattr(self.crawling_strategy, 'queue'):
            self.state_manager.save_frontier(list(self.crawling_strategy.queue))
        elif hasattr(self.crawling_strategy, 'stack'):
            self.state_manager.save_frontier(list(self.crawling_strategy.stack))

        # THE FIX: Also yield a dictionary here.
        yield {
            "status": f"Crawl finished. Processed {self.pages_crawled_session} pages.",
            "save_event": final_save_msg
        }


def run_gradio_crawler_interface(initial_start_url: str, allowed_path_segment: str, crawling_strategy_type: str,
                                 state_db_path_input: str, parquet_path_input: str, max_pages_to_crawl: int):
    log_stream = io.StringIO()
    logger = ConsoleAndGradioLogger(log_stream)

    try:
        base_domain = urlparse(initial_start_url).netloc
        if not base_domain: raise ValueError("Invalid Initial Start URL.")

        config = CrawlerConfig(
            initial_start_url=initial_start_url, allowed_path_segment=allowed_path_segment,
            state_db_path=state_db_path_input, parquet_path=parquet_path_input,
            max_pages_to_crawl=max_pages_to_crawl, base_domain=base_domain
        )
        os.makedirs(config.parquet_path, exist_ok=True)
        yield "Initializing...", log_stream.getvalue(), "### Save Events Log\n\n- Waiting for first save event...", ""

        state_manager = StateManager(config.state_db_path, logger)
        visited_manager = VisitedUrlManager()

        logger.info("Rebuilding visited set from existing Parquet data...")
        try:
            parquet_glob_path = os.path.join(config.parquet_path, '**', '*.parquet')
            visited_urls_df = duckdb.query(f"SELECT DISTINCT URL FROM read_parquet('{parquet_glob_path}')").to_df()
            for url in visited_urls_df['URL']:
                visited_manager.add(url)
            logger.info(f"Rebuilt visited set with {visited_manager.size()} URLs.")
        except Exception as e:
            logger.warning(f"Could not rebuild visited set from Parquet (may be a new crawl): {e}")

        strategy_class = BFSCrawlingStrategy if crawling_strategy_type == 'BFS' else DFSCrawlingStrategy
        crawling_strategy = strategy_class(visited_manager, logger)

        loaded_frontier = state_manager.load_frontier()
        unvisited_frontier = [info for info in loaded_frontier if not visited_manager.contains(info[0])]

        if unvisited_frontier:
            crawling_strategy.prime_with_frontier(unvisited_frontier)
        elif not visited_manager.contains(config.initial_start_url):
             crawling_strategy.add_links([(config.initial_start_url, 0)])

        http_client = HttpClient(config, logger)
        url_filter = UrlFilter(config.allowed_path_segment, config.base_domain)
        link_extractor = LinkExtractor(url_filter)
        crawler = WebCrawler(config, crawling_strategy, state_manager, http_client, url_filter, link_extractor, logger)

        final_status = ""
        save_events_log = ["### Save Events Log"]

        # THE FIX: Loop receives a single 'event' dictionary.
        for event in crawler.crawl():
            # Safely get values from the dictionary.
            status_msg = event.get("status")
            save_event = event.get("save_event")

            final_status = status_msg
            if save_event:
                save_events_log.append(f"- {save_event}")
            yield status_msg, log_stream.getvalue(), "\n".join(save_events_log), ""

        logger.info("Generating final summary from Parquet data...")
        final_save_events = "\n".join(save_events_log)
        summary_md = f"## Crawl Session Finished\n\n- **Status**: {final_status}\n- **Crawled Data Location**: `{config.parquet_path}`"
        try:
            parquet_glob_path = os.path.join(config.parquet_path, '**', '*.parquet')
            summary_df = duckdb.query(f"SELECT CASE WHEN Status_Code >= 200 AND Status_Code < 300 THEN 'Success (Content Saved)' WHEN Status_Code >= 300 AND Status_Code < 400 THEN 'Redirect' ELSE 'Error / Other' END AS Category, COUNT(*) as Total FROM read_parquet('{parquet_glob_path}') GROUP BY Category ORDER BY Total DESC").to_df()
            total_urls = summary_df['Total'].sum()
            summary_md += f"\n- **Total URLs in Parquet Dataset**: {total_urls}\n\n### Crawl Summary by Category\n\n"
            summary_md += summary_df.to_markdown(index=False)
        except Exception as e:
            logger.error(f"Could not generate summary from Parquet: {e}")
            summary_md += "\n\n**Could not generate summary from Parquet data.**"

        yield final_status, log_stream.getvalue(), final_save_events, summary_md

    except Exception as e:
        logger.exception(f"A critical error occurred: {e}")
        yield "Crawl Failed!", log_stream.getvalue(), "", f"**Error:** {e}"

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🕸️ Memory-Optimized Parquet Web Crawler")
    gr.Markdown("This crawler saves data to a partitioned Parquet dataset and uses SQLite only to manage the crawl state.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Crawler Configuration")
            initial_url_input = gr.Textbox(label="Initial Start URL", value=CrawlerConfig.initial_start_url)
            allowed_path_input = gr.Textbox(label="Allowed Path Segment", value=CrawlerConfig.allowed_path_segment)
            crawling_strategy_radio = gr.Radio(choices=['BFS', 'DFS'], label="Crawling Strategy", value='BFS')
            max_pages_input = gr.Number(label="Maximum Pages to Crawl (per session)", value=CrawlerConfig.max_pages_to_crawl, minimum=1, step=100)
            gr.Markdown("### Storage Paths")
            state_db_path_input = gr.Textbox(label="Crawl State DB Path (SQLite)", value=CrawlerConfig.state_db_path)
            parquet_path_input = gr.Textbox(label="Crawled Data Path (Parquet)", value=CrawlerConfig.parquet_path)
        with gr.Column(scale=2):
            gr.Markdown("## Actions and Status")
            start_button = gr.Button("🚀 Start Crawl", variant="primary")
            status_message_output = gr.Textbox(label="Status Message", interactive=False)
            logs_output = gr.Textbox(label="Crawler Logs", interactive=False, lines=15, max_lines=20)
            with gr.Row():
                save_events_output = gr.Markdown("### Save Events Log")
                summary_output = gr.Markdown("---")


    start_button.click(
        fn=run_gradio_crawler_interface,
        inputs=[initial_url_input, allowed_path_input, crawling_strategy_radio, state_db_path_input, parquet_path_input, max_pages_input],
        outputs=[status_message_output, logs_output, save_events_output, summary_output]
    )

demo.launch(debug=True)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Google Drive mounted successfully.
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ad411b623d7ce67ed2.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


2025-06-15 11:49:11,951 - INFO - Crawl frontier table ensured to exist.
INFO:CrawlerLogger:Crawl frontier table ensured to exist.
2025-06-15 11:49:11,953 - INFO - Rebuilding visited set from existing Parquet data...
INFO:CrawlerLogger:Rebuilding visited set from existing Parquet data...
2025-06-15 11:49:12,003 - INFO - Rebuilt visited set with 20 URLs.
INFO:CrawlerLogger:Rebuilt visited set with 20 URLs.
Crawling Progress:   0%|          | 0/700 [00:00<?, ?it/s]2025-06-15 11:49:14,187 - INFO - Fetched https://adevait.com/blog/remote-work/managing-remote-teams-the-key-to-employee-engagement [200]
INFO:CrawlerLogger:Fetched https://adevait.com/blog/remote-work/managing-remote-teams-the-key-to-employee-engagement [200]
Crawling Progress:   0%|          | 1/700 [00:02<25:39,  2.20s/it]2025-06-15 11:49:16,851 - INFO - Fetched https://adevait.com/blog/startups [200]
INFO:CrawlerLogger:Fetched https://adevait.com/blog/startups [200]
Crawling Progress:   0%|          | 2/700 [00:04<28:45,  2.4

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://ad411b623d7ce67ed2.gradio.live


KeyboardInterrupt: 