In [None]:
import os
from bs4 import BeautifulSoup, Comment
from IPython.display import display, HTML , Markdown
from pathlib import Path
import json
from markdownify import markdownify as md
from tqdm import tqdm
import re
import requests
from urllib.parse import urljoin, urlparse

In [None]:
# Directory holding the raw HTML pages, relative to the notebook's working
# directory. Built with os.path.join so the path is not tied to the Windows
# "\\" separator (on Windows the resulting string is unchanged).
RAW_DATA_DIR = os.path.join("data", "resources_raw")

# One Path per "*.html" entry in RAW_DATA_DIR. Sorted so that every run
# processes the files in the same order regardless of os.listdir ordering.
HTML_FILES = sorted(
    Path(RAW_DATA_DIR) / html_name
    for html_name in os.listdir(RAW_DATA_DIR)
    if html_name.endswith(".html")
)

In [None]:


def collect_hash_sources(root_dir):
    """
    Recursively traverse a directory tree, read every ``.json`` file and
    build one dictionary mapping content hashes to their source URLs:

      html_hash      -> url
      external_hash  -> external_url
      file_hash      -> file_download_url

    pdf_hash is IGNORED.

    A pair is recorded only when both the hash and the URL are present and
    truthy. Files that cannot be read or parsed, and files whose top-level
    JSON value is not an object, are reported on stdout and skipped. When
    the same hash occurs more than once, the entry read last wins.

    Args:
        root_dir: Directory to walk (passed to ``os.walk``).

    Returns:
        dict mapping hash value -> URL.
    """
    # (hash key, URL key) pairs harvested from each metadata record, in the
    # order they are written into the result.
    key_pairs = (
        ("html_hash", "url"),
        ("external_hash", "external_url"),
        ("file_hash", "file_download_url"),
    )
    hash_map = {}

    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.lower().endswith(".json"):
                continue

            json_path = os.path.join(dirpath, filename)

            # Best-effort parse: one unreadable file must not stop the walk.
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
            except Exception as e:
                print(f"⚠️ Could not parse {json_path}: {e}")
                continue

            # Valid JSON is not necessarily an object; key lookups on a
            # string, list or number would raise TypeError below.
            if not isinstance(data, dict):
                print(f"⚠️ Skipping {json_path}: top-level JSON value is not an object")
                continue

            for hash_key, url_key in key_pairs:
                hash_value = data.get(hash_key)
                url_value = data.get(url_key)
                if hash_value and url_value:
                    hash_map[hash_value] = url_value

    return hash_map
# Metadata directory, resolved relative to the working directory like the
# other data paths in this notebook rather than through a machine-specific
# absolute path. Assumes the notebook is run from the project root.
METADATA_DIR = os.path.join("data", "resources_metadata")
HASH_TO_URL_MAP = collect_hash_sources(METADATA_DIR)

In [None]:
def clean_html(html: str) -> str:
    """
    Strip non-content markup from an HTML document.

    Removes <script>/<style> elements, comments, the <head> element, a
    fixed set of presentational/event attributes, and childless elements
    that carry no content.

    Childless <img>, <br>, <hr>, <td> and <th> elements are kept: they are
    meaningful while empty (an image, a line or section break, a table cell
    that holds its column position), and deleting them would fuse adjacent
    text or shift table columns.

    Args:
        html: Raw HTML source.

    Returns:
        The cleaned document serialized back to an HTML string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # --- Remove scripts & styles ---
    for tag in soup(["script", "style"]):
        tag.decompose()

    # --- Remove comments ---
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # --- Remove inline styles & noisy attributes ---
    noisy_attrs = ("class", "id", "style", "onclick", "onload", "width", "height")
    for tag in soup.find_all():
        for attr in noisy_attrs:
            if attr in tag.attrs:
                del tag[attr]

    # --- Remove <head> entirely (keeps only content skeleton) ---
    if soup.head:
        soup.head.decompose()

    # --- Remove childless tags, except those meaningful while empty ---
    keep_when_empty = {"img", "br", "hr", "td", "th"}
    for tag in soup.find_all():
        if tag.name in keep_when_empty:
            continue
        # A tag without children has no text either, so checking the
        # children alone is sufficient.
        if not tag.contents:
            tag.decompose()

    return str(soup)


def html_to_markdown(html: str) -> str:
    """
    Turn raw HTML into Markdown.

    The document is first passed through ``clean_html``; the result is
    converted with the ATX heading style, and leading/trailing whitespace
    is trimmed from the returned text.
    """
    return md(clean_html(html), heading_style="ATX").strip()


def convert_directory_to_markdown(directory: str):
    """
    Render every ``.html`` file found directly inside ``directory`` as
    Markdown in the notebook output.

    File names are matched case-insensitively on the ``.html`` suffix. Each
    file is read as UTF-8 (undecodable bytes are dropped), converted with
    ``html_to_markdown`` and shown under a heading carrying the file name,
    with the Markdown text inside a fenced block. Prints a notice and
    returns when the directory holds no HTML files.
    """
    html_names = list(
        filter(lambda name: name.lower().endswith(".html"), os.listdir(directory))
    )

    if len(html_names) == 0:
        print("No HTML files found.")
        return

    for html_name in html_names:
        full_path = os.path.join(directory, html_name)
        with open(full_path, "r", encoding="utf-8", errors="ignore") as handle:
            page_source = handle.read()

        rendered = html_to_markdown(page_source)
        display(Markdown(f"### **{html_name}**\n\n```\n{rendered}\n```"))





In [None]:

def has_local_image_refs(html: str) -> bool:
    """
    Report whether the HTML contains at least one <img> whose ``src`` is a
    local/relative reference rather than an absolute http(s) URL.

    Ignored when deciding:
      * <img> tags with a missing, empty or whitespace-only ``src``;
      * absolute ``http://`` / ``https://`` URLs (scheme matched
        case-insensitively);
      * inline ``data:`` URIs, which embed the image and reference no file.

    Args:
        html: HTML source to inspect.

    Returns:
        True as soon as one local reference is found, otherwise False.
    """
    soup = BeautifulSoup(html, "html.parser")

    for img in soup.find_all("img"):
        # Strip BEFORE the emptiness test so a whitespace-only src is
        # treated as missing instead of being reported as a local file.
        src = (img.get("src") or "").strip()
        if not src:
            continue

        lowered = src.lower()

        # Ignore fully qualified URLs
        if lowered.startswith(("http://", "https://")):
            continue

        # Ignore embedded images: there is no file reference to resolve
        if lowered.startswith("data:"):
            continue

        # Everything else = local file reference
        return True

    return False


# Tally how many of the raw HTML pages reference at least one local image.
count_with_local_imgs = 0

for file in tqdm(HTML_FILES):
    raw = file.read_text(encoding="utf-8", errors="ignore")
    count_with_local_imgs += 1 if has_local_image_refs(raw) else 0

print(f"Out of {len(HTML_FILES)} HTML files, {count_with_local_imgs} have local image references.")

  0%|          | 0/1177 [00:00<?, ?it/s]

100%|██████████| 1177/1177 [00:20<00:00, 56.79it/s]

Out of 1177 HTML files, 142 have local image references.





In [None]:

def _download_image(session, url: str, img_dir: str, idx: int):
    """
    Fetch ``url`` and store it in ``img_dir`` as ``img_<idx><ext>``.

    Returns the Markdown-relative path ``images/<name>`` on success, or
    None when the URL is not http(s), the request fails, the response
    status is not 200, or the file cannot be written.
    """
    parsed = urlparse(url)
    if parsed.scheme.lower() not in ("http", "https"):
        # e.g. data: URIs -- nothing to fetch over the network.
        return None

    # Extension comes from the URL path; fall back to .jpg when it has none.
    ext = os.path.splitext(parsed.path)[1] or ".jpg"
    local_name = f"img_{idx:03d}{ext}"

    try:
        response = session.get(url, timeout=10)
        if response.status_code != 200:
            print(f"⚠️ Image not downloaded (HTTP {response.status_code}): {url}")
            return None
        with open(os.path.join(img_dir, local_name), "wb") as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as exc:
        # Best effort: a single broken image must not abort the page.
        print(f"⚠️ Image not downloaded ({exc}): {url}")
        return None

    return f"images/{local_name}"


def save_markdown_and_images(html_path: str, output_base_dir: str, HASH_TO_URL_MAP: dict):
    """
    Convert one saved HTML page to Markdown, download its images, and write
    everything to ``<output_base_dir>/<html file stem>/``.

    The file stem of ``html_path`` is taken as the page hash and looked up
    in ``HASH_TO_URL_MAP`` to obtain the page's original URL, against which
    relative image ``src`` values are resolved. Each distinct ``src`` is
    downloaded once into ``images/``; the matching <img> tags (and <a> tags
    whose ``href`` is exactly a downloaded ``src``) are pointed at the local
    copy BEFORE the Markdown is generated, so no text substitution is done
    on the Markdown itself. Images that cannot be downloaded keep their
    original ``src``.

    Args:
        html_path: Path of the HTML file; its stem is the page hash.
        output_base_dir: Directory under which the per-page folder is created.
        HASH_TO_URL_MAP: Mapping of page hash -> original page URL.

    Raises:
        ValueError: if ``HASH_TO_URL_MAP`` has no URL for the page hash.
    """
    # -----------------------------
    # Load raw HTML
    # -----------------------------
    with open(html_path, "r", encoding="utf-8", errors="ignore") as f:
        raw_html = f.read()

    # -----------------------------
    # Extract hash from filename (the name *is* the hash)
    # -----------------------------
    filename = os.path.splitext(os.path.basename(html_path))[0]
    html_hash = filename

    # -----------------------------
    # Determine original page URL
    # -----------------------------
    original_url = HASH_TO_URL_MAP.get(html_hash)
    if not original_url:
        raise ValueError(f"❌ No source URL found for hash: {html_hash}")

    # -----------------------------
    # Build output directory
    # -----------------------------
    out_dir = os.path.join(output_base_dir, filename)
    img_dir = os.path.join(out_dir, "images")
    os.makedirs(img_dir, exist_ok=True)

    # -----------------------------
    # Clean HTML with the shared helper, then re-parse the cleaned
    # document so the <img> tags can be edited in place
    # -----------------------------
    soup = BeautifulSoup(clean_html(raw_html), "html.parser")

    # -----------------------------
    # Download images and point the tags at the local copies
    # -----------------------------
    img_map = {}  # stripped src -> local relative path, or None if download failed

    with requests.Session() as session:
        for idx, img in enumerate(soup.find_all("img"), start=1):
            src = (img.get("src") or "").strip()
            if not src:
                continue

            if src not in img_map:
                # Absolute http(s) URLs are used as-is; everything else is
                # resolved against the page's original URL.
                if src.lower().startswith(("http://", "https://")):
                    abs_url = src
                else:
                    abs_url = urljoin(original_url, src)
                img_map[src] = _download_image(session, abs_url, img_dir, idx)

            local_ref = img_map[src]
            if local_ref is not None:
                img["src"] = local_ref

    # Links that point at exactly the same address as a downloaded image
    # (e.g. an image wrapped in a link to itself) follow the local copy too.
    for anchor in soup.find_all("a", href=True):
        local_ref = img_map.get(anchor["href"].strip())
        if local_ref is not None:
            anchor["href"] = local_ref

    # -----------------------------
    # Convert to Markdown
    # -----------------------------
    markdown = md(str(soup), heading_style="ATX")

    # -----------------------------
    # Save final Markdown
    # -----------------------------
    md_path = os.path.join(out_dir, f"{filename}.md")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(markdown)

In [None]:
output_dir = "data/resources_md" # Output directory for markdown files and images

# Pages without a known source URL raise ValueError; record and skip them so
# one unmapped file does not abort the whole (multi-hour) batch.
skipped_files = []
for html_file in tqdm(HTML_FILES):
    try:
        save_markdown_and_images(html_file, output_dir, HASH_TO_URL_MAP)
    except ValueError as exc:
        skipped_files.append(html_file)
        print(exc)

if skipped_files:
    print(f"Skipped {len(skipped_files)} of {len(HTML_FILES)} HTML files (see messages above).")

100%|██████████| 1177/1177 [2:23:30<00:00,  7.32s/it] 
