In [2]:
import os
import re
import glob
from tqdm import tqdm

def clean_markdown_folder(directory: str) -> None:
    """Strip inline Base64 image payloads from Markdown files under *directory*.

    Every file matching ``**/*.md`` (searched recursively) is read as UTF-8,
    cleaned, and rewritten in place only if the cleaning changed its text.
    The cleaning steps, in order:

    1. A ``(data:image/...;base64,<payload>)`` link target becomes ``()``, so
       the surrounding Markdown link/image structure is kept.
    2. HTML ``<img>`` tags whose ``src`` is a Base64 data URI are removed.
    3. Image tags left with neither alt text nor target (``![]()``) are
       removed.
    4. Runs of blank or whitespace-only lines collapse to one blank line.

    Step 4 applies to the whole file, so a file without any Base64 data is
    still rewritten when it contains excess blank lines. Rewritten files have
    leading and trailing whitespace stripped.

    Args:
        directory: Root folder to scan for Markdown files.
    """
    # Shared data-URI fragment. The MIME subtype and any optional parameters
    # (e.g. ";charset=utf-8") are limited to characters that cannot appear in
    # surrounding prose or markup delimiters, so a match can never run across
    # unrelated text on its way to ";base64,". The payload must be at least
    # 50 characters so short look-alike strings are left alone.
    token = r'[^;,()\s"\'<>]+'
    data_uri = (
        r'data:image/' + token + r'(?:;' + token + r')*'
        r';base64,[A-Za-z0-9+/=]{50,}'
    )

    # The "data:" scheme and the "base64" marker are case-insensitive, so both
    # patterns ignore case (the payload character class is unaffected).
    markdown_target = re.compile(r'\(' + data_uri + r'\)', re.IGNORECASE)

    # Standalone HTML <img> tags; the src value may use either quote style,
    # and the closing quote must match the opening one.
    html_img = re.compile(
        r'<img[^>]+src\s*=\s*(["\'])' + data_uri + r'\1[^>]*>',
        re.IGNORECASE,
    )

    # Image tags with no alt text and no target, i.e. the literal "![]()".
    empty_image = re.compile(r'!\[\]\(\)')

    # A newline, any whitespace (including further newlines), then a newline.
    blank_run = re.compile(r'\n\s*\n')

    markdown_files = glob.glob(os.path.join(directory, '**/*.md'), recursive=True)

    print(f"Found {len(markdown_files)} files in {directory}. Starting surgical cleaning...")

    for filepath in tqdm(markdown_files):
        with open(filepath, 'r', encoding='utf-8') as f:
            original = f.read()

        # 1. Blank out the Base64 target but keep the link structure:
        #    [ ![Alt](data:image/...) ](url) -> [ ![Alt]() ](url)
        cleaned = markdown_target.sub('()', original)

        # 2. Drop whole HTML image tags that embed Base64 data.
        cleaned = html_img.sub('', cleaned)

        # 3. Drop image tags that now carry no information at all.
        cleaned = empty_image.sub('', cleaned)

        # 4. Collapse the blank space the removals leave behind.
        cleaned = blank_run.sub('\n\n', cleaned)

        # Only touch files whose text actually changed.
        if cleaned != original:
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(cleaned.strip())

    print("\nCleaning complete. Base64 elements removed, links preserved.")

# Usage - point this at the folder to clean. Matching files are rewritten in
# place, so the guard keeps a plain import of this file from modifying anything.
if __name__ == "__main__":
    clean_markdown_folder("markdown_hr")

Found 1095 files in markdown_hr. Starting surgical cleaning...


100%|██████████| 1095/1095 [00:00<00:00, 4082.85it/s]


Cleaning complete. Base64 elements removed, links preserved.



