<a href="https://colab.research.google.com/github/Hafeedh-lab/Non-proprietary-projects/blob/main/DOCX_TO_MD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
#  Optimized Word → Markdown batch converter for Google Colab
#  • Converts every .docx in a chosen Drive folder to .md
#  • Enhanced with error handling, parallel processing, and validation
#  • Uses pandoc under the hood via the pypandoc wrapper
#  • Tested August 2025 on Colab's default Python 3.10 runtime
# ============================================================

import os
import glob
import pypandoc
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
import logging
from typing import List, Tuple, Optional
import hashlib
from datetime import datetime

# ── 1. Install dependencies (quiet = less scrolling) ─────────
print("Installing dependencies...")
!pip install -q "pypandoc>=1.12" python-docx tqdm

# ── Install pandoc binary ────────────────────────────────────
print("Installing pandoc...")
!apt-get update && apt-get install -y pandoc

# ── 2. Mount Google Drive ────────────────────────────────────
from google.colab import drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# ── 3. Setup logging ─────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/content/conversion_log.txt'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# ── 4. Configuration ─────────────────────────────────────────
class Config:
    """User-editable settings for the batch conversion (read by main())."""

    # ── Drive locations: EDIT these paths to suit your Drive structure ──
    # Folder that is scanned for .docx sources.
    INPUT_FOLDER = '/content/drive/MyDrive/DOCX TO MD TEST'
    # Folder that receives the generated .md files.
    OUTPUT_FOLDER = '/content/drive/MyDrive/DOCX TO MD TEST'

    # ── Pandoc ──
    # Target format handed to pandoc: GitHub-flavoured Markdown.
    PANDOC_FORMAT = 'gfm'
    # Extra command-line arguments forwarded to pandoc.
    PANDOC_ARGS = ['--wrap=none', '--extract-media=./media']

    # ── Performance / behaviour switches ──
    # Number of worker threads used for parallel conversion.
    MAX_WORKERS = 4
    # When True, files whose output already exists are skipped.
    SKIP_EXISTING = True
    # When True, each .docx is checked for validity before conversion.
    VALIDATE_DOCX = True

def validate_docx_file(file_path: str) -> bool:
    """Report whether ``file_path`` can be opened as a Word document.

    The file is loaded with ``docx.Document``; any exception raised while
    importing the library or opening the file is logged as a warning and
    reported as ``False``.
    """
    try:
        # Imported lazily, inside the guarded region, so that a failing
        # import is also reported as "invalid" rather than raised.
        from docx import Document
        Document(file_path)
    except Exception as e:
        logger.warning(f"Invalid .docx file {file_path}: {e}")
        return False
    return True

def get_file_hash(file_path: str) -> str:
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is read in 4096-byte blocks so large files are never loaded
    into memory at once. If anything goes wrong while reading, an empty
    string is returned instead of raising.
    """
    digest = hashlib.md5()
    try:
        with open(file_path, "rb") as stream:
            # read() yields b"" at end of file, which ends the loop.
            while block := stream.read(4096):
                digest.update(block)
    except Exception:
        return ""
    return digest.hexdigest()

def should_convert_file(docx_path: str, md_path: str, skip_existing: bool) -> bool:
    """Decide whether ``docx_path`` has to be (re)converted.

    Returns ``True`` when skipping is disabled or when ``md_path`` does not
    exist yet. Otherwise the modification times are compared and ``True`` is
    returned only if the source is strictly newer than the existing output.
    """
    if skip_existing and os.path.exists(md_path):
        # An output file is already there: convert again only when stale.
        source_mtime = os.path.getmtime(docx_path)
        target_mtime = os.path.getmtime(md_path)
        return source_mtime > target_mtime

    return True

def convert_single_file(args: Tuple[str, str, dict]) -> Tuple[bool, str, Optional[str]]:
    """
    Convert a single .docx file to markdown.

    Args:
        args: ``(docx_path, output_folder, config)`` packed into one tuple so
            the function can be submitted to an executor with a single
            argument. ``config`` must contain ``'input_folder'``; the keys
            ``'validate_docx'``, ``'skip_existing'``, ``'pandoc_format'`` and
            ``'pandoc_args'`` are optional and default to ``True``, ``True``,
            ``'gfm'`` and ``['--wrap=none']`` respectively.

    Returns: (success, file_path, error_message)
        * ``(True, docx_path, None)`` after a successful conversion.
        * ``(True, docx_path, "Skipped (already exists and up-to-date)")``
          when the existing output is not older than the source.
        * ``(False, docx_path, message)`` when validation fails or any
          exception is raised; exceptions are never propagated.
    """
    docx_path, output_folder, config = args

    try:
        # Validate input file if requested
        if config.get('validate_docx', True) and not validate_docx_file(docx_path):
            return False, docx_path, "Invalid .docx file"

        # Mirror the source's position below the input folder in the output.
        rel_path = os.path.relpath(docx_path, config['input_folder'])
        basename = os.path.splitext(os.path.basename(rel_path))[0] + '.md'
        subdir = os.path.dirname(rel_path)
        target_dir = os.path.join(output_folder, subdir) if subdir else output_folder

        # Always make sure the destination directory exists — including the
        # output folder itself for files at the top of the input folder — so
        # this function does not depend on the caller having created it.
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        md_path = os.path.join(target_dir, basename)

        # Check if conversion is needed
        if not should_convert_file(docx_path, md_path, config.get('skip_existing', True)):
            return True, docx_path, "Skipped (already exists and up-to-date)"

        # Convert using pypandoc
        # NOTE(review): a relative --extract-media path in pandoc_args is
        # presumably resolved against the process working directory rather
        # than md_path's folder — confirm extracted media ends up where the
        # generated Markdown expects it.
        pypandoc.convert_file(
            source_file=docx_path,
            to=config.get('pandoc_format', 'gfm'),
            outputfile=md_path,
            extra_args=config.get('pandoc_args', ['--wrap=none'])
        )

        return True, docx_path, None

    except Exception as e:
        # Report the failure to the caller instead of raising, so one bad
        # file does not abort a whole batch.
        return False, docx_path, str(e)

def find_docx_files(input_folder: str) -> List[str]:
    """Return the sorted paths of every .docx file below ``input_folder``.

    The folder is searched recursively. Entries whose name starts with
    ``~$`` (temporary files) are left out.

    Raises:
        FileNotFoundError: if ``input_folder`` does not exist.
    """
    root = Path(input_folder)
    if not root.exists():
        raise FileNotFoundError(f"Input folder does not exist: {input_folder}")

    return sorted(
        str(candidate)
        for candidate in root.rglob('*.docx')
        if not candidate.name.startswith('~$')
    )

def main():
    """Run the whole batch: scan, convert in parallel, print a summary.

    Settings are copied from ``Config`` into a plain dict that is handed to
    every worker. The function returns early (after printing a message) when
    the input folder is missing or contains no .docx files.
    """
    config = dict(
        input_folder=Config.INPUT_FOLDER,
        output_folder=Config.OUTPUT_FOLDER,
        max_workers=Config.MAX_WORKERS,
        skip_existing=Config.SKIP_EXISTING,
        validate_docx=Config.VALIDATE_DOCX,
        pandoc_format=Config.PANDOC_FORMAT,
        pandoc_args=Config.PANDOC_ARGS,
    )

    print(f"Starting conversion process at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logger.info(f"Input folder: {config['input_folder']}")
    logger.info(f"Output folder: {config['output_folder']}")

    # The output folder must exist before any worker writes into it.
    os.makedirs(config['output_folder'], exist_ok=True)

    # Collect the work list; a missing input folder ends the run here.
    print("Scanning for .docx files...")
    try:
        docx_files = find_docx_files(config['input_folder'])
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return

    if not docx_files:
        print(f"No .docx files found in {config['input_folder']}")
        return

    print(f"Found {len(docx_files)} .docx file(s)")

    # Outcome counters, keyed by what happened to each file.
    tally = {'converted': 0, 'skipped': 0, 'failed': 0}

    with ThreadPoolExecutor(max_workers=config['max_workers']) as executor:
        # Queue one job per source file, remembering which file it is for.
        pending = {}
        for docx_path in docx_files:
            job = executor.submit(
                convert_single_file,
                (docx_path, config['output_folder'], config),
            )
            pending[job] = docx_path

        # Handle jobs in completion order so the bar advances steadily.
        with tqdm(total=len(docx_files), desc='Converting files') as pbar:
            for job in as_completed(pending):
                success, file_path, error_msg = job.result()
                rel_path = os.path.relpath(file_path, config['input_folder'])

                if not success:
                    tally['failed'] += 1
                    logger.error(f"❌ Failed: {rel_path} - {error_msg}")
                elif error_msg == "Skipped (already exists and up-to-date)":
                    # A skip is reported as a success carrying this marker text.
                    tally['skipped'] += 1
                    logger.info(f"⏭️  Skipped: {rel_path}")
                else:
                    tally['converted'] += 1
                    logger.info(f"✅ Converted: {rel_path}")

                pbar.update(1)

    successful_conversions = tally['converted']
    skipped_conversions = tally['skipped']
    failed_conversions = tally['failed']

    # Summary
    print(f"\n{'='*60}")
    print(f"Conversion Summary:")
    print(f"  ✅ Successful: {successful_conversions}")
    print(f"  ⏭️  Skipped: {skipped_conversions}")
    print(f"  ❌ Failed: {failed_conversions}")
    print(f"  📁 Output folder: {config['output_folder']}")
    print(f"  📋 Log file: /content/conversion_log.txt")
    print(f"{'='*60}")

    if failed_conversions > 0:
        print("⚠️  Some files failed to convert. Check the log file for details.")

# ── 5. Run the conversion ────────────────────────────────────
# Start the batch only when this code is the entry point being executed
# (__name__ is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Installing dependencies...
Installing pandoc...
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,853 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,161 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Ge

Converting files:   0%|          | 0/1 [00:00<?, ?it/s]


Conversion Summary:
  ✅ Successful: 1
  ⏭️  Skipped: 0
  ❌ Failed: 0
  📁 Output folder: /content/drive/MyDrive/DOCX TO MD TEST
  📋 Log file: /content/conversion_log.txt
