# Docu_Manim_Scrap.py

**26 minutes**

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from weasyprint import HTML
from urllib.parse import urljoin
from tqdm import tqdm
import time
import json

# Detect if the environment is Google Colab or local
try:
    from google.colab import drive
    COLAB_ENV = True
except ImportError:
    COLAB_ENV = False

# Base URL of the Manim documentation
BASE_URL = "https://docs.manim.community/en/stable/reference.html"

# Set output directory based on environment
if COLAB_ENV:
    OUTPUT_DIR = "/content/manim_docs"
else:
    try:
        _script_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ is not defined when this code runs in a notebook cell or
        # an interactive session; use the current working directory instead.
        _script_dir = os.getcwd()
    OUTPUT_DIR = os.path.join(_script_dir, "manim_docs")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# File to track downloaded links
PROGRESS_FILE = os.path.join(OUTPUT_DIR, "progress.json")

# Maximum depth for scraping
MAX_DEPTH = 1

# Load or initialize progress tracking
def load_progress():
    """
    Loads the progress tracker from PROGRESS_FILE.

    Returns:
        dict: The stored tracker, always containing the "downloaded" and
        "visited" keys. A fresh tracker with empty lists is returned when the
        file does not exist or cannot be parsed as JSON.
    """
    progress = {"downloaded": [], "visited": []}
    try:
        with open(PROGRESS_FILE, "r", encoding="utf-8") as file:
            stored = json.load(file)
    except FileNotFoundError:
        return progress
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; a damaged file must not abort the whole run.
        print(f"Ignoring unreadable progress file {PROGRESS_FILE}: {e}")
        return progress

    # Merge onto the defaults so files missing a key still yield both keys.
    if isinstance(stored, dict):
        progress.update(stored)
    return progress

def save_progress(progress):
    """
    Writes the progress tracker to PROGRESS_FILE as JSON.

    The data is written to a temporary sibling file first and then moved over
    PROGRESS_FILE, so an interruption during the write does not leave a
    truncated progress file behind.

    Args:
        progress (dict): The progress tracker dictionary.
    """
    temp_path = f"{PROGRESS_FILE}.tmp"
    with open(temp_path, "w", encoding="utf-8") as file:
        json.dump(progress, file)
    os.replace(temp_path, PROGRESS_FILE)

def get_links(base_url, current_depth):
    """
    Collects all unique documentation links from the given page.

    Fragments (``#anchor``) are stripped before de-duplication so that several
    anchors pointing into the same page count as a single link.

    Args:
        base_url (str): The URL of the page to scan for links.
        current_depth (int): Current depth of scraping.

    Returns:
        list: A sorted list of unique absolute URLs found on the page, or an
        empty list when current_depth exceeds MAX_DEPTH.

    Raises:
        Exception: If the page does not answer with HTTP status 200.
    """
    from urllib.parse import urldefrag

    if current_depth > MAX_DEPTH:
        return []

    # A timeout keeps a stalled connection from hanging the run forever.
    response = requests.get(base_url, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch {base_url}: {response.status_code}")

    docs_prefix = "https://docs.manim.community/en/stable/"
    soup = BeautifulSoup(response.text, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        url, _fragment = urldefrag(urljoin(base_url, a_tag['href']))
        # Prefix match: the URL must live under the docs root, not merely
        # mention it somewhere (e.g. inside a query string).
        if url.startswith(docs_prefix):
            links.add(url)
    return sorted(links)  # Deterministic order, duplicates removed

def make_links_absolute(html_content, base_url):
    """
    Converts all relative links in the HTML content to absolute URLs.

    Every tag carrying an ``href`` or ``src`` attribute is rewritten, which
    covers anchors and images as well as stylesheet ``<link>`` tags and other
    embedded resources that would otherwise break once the page is saved to a
    local file.

    Args:
        html_content (str): The HTML content as a string.
        base_url (str): The base URL to resolve relative links.

    Returns:
        str: The updated HTML content with absolute links.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for attribute in ('href', 'src'):
        for tag in soup.find_all(attrs={attribute: True}):
            tag[attribute] = urljoin(base_url, tag[attribute])
    return str(soup)

def _url_to_filename(url):
    """Builds a filesystem-safe ``.html`` file name from the path of a URL."""
    import re
    from urllib.parse import urlsplit

    # urlsplit keeps the fragment and query out of the path component.
    path = urlsplit(url).path
    if not path or path.endswith("/"):
        path += "index.html"
    # Keep the directory components in the name so pages that share the same
    # last segment (e.g. several "index.html") do not overwrite each other.
    name = re.sub(r"[^A-Za-z0-9._-]+", "_", path.strip("/"))
    if not name.endswith(".html"):
        name += ".html"
    return name


def download_page(url, output_dir, progress):
    """
    Downloads the content of a given URL and saves it as an HTML file.

    Args:
        url (str): The URL to download.
        output_dir (str): The directory to save the HTML file.
        progress (dict): The progress tracker dictionary; the URL is appended
            to its "downloaded" list and the tracker is saved on success.

    Returns:
        str: The file path of the saved HTML file, or None if the URL was
        already downloaded or the download fails.
    """
    try:
        # Skip if already downloaded
        if url in progress["downloaded"]:
            return None

        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to download {url}: {response.status_code}")
            return None

        # Convert links to absolute for proper functionality
        html_content = make_links_absolute(response.text, url)
        file_path = os.path.join(output_dir, _url_to_filename(url))
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(html_content)
        progress["downloaded"].append(url)
        save_progress(progress)
        return file_path
    except Exception as e:
        # Best effort: one failing page must not stop the remaining downloads.
        print(f"Error downloading {url}: {e}")
    return None

def html_to_pdf(html_files, output_pdf):
    """
    Renders each HTML file and merges the results into one PDF.

    Files that fail to render are reported and skipped. When nothing could be
    rendered, no PDF is written.

    Args:
        html_files (list): List of file paths to HTML files.
        output_pdf (str): The path to the output PDF file.
    """
    rendered_documents = []
    for source_path in tqdm(html_files, desc="Rendering HTML to PDF"):
        try:
            document = HTML(source_path).render()
        except Exception as e:
            print(f"Error rendering {source_path}: {e}")
        else:
            rendered_documents.append(document)

    if not rendered_documents:
        print("No PDF pages were generated.")
        return

    # The first rendered document collects the pages of all the others.
    merged_document, *remaining_documents = rendered_documents
    for document in remaining_documents:
        merged_document.pages.extend(document.pages)
    merged_document.write_pdf(output_pdf)
    print(f"Final PDF generated: {output_pdf}")

def main():
    """
    Main function to scrape the Manim documentation and export it as a PDF.

    Links are collected on every run; only the download step consults the
    stored progress, so an interrupted run resumes with the pages that are
    still missing instead of finding zero links.
    """
    try:
        # Step 1: Collect all relevant links
        print("Collecting links from the documentation...")
        progress = load_progress()
        progress.setdefault("visited", [])
        progress.setdefault("downloaded", [])

        # Pages crawled during this run. The persisted "visited" list is still
        # recorded, but it must not suppress crawling on a resumed run.
        crawled = set()
        links_to_visit = [BASE_URL]
        all_links = set()

        for depth in range(1, MAX_DEPTH + 1):
            new_links = []
            for link in links_to_visit:
                if link in crawled:
                    continue
                crawled.add(link)
                if link not in progress["visited"]:
                    progress["visited"].append(link)
                    save_progress(progress)
                new_links.extend(get_links(link, depth))
            all_links.update(new_links)
            links_to_visit = new_links

        print(f"Found {len(all_links)} links within depth {MAX_DEPTH}.")

        # Step 2: Download all pages with a progress bar
        print("Downloading pages...")
        for link in tqdm(sorted(all_links), desc="Downloading HTML pages"):
            if link in progress["downloaded"]:
                # Nothing is requested for this page, so no delay is needed.
                continue
            download_page(link, OUTPUT_DIR, progress)
            time.sleep(0.5)  # Adjusted limit

        # Step 3: Convert HTML files to a single PDF
        # Every HTML file in the output directory is included (also those from
        # earlier runs); sorting keeps the page order stable between runs.
        print("Generating PDF...")
        pdf_path = os.path.join(OUTPUT_DIR, "manim_docs_complete.pdf")
        html_paths = [
            os.path.join(OUTPUT_DIR, f)
            for f in sorted(os.listdir(OUTPUT_DIR))
            if f.endswith('.html')
        ]
        html_to_pdf(html_paths, pdf_path)

        # Step 4: Handle output for Colab
        if COLAB_ENV:
            from google.colab import files
            print("Downloading PDF to local machine...")
            files.download(pdf_path)

    except Exception as e:
        # Top-level boundary: report the failure instead of a raw traceback.
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Report which runtime was detected, then start the scrape.
    print(
        "Running in Google Colab environment."
        if COLAB_ENV
        else "Running in local environment."
    )
    main()


# Create_Project_Estructure.py

In [11]:
import os

def create_project_structure(root_dir):
    """
    Creates the project folder skeleton and its placeholder files.

    The function is safe to call repeatedly: directories that already exist
    are kept, and files that already exist are left untouched so that content
    added after the first run is not overwritten.

    Args:
        root_dir (str): Path of the project root; created if missing.
    """
    # Top-level folders mapped to the subfolders created inside each of them.
    structure = {
        "assets": ["audio", "images", "videos"],
        "docs": [],
        "exports": ["reels", "shorts", "horizontal"],
        "presets": [],
        "scenes": ["examples", "storytelling", "educational", "transitions"],
        "scripts": [],
        "templates": [],
        "tests": [],
        "notebooks": [],  # Notebooks for development and exploration
    }

    # Folder name -> files created in it; "root" stands for root_dir itself.
    files = {
        "docs": ["README.md", "SETUP.md", "USAGE.md", "ROADMAP.md"],
        "presets": ["color_schemes.py", "transitions.py", "effects.py", "typography.py"],
        "scripts": ["batch_render.py", "audio_sync.py", "video_export.py", "util.py"],
        "templates": ["base_scene.py", "audio_scene.py"],
        "tests": ["test_presets.py", "test_scenes.py", "test_utils.py"],
        "notebooks": ["README.md"],  # README for the notebooks folder
        "root": [".gitignore", "config.py", "requirements.txt", "run.py"],
    }

    # Create root directory
    os.makedirs(root_dir, exist_ok=True)

    # Create folders and subfolders
    for folder, subfolders in structure.items():
        folder_path = os.path.join(root_dir, folder)
        os.makedirs(folder_path, exist_ok=True)

        for subfolder in subfolders:
            os.makedirs(os.path.join(folder_path, subfolder), exist_ok=True)

    # Create files in specific folders
    for folder, filenames in files.items():
        target_dir = root_dir if folder == "root" else os.path.join(root_dir, folder)

        for filename in filenames:
            if filename.endswith(".md"):
                content = f"# {filename.split('.')[0]}\n"  # Basic header for Markdown files
            elif filename == ".gitignore":
                content = "# Ignore files\n*.pyc\n__pycache__/\n"
            else:
                content = "# Placeholder content\n"

            file_path = os.path.join(target_dir, filename)
            try:
                # Mode "x" refuses to open an existing file, so a re-run
                # never truncates something the user has already edited.
                with open(file_path, "x", encoding="utf-8") as f:
                    f.write(content)
            except FileExistsError:
                continue

    print(f"Project structure created at: {root_dir}")

def create_directory_tree_txt(path, output_file=None):
    """
    Writes a text rendering of the directory tree below ``path`` to a file.

    Entries are listed in sorted order, one per line, using box-drawing
    connectors. Entries whose name matches a known project folder get a short
    description appended.

    Args:
        path (str): Directory whose contents are rendered.
        output_file (str, optional): Destination file. Defaults to
            ``directory_tree.txt`` inside ``path``.
    """
    # Descriptions appended to entries with a matching name.
    comments = {
        "assets": "# Media files for animations (audio, images, videos)",
        "docs": "# Documentation of the project",
        "exports": "# Exported videos (reels, shorts, horizontal)",
        "notebooks": "# Notebooks for development and exploration",
        "presets": "# Reusable visual effects and style configurations",
        "scenes": "# Manim scenes (examples, storytelling, etc.)",
        "scripts": "# Automation tools and utility functions",
        "templates": "# Base classes and reusable components for scenes",
        "tests": "# Unit tests for ensuring stability"
    }

    if output_file is None:
        output_file = os.path.join(path, "directory_tree.txt")

    def render(directory, indent=""):
        """Return the tree lines for ``directory``, each prefixed with ``indent``."""
        names = sorted(os.listdir(directory))
        last_position = len(names) - 1
        rendered = []
        for position, name in enumerate(names):
            is_last = position == last_position
            branch = "└── " if is_last else "├── "
            note = f" {comments[name]}" if name in comments else ""
            rendered.append(f"{indent}{branch}{name}{note}")

            child = os.path.join(directory, name)
            if os.path.isdir(child):
                # Below the last entry the vertical guide line is dropped.
                child_indent = indent + ("    " if is_last else "│   ")
                rendered.extend(render(child, child_indent))
        return rendered

    # Build the listing before the output file is opened, so a file created
    # by this call does not appear in its own tree.
    tree_text = "\n".join(render(path))

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(tree_text)

    print(f"Directory tree saved to {output_file}")


In [None]:

if __name__ == "__main__":
    # Build the project skeleton under this hard-coded root, then write its
    # tree listing to directory_tree.txt inside the same folder (the default
    # output location of create_directory_tree_txt).
    root_dir = r"C:\Users\User\Projects_Unprotected\Media_Generators"
    create_project_structure(root_dir)
    create_directory_tree_txt(root_dir)


Project structure created at: C:\Users\User\Projects_Unprotected\Media_Generators\notebooks
Directory tree saved to C:\Users\User\Projects_Unprotected\Media_Generators\notebooks\directory_tree.txt


In [7]:

# Example usage
# Edit the path below to choose where the project skeleton is created
create_project_structure(r"C:\Users\User\Projects_Unprotected\Media_Generators")


Project structure created at: C:\Users\User\Projects_Unprotected\Media_Generators


In [10]:
# Write the tree listing of the project root to directory_tree.txt inside it
create_directory_tree_txt(r"C:\Users\User\Projects_Unprotected\Media_Generators")

Directory tree saved to C:\Users\User\Projects_Unprotected\Media_Generators\directory_tree.txt
