<a href="https://colab.research.google.com/github/HirotoShioi/repo-digest-tool/blob/main/RepoDigest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Repo Digest Tool

Extract and summarize contents from GitHub repositories with advanced filtering options, tailored for LLM data preparation—all within Google Colab.

## Description

This tool allows you to clone a GitHub repository, filter its files based on specified criteria, aggregate their contents, and download the combined result—all from within a Google Colab environment. It's particularly useful for preparing data for Large Language Models (LLMs) by extracting relevant code and documentation from repositories.

## Features

- **Clone Repositories**: Clone public or private GitHub repositories directly in Colab.
- **Advanced Filtering**:
  - Filter files by target directories and file extensions.
  - Ignore specific files and directories based on patterns.
- **Content Aggregation**: Combine the contents of filtered files into a single text file.
- **Easy Download**: Automatically download the aggregated file within the Colab interface.
- **Customizable**: Adjust parameters to suit different repositories and requirements.

In [1]:
import os
import glob
import fnmatch
import shutil
from datetime import datetime
from typing import Optional, List
from google.colab import files
import subprocess


def download_repo(repo_url: str, repo_id: str, github_token: Optional[str] = None, branch: Optional[str] = None):
    """
    Shallow-clone the specified GitHub repository into ``tmp/<repo_id>``.

    Any directory already present at the destination is removed first.

    Args:
        repo_url: HTTPS URL of the repository; must start with
            ``https://github.com/``.
        repo_id: Name of the clone directory created under ``tmp``.
        github_token: Optional access token. When given, it is embedded in
            the clone URL as credentials.
        branch: Optional branch name passed to ``git clone --branch``.

    Raises:
        ValueError: If ``repo_url`` does not start with ``https://github.com/``.
        RuntimeError: If ``git clone`` exits with a non-zero status. The
            message never contains ``github_token``.
    """
    github_prefix = "https://github.com/"
    if not repo_url.startswith(github_prefix):
        raise ValueError("Invalid GitHub URL. Please provide a valid GitHub repository URL.")

    # Ensure tmp directory exists
    os.makedirs("tmp", exist_ok=True)

    repo_path = f"tmp/{repo_id}"
    # git refuses to clone into a non-empty directory, so drop any old clone.
    if os.path.exists(repo_path):
        print(f"Cleaning up existing directory: {repo_path}")
        shutil.rmtree(repo_path)

    cmd = ["git", "clone", "--depth=1"]
    if branch:
        cmd.extend(["--branch", branch])

    if github_token:
        # Embed the token into the URL (only the leading scheme/host part).
        repo_url = repo_url.replace(
            github_prefix, f"https://{github_token}:x-oauth-basic@github.com/", 1
        )
    cmd.append(repo_url)
    cmd.append(repo_path)  # Specify the clone destination directory

    # Execute the command. stderr is captured so that git's own diagnostic
    # can be reported instead of the raw command line.
    try:
        subprocess.run(cmd, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        detail = (e.stderr or "").strip() or f"git clone exited with status {e.returncode}"
        if github_token:
            # Never let the credential reach logs or notebook output.
            detail = detail.replace(github_token, "***")
        # `from None`: the chained CalledProcessError would print the full
        # command, including the token-bearing URL, in any traceback.
        raise RuntimeError(f"Failed to clone the repository: {detail}") from None


def process_repo(
    repo_id: str,
    target_dir: Optional[str] = None,
    extensions: Optional[List[str]] = None,
    ignore_files: Optional[List[str]] = None,
    ignore_dirs: Optional[List[str]] = None,
):
    """
    Filter files in the repository based on specified conditions and combine them into a single file.

    Args:
        repo_id: Name of the clone directory under ``tmp``.
        target_dir: Glob pattern (or a list of glob patterns) relative to the
            repository root selecting candidate files. Defaults to ``**``
            (the entire repository).
        extensions: ``fnmatch`` patterns (e.g. ``*.py``) matched against each
            file's repository-relative path. An empty or missing list keeps
            every file.
        ignore_files: ``fnmatch`` patterns matched against each file's
            repository-relative path; matching files are skipped.
        ignore_dirs: Glob patterns relative to the repository root; files
            located under any matched path are skipped.

    Returns:
        Path of the generated digest file, ``tmp/<repo_id>_digest_<timestamp>.txt``.
        The digest starts with the list of included files, followed by the
        content of each file under a ``# <relative path>`` header.

    Raises:
        ValueError: If ``tmp/<repo_id>`` does not exist.
    """
    repo_path = f"tmp/{repo_id}"
    if not os.path.exists(repo_path):
        raise ValueError(f"Repository path '{repo_path}' does not exist.")

    # Set default values
    extensions = extensions or []
    ignore_files = ignore_files or []
    ignore_dirs = ignore_dirs or []

    # Process target patterns
    if isinstance(target_dir, str):
        target_patterns = [target_dir.strip()]
    elif isinstance(target_dir, list):
        target_patterns = [pattern.strip() for pattern in target_dir]
    else:
        target_patterns = ["**"]  # Default to target the entire repository

    # Expand ignore directory patterns. Matches are normalized because glob
    # keeps a trailing separator for patterns such as "node_modules/", while
    # os.path.commonpath never returns one; without this the comparison below
    # would silently never match.
    ignore_dir_paths = set()
    for ignore_pattern in ignore_dirs:
        full_ignore_pattern = os.path.join(repo_path, ignore_pattern.strip())
        ignore_dir_paths.update(
            os.path.normpath(match)
            for match in glob.glob(full_ignore_pattern, recursive=True)
        )

    # File filtering
    filtered_files = []
    seen_relative_paths = set()  # overlapping target patterns must not duplicate files
    for pattern in target_patterns:
        full_pattern = os.path.join(repo_path, pattern)
        for file_path in glob.glob(full_pattern, recursive=True):
            if not os.path.isfile(file_path):
                continue
            relative_path = os.path.relpath(file_path, repo_path)
            if relative_path in seen_relative_paths:
                continue
            # Check if the file is in an ignored directory
            if any(
                os.path.commonpath([file_path, ignore_dir]) == ignore_dir
                for ignore_dir in ignore_dir_paths
            ):
                continue
            # Check if the file matches any ignore file patterns
            if any(fnmatch.fnmatch(relative_path, ignored) for ignored in ignore_files):
                continue
            # Filter by extensions
            if extensions and not any(
                fnmatch.fnmatch(relative_path, wanted) for wanted in extensions
            ):
                continue
            seen_relative_paths.add(relative_path)
            filtered_files.append(file_path)

    # Create a list of files
    file_list = [os.path.relpath(file_path, repo_path) for file_path in filtered_files]

    # Combine file contents
    output_content = ["\n".join(file_list)]  # Add the file list at the beginning
    for file_path, relative_path in zip(filtered_files, file_list):
        # relative_path is bound before the try so a failure is always
        # reported against the file that actually failed.
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                output_content.append(f"# {relative_path}\n{f.read()}\n")
        except OSError as e:
            output_content.append(f"# Error reading file {relative_path}: {e}\n")

    # Add timestamp to the output file name
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"tmp/{repo_id}_digest_{timestamp}.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(output_content))

    return output_path


def download_digest(file_path: str):
    """
    Hand the generated digest file to the Colab download helper.

    Prints a notice instead of downloading when ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        print("File not found:", file_path)
        return
    files.download(file_path)


def main(
    repo_url: str,
    github_token: Optional[str],
    branch: Optional[str] = None,
    target_dir: Optional[str] = None,
    extensions: Optional[List[str]] = None,
    ignore_files: Optional[List[str]] = None,
    ignore_dirs: Optional[List[str]] = None,
):
    """
    Main Process: clone the repository, build the digest, download it, and
    remove the clone.

    Args:
        repo_url: HTTPS URL of the GitHub repository.
        github_token: Access token forwarded to ``download_repo``, or ``None``.
        branch: Optional branch forwarded to ``download_repo``.
        target_dir: Target glob pattern(s) forwarded to ``process_repo``.
        extensions: Extension patterns forwarded to ``process_repo``.
        ignore_files: Ignore-file patterns forwarded to ``process_repo``.
        ignore_dirs: Ignore-directory patterns forwarded to ``process_repo``.

    Any exception raised by a step is caught and printed rather than
    propagated; the clone directory is removed whether or not the run
    succeeded.
    """
    # Derive the clone directory name from the last URL segment. Trailing
    # slashes are dropped first (otherwise the name would be empty and the
    # clone path would collapse to "tmp/"), and ".git" is removed only as a
    # suffix so names such as "my.github.io" stay intact.
    repo_id = repo_url.rstrip("/").split("/")[-1]
    if repo_id.endswith(".git"):
        repo_id = repo_id[: -len(".git")]
    clone_path = f"tmp/{repo_id}"

    try:
        if not repo_id:
            raise ValueError("Could not determine the repository name from the URL.")

        print("Cloning repository...")
        download_repo(repo_url, repo_id, github_token, branch)

        print("Processing repository...")
        digest_path = process_repo(
            repo_id,
            target_dir=target_dir,
            extensions=extensions,
            ignore_files=ignore_files,
            ignore_dirs=ignore_dirs,
        )

        print("Downloading digest...")
        download_digest(digest_path)

    except Exception as e:
        # Top-level boundary of the notebook cell: report instead of raising.
        print("Error:", e)

    finally:
        # Remove the clone even when a step failed, so a failed run does not
        # leave it behind. The repo_id guard keeps this from ever targeting
        # the whole "tmp/" directory.
        if repo_id and os.path.isdir(clone_path):
            print("Cleaning up...")
            try:
                shutil.rmtree(clone_path)
            except OSError as e:
                print("Error:", e)

In [2]:
# Delete everything before it runs
!rm -rf /content/tmp/

# Entry point: adjust the settings below, then run the cell.
if __name__ == "__main__":
    # Set a GitHub access token here if the repository requires one.
    github_token = None

    # Keyword arguments handed to main() unchanged.
    parameters = dict(
        repo_url="https://github.com/HirotoShioi/query-cache",
        github_token=github_token,
        branch=None,
        target_dir=["packages/**"],
        extensions=None,
        ignore_files=["*.ttf", "*.png", "pnpm-lock.yaml", "*.pdf", "*.svg"],
        ignore_dirs=None,
    )

    main(**parameters)

Cloning repository...
Processing repository...
Downloading digest...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Cleaning up...
