In [2]:
import os
import tiktoken


def _collect_source_files(directory, output_file):
    """Walk *directory* and group the files to export by fence language.

    Exclusion rules (all plain substring tests):
      * any directory path or file name containing "target", "models" or
        ".vscode" is skipped;
      * any file name containing "Cargo.lock" is skipped;
      * any path containing *output_file* is skipped, so the generated
        document is never embedded in itself.

    Returns a list of ``(language, paths)`` pairs in emission order:
    rust, toml, json, markdown. Paths inside each group are in sorted
    traversal order, so the result does not depend on the order in which
    the filesystem enumerates entries.
    """
    excluded_markers = ("target", "models", ".vscode")
    sections = [
        (".rs", "rust", []),
        (".toml", "toml", []),
        (".json", "json", []),
        (".md", "markdown", []),
    ]

    for root, dirs, files in os.walk(directory):
        if any(marker in root for marker in excluded_markers):
            # Every descendant path contains the same marker, so nothing
            # below here can be selected: prune instead of walking it.
            dirs[:] = []
            continue
        # In-place sort makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            if any(marker in file for marker in excluded_markers):
                continue
            if "Cargo.lock" in file:
                continue
            file_path = os.path.join(root, file)
            if output_file in file_path:
                continue
            for suffix, _language, paths in sections:
                if file.endswith(suffix):
                    paths.append(file_path)
                    break

    return [(language, paths) for _suffix, language, paths in sections]


def codebase_to_md(directory):
    """Dump a codebase into ``output.md`` and count its tokens.

    Every ``.rs``, ``.toml``, ``.json`` and ``.md`` file under *directory*
    (minus the exclusions applied by ``_collect_source_files``) is written
    to ``output.md`` in the current working directory as a bold path
    heading followed by a fenced code block. All Rust files come first,
    then TOML, JSON and Markdown.

    Args:
        directory: Root folder to scan.

    Returns:
        A ``(content, tokens)`` tuple: the generated Markdown with every
        literal ``<|endoftext|>`` removed, and the number of
        ``cl100k_base`` tokens in that text.

    Raises:
        OSError: If a selected file cannot be opened or ``output.md``
            cannot be written.
        UnicodeDecodeError: If a selected file is not valid UTF-8.
    """
    output_file = "output.md"

    parts = []
    for language, file_paths in _collect_source_files(directory, output_file):
        for file_path in file_paths:
            with open(file_path, "r", encoding="utf-8") as code_file:
                source = code_file.read()
            parts.append(f"**{file_path}**\n```{language}\n{source}\n```\n\n")
    document = "".join(parts)

    # Mode "w" truncates any previous output, so no explicit delete is
    # needed. Writing once at the end also leaves the old file intact if
    # reading a source file fails part-way through.
    with open(output_file, "w", encoding="utf-8") as markdown_file:
        markdown_file.write(document)

    # Sources were read with universal newlines, so the in-memory text is
    # what reading output.md back would give; no second read is required.
    content = document.replace("<|endoftext|>", "")

    encoding = tiktoken.get_encoding("cl100k_base")
    # Source code may legitimately contain other special-token spellings
    # (e.g. "<|endofprompt|>"); encode them as ordinary text rather than
    # letting tiktoken raise ValueError.
    tokens = len(encoding.encode(content, disallowed_special=()))
    return content, tokens


# Folder whose sources are exported; "." is the current working directory.
directory = "."  # Replace with your folder path
# Run the export (writes output.md) and keep the text plus its token count.
content, tokens = codebase_to_md(directory)
# Report the token count of the generated Markdown.
print(f"Tokens: {tokens}")

Tokens: 54891
