In [10]:
import os
import tiktoken


# File names (exact match) that are emitted in the first, "yaml"-fenced group
# even though they carry no recognised suffix.
_CONFIG_FILE_NAMES = frozenset(
    {"Dockerfile", "requirements.txt", "static-requirements.txt"}
)

# Fence language -> file suffixes. Insertion order is the order in which the
# groups are written to the Markdown document.
_LANGUAGE_SUFFIXES = {
    "yaml": (".yml",),
    "powershell": (".ps1",),
    "python": (".py",),
    "javascript": (".js", ".jsx"),
    "typescript": (".ts", ".tsx"),
    "kotlin": (".kt", ".java"),
    "lua": (".lua",),
    "xml": (".xml",),
    "markdown": (".md",),
    "json": (".json",),
    "graphql": (".gql",),
    "shell": (".sh",),
}


def _fence_language(root, name):
    """Return the fence language for file *name* in *root*, or None to skip it."""
    if (
        "node_modules" in name
        or "package-lock.json" in name
        or ".stories." in name
    ):
        return None
    if name in _CONFIG_FILE_NAMES:
        return "yaml"
    for language, suffixes in _LANGUAGE_SUFFIXES.items():
        if name.endswith(suffixes):
            # XML is only collected from directories that have a path
            # component named exactly "layout"; all other XML is ignored.
            if language == "xml" and "layout" not in root.split(os.path.sep):
                return None
            return language
    return None


def codebase_to_md(directory):
    """Dump the source files under *directory* into one Markdown document.

    Every collected file becomes a ``**path**`` heading followed by a fenced
    code block. Files are grouped by fence language in the order of
    ``_LANGUAGE_SUFFIXES`` and, within a group, in sorted walk order so that
    repeated runs over the same tree produce the same document.

    The document is written to ``output.md`` in the current working
    directory (any previous copy is removed first).

    Args:
        directory: Root folder to scan recursively.

    Returns:
        A ``(content, tokens)`` tuple: the Markdown text with every
        ``<|endoftext|>`` marker removed, and the number of ``cl100k_base``
        tokens that text encodes to.
    """
    output_file = "output.md"

    # Drop a stale result up front so a failed run never leaves old output.
    try:
        os.remove(output_file)
    except FileNotFoundError:
        pass

    paths_by_language = {language: [] for language in _LANGUAGE_SUFFIXES}
    for root, dirs, files in os.walk(directory):
        # Prune in place: os.walk will not descend into removed entries, so
        # dependency trees are never traversed. Sorting fixes the visit order.
        dirs[:] = sorted(d for d in dirs if "node_modules" not in d)
        if "node_modules" in root:
            # Only reachable when *directory* itself lies inside node_modules.
            continue
        for name in sorted(files):
            # Never embed a generated document, wherever it lives. Comparing
            # the file name (not a substring of the path) keeps unrelated
            # files such as "my_output.md" in the result.
            if name == output_file:
                continue
            language = _fence_language(root, name)
            if language is not None:
                paths_by_language[language].append(os.path.join(root, name))

    sections = []
    for language, paths in paths_by_language.items():
        for path in paths:
            # errors="replace" keeps one badly encoded file from aborting the
            # whole export; undecodable bytes become U+FFFD.
            with open(path, "r", encoding="utf-8", errors="replace") as code_file:
                body = code_file.read()
            sections.append(f"**{path}**\n```{language}\n{body}\n```\n\n")

    document = "".join(sections)
    with open(output_file, "w", encoding="utf-8") as markdown_file:
        markdown_file.write(document)

    content = document.replace("<|endoftext|>", "")
    encoding = tiktoken.get_encoding("cl100k_base")
    # Scanned source may legitimately contain other special-token spellings;
    # disallowed_special=() makes the tokenizer treat them as ordinary text
    # instead of raising ValueError.
    tokens = len(encoding.encode(content, disallowed_special=()))
    return content, tokens


# Folder to export; replace with the path of the project you want to dump.
directory = "."

# Build output.md and report how many tokens the combined document uses.
content, tokens = codebase_to_md(directory=directory)
print(f"Tokens: {tokens}")

Tokens: 17335
