In [5]:
import os
import re
import tiktoken
import subprocess
import pandas as pd

encode = tiktoken.encoding_for_model("gpt-3.5-turbo").encode
count_token = lambda x: len(encode(x))

def remove_ignored_dirs(dirs, ignore_dirs):
    dirs[:] = [dir for dir in dirs if dir not in ignore_dirs]


def generate_tree(start_path: str, ignore_dirs: list, file_info_map: dict, indent_size: int = 1) -> str:
    tree_lines = []
    for root, dirs, files in os.walk(start_path):
        remove_ignored_dirs(dirs, ignore_dirs)
        level = root.replace(start_path, '').count(os.sep)
        indent = ' ' * indent_size * level
        tree_lines.append(f"{indent}{os.path.basename(root)}/")
        sub_indent = ' ' * indent_size * (level + 1)
        
        for file in files:
            relative_path = os.path.join(root, file).replace(start_path, '').lstrip("/")
            print(relative_path)
            file_info = file_info_map.get(relative_path)
            print(file_info)
            if file_info:
                tree_lines.append(
                    f"{sub_indent}{file} ({file_info['code']}, {file_info['comment']}, {file_info['blank']})"
                )
            else:
                tree_lines.append(f"{sub_indent}{file}")

    return 'Directory tree { file_name (Lines of Code (LOC): # lines Code, # lines Comments, # lines blank) } : \n\n' +'\n'.join(tree_lines[:-1])

FILE_EXTENSION_LANG_MAP = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".java": "java",
    ".c": "c",
    ".cpp": "cpp",
    ".cs": "csharp",
    ".php": "php",
    ".rb": "ruby",
    ".swift": "swift",
    ".go": "go",
    ".r": "r",
    ".m": "objective-c",
    ".pl": "perl",
    ".md": "markdown",
    ".tsx": "typescript",
    ".jsx": "javascript",
    "cjs": "javascript",
}

COMMENT_SYMBOL_MAP = {
    "python": "#",
    "javascript": "//",
    "typescript": "//",
    "java": "//",
    "c": "//",
    "cpp": "//",
    "csharp": "//",
    "php": "//",
    "ruby": "#",
    "swift": "//",
    "go": "//",
    "r": "#",
    "objective-c": "//",
    "perl": "#",
    "css": "/*"
}

def get_lang_from_extension(file_extension: str) -> str:
    return FILE_EXTENSION_LANG_MAP.get(file_extension, file_extension[1:])

def process_file(root: str, file: str, relative_path: str, no_formatting: bool, print_token_counts: bool):
    lang = get_lang_from_extension(os.path.splitext(file)[-1])
    comment_symbol = COMMENT_SYMBOL_MAP.get(lang, "//")

    try:
        with open(os.path.join(root, file), "r", encoding='utf-8') as infile:
            data = infile.read()
    except UnicodeDecodeError:
        print(f"Error reading {relative_path}")
        return None

    if no_formatting:
        comment_pattern = re.escape(comment_symbol) + '.*$'
        data = re.sub(comment_pattern, ' ', data, flags=re.MULTILINE)
        data = re.sub(r'(?<=\n)[ \t]*(?=\n)', '', data)
        # remove all newlines
        data = re.sub(r'\n', ' ', data)
        data = re.sub(r';+', '; ', data)
        data = re.sub(r' +', ' ', data)

    token_count = count_token(data) + count_token(relative_path) if print_token_counts else 0

    return {
        "path": relative_path,
        "token_count": token_count,
        "data": data,
        "comment_symbol": comment_symbol,
        "lang": lang
    }

def walk_files(config: dict, data: dict):
    cwd = os.getcwd()
    outputs = []

    zero_target_files = not config["target_files"]
    zero_allowed_extensions = not config["allowed_extensions"]

    for root, dirs, files in os.walk(config['path']):
        remove_ignored_dirs(dirs, config['ignore_dirs'])

        for file in files:
            if zero_allowed_extensions or (file.endswith(config['allowed_extensions']) and not file.endswith(config['ignored_extensions'])):
                if file in config['ignore_files']:
                    continue
                if zero_target_files or file in config['target_files']:
                    relative_path = f'{root.replace(cwd, "")}/{file}'
                    if relative_path[0] == '/':
                        relative_path = relative_path[1:]

                    output = process_file(root, file, relative_path, config['no_formatting'], config['print_token_counts'])
                    if output and output['data']:
                        outputs.append(output)

    outputs = sorted(outputs, key=lambda x: os.path.splitext(x['path'])[-1])
    sept = '\n\n---\n\n'
    tree = generate_tree(config['path'], config['ignore_dirs'], file_info_map, indent_size=2) + sept
    generate_output_content = lambda x: f"```{x['lang']}\n{x['comment_symbol']} {x['path']}\n{x['data']}\n```"

    if config['print_token_counts']:
        out_string = tree + sept.join([f"count: {x['token_count']}\n{generate_output_content(x)}" for x in outputs])
    else:
        out_string = tree + sept.join([generate_output_content(x) for x in outputs])

    total_tokens = count_token(out_string)

    with open(config['output_file'], "w", encoding='utf-8') as outfile:
        outfile.write(f'total: {total_tokens}\n\n{out_string}')

config = {
    'path': "BGOV_repos/bgov-ga-mybgov", #os.getcwd(), #"src/components/game", #
    'output_file': "output.txt",
    'target_files': [], # [],
    'allowed_extensions': (), 
    'ignored_extensions': (),
    'print_token_counts': True,
    'no_formatting': True,
    'ignore_dirs': ["migrations", "favicon", ".husky", ".next", "node_modules", ".git", ".vscode", "__pycache__", "old", "test_page", "dist"],
    'ignore_files': ["example.ts", "output.txt", "yarn.lock", "package-lock.json", "package.json", "tsconfig.json", "run.ipynb", "tailwind.config.cjs"],
}

cwd = os.getcwd()

completed_process = subprocess.run(
    ["cloc", "./", "--by-file", "--csv", "--quiet"],
    check=True,
    text=True,
    capture_output=True,
    cwd=config["path"]
)

file_info = completed_process.stdout
file_info_df = pd.DataFrame([line.split(",") for line in file_info.split("\n")])
file_info_df.columns = file_info_df.iloc[0]
file_info_df = file_info_df[1:-2]
file_info_df = file_info_df[["language", "filename", "blank", "comment", "code"]]
file_info_df["blank"] = file_info_df["blank"].astype(int)
file_info_df["comment"] = file_info_df["comment"].astype(int)
file_info_df["code"] = file_info_df["code"].astype(int)
file_info_df["filename"] = file_info_df["filename"].apply(lambda x: x.replace("./", ""))
file_info_df.sort_values(by=["code"], ascending=False, inplace=True)
file_info = file_info_df.to_dict(orient="records")
file_info_map = {x["filename"]: x for x in file_info}

walk_files(config, file_info_map)
os.system("code output.txt")

Error reading BGOV_repos/bgov-ga-mybgov/public/favicon.ico
.db-migraterc
None
template.yml
{'language': 'YAML', 'filename': 'template.yml', 'blank': 54, 'comment': 39, 'code': 1198}
nodemon.json
{'language': 'JSON', 'filename': 'nodemon.json', 'blank': 0, 'comment': 0, 'code': 14}
schema.graphql
{'language': 'GraphQL', 'filename': 'schema.graphql', 'blank': 47, 'comment': 2, 'code': 376}
Dockerfile
{'language': 'Dockerfile', 'filename': 'Dockerfile', 'blank': 10, 'comment': 11, 'code': 13}
buildspec.yml
{'language': 'YAML', 'filename': 'buildspec.yml', 'blank': 1, 'comment': 0, 'code': 21}
set_env.ksh
{'language': 'Korn Shell', 'filename': 'set_env.ksh', 'blank': 0, 'comment': 0, 'code': 12}
jest.config.js
{'language': 'JavaScript', 'filename': 'jest.config.js', 'blank': 0, 'comment': 0, 'code': 12}
newrelic.js
{'language': 'JavaScript', 'filename': 'newrelic.js', 'blank': 0, 'comment': 31, 'code': 23}
webpack.config.js
{'language': 'JavaScript', 'filename': 'webpack.config.js', 'blank

sh: code: command not found


32512