In [36]:
import os
import re

import tiktoken

# --- Run configuration -------------------------------------------------------

# Directory whose files are collected into the dump.
path = "gateway/bloomberg/blaw/llm_gateway_client"

# Fence label written on every code block, and the line-comment marker used
# both for stripping comments and for the per-file path header.
lang = "python"
comment_symbol = "#"

# Directory names pruned from the walk, and file names that are never dumped.
ignore_dirs = [
    "node_modules",
    ".git",
    ".vscode",
    "__pycache__",
]
ignore_files = [
    "package-lock.json",
]

# Suffix filters; both are tuples because they are handed to str.endswith().
allowed_extensions = (
    ".ts",
    ".tsx",
    ".js",
    ".jsx",
    ".json",
    ".cjs",
    ".py",
)
ignored_extensions = ()

# When True, each dumped file is preceded by its own token count.
print_token_counts = False

# Restrict the dump to these file names; an empty list means "every file".
target_files = []



# Bound `encode` method of the tiktoken encoding selected for gpt-3.5-turbo;
# it turns a string into a list of tokens.
encode = tiktoken.encoding_for_model("gpt-3.5-turbo").encode


def count_token(x):
    """Return how many tokens `encode` produces for the string *x*."""
    return len(encode(x))

def generate_tree(start_path, ignore_dirs = ignore_dirs):
    """Render the directory tree below *start_path* as indented text.

    Every directory is written as ``name/`` and every file as its bare name,
    one entry per line, indented by one space per nesting level (files sit
    one level deeper than the directory that contains them).

    Args:
        start_path: Directory to walk. A trailing path separator is
            tolerated.
        ignore_dirs: Directory names that are neither listed nor descended
            into. Defaults to the module-level ``ignore_dirs``.

    Returns:
        The tree as a single string without a trailing newline, or ``""``
        when the walk yields nothing (for example, *start_path* does not
        exist).
    """
    lines = []
    for root, dirs, files in os.walk(start_path):
        # Prune in place: os.walk only skips a directory if it is removed
        # from the very list object it handed out.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]

        # Depth is derived from the path relative to the start directory.
        # Plain string replacement of start_path would also delete later
        # occurrences of the same sequence and miscount when start_path
        # ends with a separator.
        rel = os.path.relpath(root, start_path)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1

        # normpath drops a trailing separator so the root keeps its name.
        name = os.path.basename(os.path.normpath(root))
        lines.append(f"{' ' * level}{name}/")

        sub_indent = ' ' * (level + 1)
        lines.extend(f"{sub_indent}{f}" for f in files)

    return "\n".join(lines)

def _compact_source(data, comment_symbol):
    """Strip line comments from *data* and collapse its whitespace."""
    # Purely textual: everything from comment_symbol to end of line is
    # replaced by a space, even when the symbol sits inside a string literal.
    comment_pattern = re.escape(comment_symbol) + '.*$'
    data = re.sub(comment_pattern, ' ', data, flags=re.MULTILINE)
    # Empty out whitespace-only lines so the newline run below swallows them.
    data = re.sub(r'(?<=\n)[ \t]+(?=\n)', '', data)
    data = re.sub('\n+', '<newline>', data)

    data = data.replace(';', '; ')
    return re.sub(r' +', ' ', data)


def walk_files(
    path = "",
    target_files=target_files,
    output_file = "output.txt",
    lang="typescript",
    ignore_dirs = ignore_dirs,
    print_token_counts = True,
    ignore_files = ignore_files,
    comment_symbol = "",
    allowed_extensions = allowed_extensions,
    ignored_extensions = ignored_extensions,
):
    """Dump the source files below *path* into one compact text file.

    The output file starts with a total token count, followed by the
    directory tree and then one fenced block per file. Files are ordered by
    extension; each block starts with a comment line holding the file's
    path relative to the current working directory, followed by the file's
    content with line comments removed and whitespace collapsed (newline
    runs become the literal ``<newline>``).

    Args:
        path: Directory to walk; ``""`` means the current working directory.
        target_files: File names to include. An empty (or ``None``) value
            includes every file that passes the other filters.
        output_file: Path of the text file to write.
        lang: Label placed after the opening fence of every block,
            whatever the file's actual type.
        ignore_dirs: Directory names that are not descended into; also
            applied to the rendered tree.
        print_token_counts: When true, each block is preceded by the token
            count of its content plus its path.
        ignore_files: File names that are always skipped.
        comment_symbol: Line-comment marker used for stripping and for the
            path header; ``""`` means ``"//"``.
        allowed_extensions: Tuple of suffixes a file name must end with.
        ignored_extensions: Tuple of suffixes that exclude a file name.

    Returns:
        None. The result is written to *output_file*.

    Note:
        The reported total covers the file contents and the tree only, not
        the fences, path headers or per-file counts.
    """
    if path == "":
        path = os.getcwd()
    if comment_symbol == "":
        comment_symbol = "//"

    # Truthiness rather than `== []`, so None or an empty tuple also means
    # "no restriction" instead of matching nothing or raising.
    include_all = not target_files

    cwd = os.getcwd()
    outputs = []

    for root, dirs, files in os.walk(path):
        # Prune in place so os.walk does not descend into ignored dirs.
        dirs[:] = [d for d in dirs if d not in ignore_dirs]

        for file in files:
            if not file.endswith(allowed_extensions) or file.endswith(ignored_extensions):
                continue
            if file in ignore_files:
                continue
            if not include_all and file not in target_files:
                continue

            full_path = os.path.join(root, file)
            try:
                # relpath only removes the cwd as a real path prefix; string
                # replacement would mangle any path that merely contains it.
                relative_path = os.path.relpath(full_path, cwd)
            except ValueError:
                # No relative form exists (e.g. another drive on Windows).
                relative_path = full_path

            # Decode explicitly as UTF-8 and substitute undecodable bytes so
            # one odd file cannot abort the whole dump.
            with open(full_path, "r", encoding="utf-8", errors="replace") as infile:
                raw = infile.read()

            data = _compact_source(raw, comment_symbol)
            if not data:
                continue

            if print_token_counts:
                token_count = count_token(data) + count_token(relative_path)
            else:
                token_count = 0

            outputs.append({
                "path": relative_path,
                "token_count": token_count,
                "data": data
            })

    # Group by extension; the sort is stable, so walk order is kept within
    # each extension.
    outputs.sort(key=lambda x: x['path'].split('.')[-1])
    sept = '\n\n---\n\n'
    # Pass ignore_dirs on so the tree matches the set of walked directories.
    tree = generate_tree(path, ignore_dirs) + sept

    total_token_count = count_token(''.join(x['data'] for x in outputs) + tree)

    def render(entry):
        """Format one collected file as a fenced block with a path header."""
        block = f"```{lang}\n{comment_symbol} {entry['path']}\n{entry['data']}\n```"
        if print_token_counts:
            return f"{entry['token_count']}\n{block}"
        return block

    out_string = tree + sept.join(render(x) for x in outputs)

    with open(output_file, "w", encoding="utf-8") as outfile:
        outfile.write(f'Total token count: {total_token_count}\n\n{out_string}')

# Build the dump for the configured project. No output_file is passed, so
# walk_files writes to its default, "output.txt".
walk_files(
    path,
    lang=lang,
    comment_symbol=comment_symbol,
    print_token_counts=print_token_counts,
)
# Open the result with the `code` command (presumably the VS Code CLI).
os.system("code output.txt")


0