In [None]:
# 1. 安装 OpenAI 提供的 tiktoken 包和 GitPython 包

In [None]:
!pip install tiktoken gitpython

# 2.下载指定 Git 仓库代码并统计其 token 数

In [None]:
import os
import tiktoken
import git
from pathlib import Path
import json

# File extensions that are counted when no saved configuration is found.
DEFAULT_FILE_TYPES = [".py", ".js", ".java", ".cpp", ".html", ".css"]
# JSON file (relative path, so resolved against the working directory) that
# stores the user's file-type configuration.
CONFIG_FILE = "token_counter_config.json"

def load_config():
    """Load the token-counter configuration from ``CONFIG_FILE``.

    Returns:
        dict: The configuration, always containing a ``"file_types"`` entry.
        The defaults are used when the file does not exist, when its
        top-level JSON value is not an object, or when the ``"file_types"``
        key is absent.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON.
    """
    # EAFP: open directly instead of checking for existence first, which
    # avoids a race between the check and the open.
    try:
        with open(CONFIG_FILE, 'r') as f:
            config = json.load(f)
    except FileNotFoundError:
        config = None

    # A config that is not a JSON object cannot be indexed by key.
    if not isinstance(config, dict):
        config = {}

    # Hand out a copy of the defaults so that callers mutating the returned
    # list cannot alter the module-level constant.
    config.setdefault("file_types", list(DEFAULT_FILE_TYPES))
    return config

def save_config(config):
    """Write *config* to ``CONFIG_FILE`` as JSON indented by two spaces.

    Any existing file at that path is overwritten.
    """
    with open(CONFIG_FILE, mode='w') as config_file:
        json.dump(config, fp=config_file, indent=2)

def clone_repo(repo_url, local_path):
    """Clone the Git repository at *repo_url* into *local_path*.

    Delegates to GitPython's ``Repo.clone_from``; the resulting ``Repo``
    object is discarded and nothing is returned.
    """
    repository_cls = git.Repo
    repository_cls.clone_from(repo_url, local_path)

def count_tokens(file_path, encoding_name="o200k_base"):
    """Return the number of tokens in the text file at *file_path*.

    Args:
        file_path: Path of the file to read.
        encoding_name: Name of a tiktoken encoding, resolved with
            ``tiktoken.get_encoding`` (for example ``"o200k_base"`` or
            ``"cl100k_base"``).

    Returns:
        int: Length of the token sequence produced for the file's content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Files in an arbitrary repository are not guaranteed to be valid UTF-8;
    # substitute undecodable bytes instead of aborting the whole count.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()
    # Source text may literally contain special-token markers such as
    # "<|endoftext|>". By default encode() raises ValueError on them;
    # disallowed_special=() makes them be encoded as ordinary text.
    return len(encoding.encode(content, disallowed_special=()))

def process_repository(repo_path, file_types):
    """Count tokens in every matching file under *repo_path*.

    Walks the directory tree, skipping Git's ``.git`` metadata directory,
    and prints one line per counted file.

    Args:
        repo_path: Root directory of the checked-out repository.
        file_types: Iterable of file-name suffixes (e.g. ``".py"``); a file
            is counted when its name ends with any of them.

    Returns:
        int: Sum of the token counts of all matching files (0 if none match).
    """
    # str.endswith accepts a tuple, so build it once instead of looping over
    # the suffixes for every file.
    suffixes = tuple(file_types)
    total_tokens = 0
    for root, dirs, files in os.walk(repo_path):
        # Prune ".git" in place (works because os.walk is top-down by
        # default): it holds no source code, can be large, and may contain
        # ref files whose names happen to end in a configured suffix.
        dirs[:] = [d for d in dirs if d != ".git"]
        for file in files:
            if not file.endswith(suffixes):
                continue
            file_path = os.path.join(root, file)
            tokens = count_tokens(file_path)
            total_tokens += tokens
            print(f"{file_path}: {tokens} tokens")
    return total_tokens

def main():
    """Interactively count the tokens in a Git repository's source files.

    Shows the configured file types, optionally lets the user replace and
    save them, asks for a repository URL, clones the repository into a
    temporary directory, prints per-file and total token counts, and removes
    the clone afterwards (also when cloning or counting fails).
    """
    # Local import: only this entry point needs a scratch directory.
    import tempfile

    config = load_config()

    print(f"当前配置的文件类型: {', '.join(config['file_types'])}")
    # Strip so that answers such as "y " are still recognised.
    change_config = input("是否要修改文件类型配置? (y/n): ").strip().lower() == 'y'

    if change_config:
        new_types = input("请输入新的文件类型,用逗号分隔 (例如: .py,.js,.java): ").split(',')
        # Drop empty entries produced by stray commas or whitespace.
        config['file_types'] = [t.strip() for t in new_types if t.strip()]
        save_config(config)
        print(f"配置已更新: {', '.join(config['file_types'])}")

    # Pasted URLs often carry surrounding whitespace, which would break the clone.
    repo_url = input("请输入 GitHub 仓库 URL: ").strip()

    # Use a fresh, uniquely named directory rather than a fixed "./temp_repo":
    # a leftover or unrelated directory of that name would make the clone fail
    # and would then be deleted during cleanup. The context manager removes
    # the directory on exit, including when an exception is raised.
    with tempfile.TemporaryDirectory() as temp_dir:
        local_path = os.path.join(temp_dir, "temp_repo")

        print("正在克隆仓库...")
        clone_repo(repo_url, local_path)

        print("正在计算 token 数...")
        total_tokens = process_repository(local_path, config['file_types'])

        print(f"\n仓库中所有配置的代码文件的总 token 数: {total_tokens}")

# Start the interactive tool only when this code runs as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()