In [12]:
import tiktoken

def count_chat_input_tokens(messages, model="gpt-4o-mini", *, encoding=None):
    """
    Estimate the input-token count of a list of chat messages.

    The estimate is the tokenized length of each message's content plus a
    fixed per-message overhead and a fixed per-request overhead. Only the
    "content" field is tokenized; other fields (such as an optional "name")
    are not counted separately.

    messages: Iterable of dicts like [{"role": "user", "content": "text"}, ...].
        A missing or None "content" (e.g. an assistant message that only
        carries tool calls) contributes no content tokens.
    model: OpenAI model name used to select the tiktoken tokenizer.
        Ignored when `encoding` is given.
    encoding: Optional tokenizer object exposing `encode(text)` and returning
        a sized sequence of tokens. Pass one to reuse an encoder across calls
        or to choose an encoding explicitly.

    Returns: total token count (int)
    """
    if encoding is None:
        try:
            encoding = tiktoken.encoding_for_model(model)
        except KeyError:
            # tiktoken raises KeyError for model names it has no mapping for.
            # Fall back to o200k_base (the encoding tiktoken maps the gpt-4o
            # family to) instead of failing on a new or custom model name.
            encoding = tiktoken.get_encoding("o200k_base")

    # Chat-format framing: each message is wrapped in role/start/end markers.
    # That is approximated here as 4 tokens per message (framing plus a short
    # role name), and the request as a whole adds 3 priming tokens.
    tokens_per_message = 4
    tokens_per_request = 3

    num_tokens = tokens_per_request
    for message in messages:
        num_tokens += tokens_per_message
        content = message.get("content")
        if content is not None:
            num_tokens += len(encoding.encode(content))

    return num_tokens

# Sample prompt whose input tokens are counted below: it asks the model to
# pick the valid English words out of a list of mostly random-looking strings.
user_message = dict(
    role="user",
    content="List only the valid English words from these: " + ", ".join(
        """
        h8t4Wf om APcmdN9 MpZ33uyd B1pXR4u
        fsTFXlbim nrClRN Sg GTIaS HfSJ2 cofU1 ahG CDRCrg3p sJmYL T1 gfOlYL MY670jXG7Z
        F 5Weri zc vZkBo4tvO 3ABe sACmF2txy0 CzilMo OgdeyOFa Ez cl67HT Z O8 31LLka
        Nxn2D99uG S5W4e4g L LbfD UqM33wdk hqWn Q7MB Wx1A kH 41uiT0mEe 8YVywe 4U o9Qo aX Y
        gGzaHV Asi7H9hKk3 DTbmxBCBTM fb ux1l0 d9 Vsz2 elnYD tKFHatHP wkmhiePjE rZZDVZ 4lHqN
        zPQwF8V1 o8ARpA3 92jNughr0 k zW04z Rt7LYIEC R3M6UN BT6yVbXqf W02vpFSpV ERDT
        rCHXpDTEa4 m8NGa bDpveIEZ MfRz T2gk x7m x6ZQXIco XiF 2UWTz zRG8B 5mckPeV MS1rxoj
        rFNQwARYHA 47SidSmf tEV MxXR488Dm qzXXey2T1P ppaaHx gFOdB6LZ aD38EWZ
        """.split()
    ),
)

# The request consists of this single user message.
messages = [user_message]

total_tokens = count_chat_input_tokens(messages)
print(f"Total input tokens (including overhead): {total_tokens}")


Total input tokens (including overhead): 466


ModuleNotFoundError: No module named 'pytesseract'