# Setup

In [25]:
%%capture
%pip install datasets
%pip install transformers
%pip install tokenizers
%pip install tqdm
%pip install pandas
%pip install ipywidgets
%pip install torch
%pip install tiktoken

## Imports and Constants

In [26]:
import os
import tempfile
import subprocess
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer

REPO_URL = "https://github.com/sybaik1/CodeComplex-Data.git"
REPO_DIR = "CodeComplex-Data"

# Models I would run if I had $3000
# MODELS = {
#     "zai-org/GLM-5",
#     "moonshotai/Kimi-K2.5",
#     "MiniMaxAI/MiniMax-M2.5",
#     "stepfun-ai/Step-3.5-Flash",
#     "zai-org/GLM-4.7",
#     "XiaomiMiMo/MiMo-V2-Flash",
#     "deepseek-ai/DeepSeek-V3.2-Speciale",
# }

MODELS = {
    "Qwen/Qwen3-8B", # 8B params
    "Qwen/Qwen2.5-Coder-7B-Instruct", # 7B params
    "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", # 8B params
    "deepseek-ai/deepseek-coder-6.7b-instruct", # 6.7B params
    "deepseek-ai/deepseek-coder-1.3b-instruct", # 1.3B params
    "bigcode/starcoder2-7b", # 7B params
    "meta-llama/Llama-3.1-8B-Instruct", # 8B params
    "meta-llama/Llama-3.2-3B-Instruct", # 3B params
    "openai/gpt-oss-20b", # 20B params
    # "meta-llama/CodeLlama-7b-Instruct-hf", # 7B params
}

SYSTEM_PROMPT = """You are a Software Engineer that is meant to analyze the time complexity of a given code snippet. You will be given a code snippet in Java or Python, and you will analyze the time complexity of the snippet. You're output must be in the form of a JSON with the following format:
{{
    "time_complexity": "[Your time complexity class guess here]",
    "explanation": "[Brief 20 word explanation of your guess]"
}}
Your time complexity class guess must be one of the following: "constant", "logn", "linear", "nlogn", "quadratic", "cubic", or "exponential". Additionally, your explanation must be brief and concise, and should NOT EXCEED 20 WORDS.
DO NOT INCLUDE ANYTHING OTHER THAN THE JSON IN YOUR RESPONSE.
"""

# Importing dataset (CodeComplex)
GitHub Repo: https://github.com/sybaik1/CodeComplex-Data.git

In [27]:
JSONL_FILES = {
    "java": os.path.join(REPO_DIR, "java_data.jsonl"),
    "python": os.path.join(REPO_DIR, "python_data.jsonl"),
}

# Sanity Check
missing = [k for k, p in JSONL_FILES.items() if not os.path.exists(p)]
if missing:
    raise FileNotFoundError(f"Missing JSONL files for: {', '.join(missing)}. Please ensure the repository is cloned and files are in place.")

dataset_dict = load_dataset(
    "json",
    data_files=JSONL_FILES,
)

total_examples = sum(len(ds) for ds in dataset_dict.values())
print(f"Loaded splits: {list(dataset_dict.keys())} with total examples: {total_examples}")

Loaded splits: ['java', 'python'] with total examples: 9800


# Tokenize and Count

In [28]:
def get_tokenizer(model_name):
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, trust_remote_code=True)
        if tokenizer.pad_token is None and tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
        return tokenizer
    except Exception as e:
        print(f"Unable to load tokenizer for {model_name}, falling back to GPT2 tokenizer.\nError: {e}")
        tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True, trust_remote_code=True)
        if tokenizer.pad_token is None and tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
        return tokenizer

def extract_code_snippet(example):
    val = example.get("src")
    if isinstance(val, str) and val.strip():
        return val
    return ""

In [29]:
results = []

for model_name in MODELS:
    print(f"\n=== Processing model: {model_name} ===")
    tokenizer = get_tokenizer(model_name)

    token_counts = []
    skipped = 0

    for ds in dataset_dict.values():
        for example in tqdm(ds, desc=f"Tokenizing for {model_name}", total=len(ds)):
            code_snippet = extract_code_snippet(example)
            if not code_snippet:
                skipped += 1
                continue

            prompt = SYSTEM_PROMPT + "\n\n" + code_snippet
            input_ids = tokenizer.encode(prompt, add_special_tokens=False)
            token_counts.append(len(input_ids))

    if not token_counts:
        print(f"No valid code snippets found for {model_name}. Skipping.")
        continue

    total = int(sum(token_counts))
    avg = float(total / len(token_counts)) if token_counts else 0

    results.append({
        "model": model_name,
        "total_tokens": total,
        "avg_tokens": avg,
        "skipped": skipped,
        "counted": len(token_counts)
    })


=== Processing model: meta-llama/Llama-3.1-8B-Instruct ===
Unable to load tokenizer for meta-llama/Llama-3.1-8B-Instruct, falling back to GPT2 tokenizer.
Error: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct.
401 Client Error. (Request ID: Root=1-69931f0d-35accf8e7df4ecf67fae66ff;868269f0-052e-4dfa-9f81-0072bf32134d)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in.


Tokenizing for meta-llama/Llama-3.1-8B-Instruct:   0%|          | 0/4900 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1383 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing for meta-llama/Llama-3.1-8B-Instruct: 100%|██████████| 4900/4900 [00:05<00:00, 912.25it/s] 
Tokenizing for meta-llama/Llama-3.1-8B-Instruct: 100%|██████████| 4900/4900 [00:02<00:00, 1904.11it/s]



=== Processing model: Qwen/Qwen2.5-Coder-7B-Instruct ===


Tokenizing for Qwen/Qwen2.5-Coder-7B-Instruct:  84%|████████▎ | 4100/4900 [00:03<00:00, 819.77it/s] Token indices sequence length is longer than the specified maximum sequence length for this model (48562 > 32768). Running this sequence through the model will result in indexing errors
Tokenizing for Qwen/Qwen2.5-Coder-7B-Instruct: 100%|██████████| 4900/4900 [00:05<00:00, 979.42it/s]
Tokenizing for Qwen/Qwen2.5-Coder-7B-Instruct: 100%|██████████| 4900/4900 [00:02<00:00, 1910.89it/s]



=== Processing model: openai/gpt-oss-20b ===


Tokenizing for openai/gpt-oss-20b: 100%|██████████| 4900/4900 [00:04<00:00, 1021.34it/s]
Tokenizing for openai/gpt-oss-20b: 100%|██████████| 4900/4900 [00:02<00:00, 2069.25it/s]



=== Processing model: meta-llama/Llama-3.2-3B-Instruct ===
Unable to load tokenizer for meta-llama/Llama-3.2-3B-Instruct, falling back to GPT2 tokenizer.
Error: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct.
401 Client Error. (Request ID: Root=1-69931f27-3d66a6023e3fc4ed714f6f01;b0dc375f-2d4d-4ce9-9ed6-48f173801a64)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in.


Tokenizing for meta-llama/Llama-3.2-3B-Instruct:   0%|          | 0/4900 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1383 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing for meta-llama/Llama-3.2-3B-Instruct: 100%|██████████| 4900/4900 [00:05<00:00, 895.77it/s] 
Tokenizing for meta-llama/Llama-3.2-3B-Instruct: 100%|██████████| 4900/4900 [00:02<00:00, 1873.25it/s]



=== Processing model: deepseek-ai/deepseek-coder-6.7b-instruct ===


Tokenizing for deepseek-ai/deepseek-coder-6.7b-instruct:  62%|██████▏   | 3037/4900 [00:04<00:03, 479.43it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (20069 > 16384). Running this sequence through the model will result in indexing errors
Tokenizing for deepseek-ai/deepseek-coder-6.7b-instruct: 100%|██████████| 4900/4900 [00:09<00:00, 520.96it/s]
Tokenizing for deepseek-ai/deepseek-coder-6.7b-instruct: 100%|██████████| 4900/4900 [00:04<00:00, 1131.21it/s]



=== Processing model: deepseek-ai/DeepSeek-R1-0528-Qwen3-8B ===


Unrecognized keys in `rope_parameters` for 'rope_type'='yarn': {'attn_factor'}
Tokenizing for deepseek-ai/DeepSeek-R1-0528-Qwen3-8B: 100%|██████████| 4900/4900 [00:05<00:00, 937.00it/s] 
Tokenizing for deepseek-ai/DeepSeek-R1-0528-Qwen3-8B: 100%|██████████| 4900/4900 [00:02<00:00, 1835.11it/s]



=== Processing model: Qwen/Qwen3-8B ===


Tokenizing for Qwen/Qwen3-8B: 100%|██████████| 4900/4900 [00:05<00:00, 940.46it/s] 
Tokenizing for Qwen/Qwen3-8B: 100%|██████████| 4900/4900 [00:02<00:00, 1848.18it/s]



=== Processing model: bigcode/starcoder2-7b ===


Tokenizing for bigcode/starcoder2-7b: 100%|██████████| 4900/4900 [00:05<00:00, 939.68it/s] 
Tokenizing for bigcode/starcoder2-7b: 100%|██████████| 4900/4900 [00:02<00:00, 1895.79it/s]



=== Processing model: deepseek-ai/deepseek-coder-1.3b-instruct ===


Tokenizing for deepseek-ai/deepseek-coder-1.3b-instruct:  62%|██████▏   | 3037/4900 [00:04<00:03, 495.85it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (20069 > 16384). Running this sequence through the model will result in indexing errors
Tokenizing for deepseek-ai/deepseek-coder-1.3b-instruct: 100%|██████████| 4900/4900 [00:09<00:00, 531.56it/s]
Tokenizing for deepseek-ai/deepseek-coder-1.3b-instruct: 100%|██████████| 4900/4900 [00:04<00:00, 1165.01it/s]


In [30]:
print("\n=== Final Results ===")
for result in results:
    print(f"Model: {result['model']}, Total Tokens: {result['total_tokens']}, Average Tokens: {result['avg_tokens']}, Skipped: {result['skipped']}, Counted: {result['counted']}")


=== Final Results ===
Model: meta-llama/Llama-3.1-8B-Instruct, Total Tokens: 11772544, Average Tokens: 1201.28, Skipped: 0, Counted: 9800
Model: Qwen/Qwen2.5-Coder-7B-Instruct, Total Tokens: 6840326, Average Tokens: 697.9924489795918, Skipped: 0, Counted: 9800
Model: openai/gpt-oss-20b, Total Tokens: 6869918, Average Tokens: 701.0120408163265, Skipped: 0, Counted: 9800
Model: meta-llama/Llama-3.2-3B-Instruct, Total Tokens: 11772544, Average Tokens: 1201.28, Skipped: 0, Counted: 9800
Model: deepseek-ai/deepseek-coder-6.7b-instruct, Total Tokens: 8760083, Average Tokens: 893.8860204081633, Skipped: 0, Counted: 9800
Model: deepseek-ai/DeepSeek-R1-0528-Qwen3-8B, Total Tokens: 6840326, Average Tokens: 697.9924489795918, Skipped: 0, Counted: 9800
Model: Qwen/Qwen3-8B, Total Tokens: 6840326, Average Tokens: 697.9924489795918, Skipped: 0, Counted: 9800
Model: bigcode/starcoder2-7b, Total Tokens: 7667566, Average Tokens: 782.404693877551, Skipped: 0, Counted: 9800
Model: deepseek-ai/deepseek-c

In [31]:
output_df = pd.DataFrame(results)
output_df.to_csv("token_counts.csv", index=False)
output_df.head(7)

Unnamed: 0,model,total_tokens,avg_tokens,skipped,counted
0,meta-llama/Llama-3.1-8B-Instruct,11772544,1201.28,0,9800
1,Qwen/Qwen2.5-Coder-7B-Instruct,6840326,697.992449,0,9800
2,openai/gpt-oss-20b,6869918,701.012041,0,9800
3,meta-llama/Llama-3.2-3B-Instruct,11772544,1201.28,0,9800
4,deepseek-ai/deepseek-coder-6.7b-instruct,8760083,893.88602,0,9800
5,deepseek-ai/DeepSeek-R1-0528-Qwen3-8B,6840326,697.992449,0,9800
6,Qwen/Qwen3-8B,6840326,697.992449,0,9800
