## Notebook for fetching various info about common tokenizers found on Hugging Face

In [None]:
# setup your conda // venv , to be able to use necessary libraries
# make sure to login via huggingface-cli for accesing all repos

!pip install transformers

Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp311-cp311-manylinux_2_17_x86_64.many

In [None]:
from transformers import AutoTokenizer, AutoConfig
import pandas as pd

# Some of the most downloaded text-generation models on the Hugging Face Model Hub:
# https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads
model_names = [
    "Qwen/Qwen2.5-1.5B-Instruct",
    "meta-llama/Llama-3.1-405B",
    "openai-community/gpt2",
    "google/gemma-2-2B",
    "bigscience/bloomz-560m",
    "facebook/opt-125m",
    "distilbert/distilgpt2",
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.2",
    "microsoft/Phi-3-mini-4k-instruct",
]

# Column-oriented accumulator: one list per output column. The lists are kept
# index-aligned, i.e. entry i of every list describes model_names[i].
tokenizer_data = {
    "Model": [],
    "Tokenizer Type": [],
    "Vocabulary Size": [],
    "Special Tokens": [],
}

for model_name in model_names:
    try:
        # Only the tokenizer is inspected, so the model config is not loaded:
        # it was never read, and fetching it could fail independently and
        # wrongly mark a loadable tokenizer as "N/A".
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        tokenizer_type = type(tokenizer).__name__
        # NOTE(review): `vocab_size` is recorded exactly as the tokenizer reports
        # it; confirm against the transformers docs whether added tokens are
        # included before comparing these numbers across models.
        vocab_size = tokenizer.vocab_size
        special_tokens = tokenizer.special_tokens_map
    except Exception as e:
        # Best-effort survey: a model that cannot be loaded (e.g. a gated repo)
        # is reported and still gets a placeholder row instead of aborting
        # the whole loop.
        print(f"Error loading model {model_name}: {e}")
        tokenizer_type = vocab_size = special_tokens = "N/A"

    # Single append path for both outcomes keeps all four columns the same length.
    tokenizer_data["Model"].append(model_name)
    tokenizer_data["Tokenizer Type"].append(tokenizer_type)
    tokenizer_data["Vocabulary Size"].append(vocab_size)
    tokenizer_data["Special Tokens"].append(special_tokens)

# Tabulate the results, show them, and persist them next to the notebook.
df = pd.DataFrame(tokenizer_data)
print(df)
df.to_csv("huggingface_model_tokenizers.csv", index=False)


Error loading model meta-llama/Meta-Llama-3-8B-Instruct: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct.
403 Client Error. (Request ID: Root=1-6724adaa-78356bf0768c79012831acc4;6da7a15d-fa04-4945-9540-cd76013d6705)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B-Instruct is restricted and you are not in the authorized list. Visit https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct to ask for access.
                                 Model           Tokenizer Type  \
0           Qwen/Qwen2.5-1.5B-Instruct       Qwen2TokenizerFast   
1            meta-llama/Llama-3.1-405B  PreTrainedTokenizerFast   
2                openai-community/gpt2        GPT2TokenizerFast   
3                    google/gemma-2-2B       GemmaTokenizerFast   
4               bigscience/bloomz-560m       BloomTokeni