Skip to content

Commit

Permalink
llama : fix pre-tokenizers
Browse files Browse the repository at this point in the history
  • Loading branch information
ggerganov committed May 8, 2024
1 parent e59b546 commit b7ede48
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 42 deletions.
9 changes: 5 additions & 4 deletions convert-hf-to-gguf-update.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ class TOKENIZER_TYPE(IntEnum):

# TODO: add models here, base models preferred
models = [
#{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
#{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
Expand All @@ -70,8 +70,9 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-embeddings-v2-base-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-embeddings-v2-base-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "jina-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
]

# make directory "models/tokenizers" if it doesn't exist
Expand Down
11 changes: 8 additions & 3 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,12 +322,17 @@ def get_vocab_base_pre(self, tokenizer) -> str:
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
res = "olmo"
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
# ref: https://huggingface.co/databricks/dbrx-instruct
# ref: https://huggingface.co/databricks/dbrx-base
res = "dbrx"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
res = "jina-en"
if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
res = "default"
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
res = "jina-es"
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
res = "default"
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-de"

if res is None:
logger.warning("\n")
Expand Down
78 changes: 43 additions & 35 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,40 +229,40 @@ enum llm_arch {
};

static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
{ LLM_ARCH_GPTJ, "gptj" },
{ LLM_ARCH_GPTNEOX, "gptneox" },
{ LLM_ARCH_MPT, "mpt" },
{ LLM_ARCH_BAICHUAN, "baichuan" },
{ LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_PERSIMMON, "persimmon" },
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2"},
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
{ LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
{ LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
{ LLM_ARCH_GPTJ, "gptj" },
{ LLM_ARCH_GPTNEOX, "gptneox" },
{ LLM_ARCH_MPT, "mpt" },
{ LLM_ARCH_BAICHUAN, "baichuan" },
{ LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_PERSIMMON, "persimmon" },
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" },
{ LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

enum llm_kv {
Expand Down Expand Up @@ -3800,6 +3800,12 @@ static void llm_load_hparams(

// get hparams kv
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);

// everything past this point is not vocab-related
if (hparams.vocab_only) {
return;
}

ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
Expand Down Expand Up @@ -4417,7 +4423,9 @@ static void llm_load_vocab(
tokenizer_pre == "starcoder") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
} else if (
tokenizer_pre == "gpt-2") {
tokenizer_pre == "gpt-2" ||
tokenizer_pre == "jina-es" ||
tokenizer_pre == "jina-de") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
Expand Down

0 comments on commit b7ede48

Please sign in to comment.