Remap <|eot_id|> → <|end_of_text|>
This fixes Meta Llama 3 70B Instruct; however, it doesn't seem to help
with Llama 3 8B Instruct. Llama 3 Instruct models end each turn with
<|eot_id|> rather than the end-of-text token, so without this remap
generation may not stop at the end of a response.
jart committed Apr 19, 2024
1 parent ff9decc commit da4d780
Showing 3 changed files with 20 additions and 1 deletion.
llama.cpp/llama.cpp (9 additions, 0 deletions)
@@ -14593,3 +14593,12 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
     fputs(text, stderr);
     fflush(stderr);
 }
+
+llama_token llama_string_to_token(const struct llama_model *model, const char *str) {
+    auto i = model->vocab.token_to_id.find(str);
+    if (i != model->vocab.token_to_id.end()) {
+        return i->second;
+    } else {
+        return -1;
+    }
+}
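
As a usage sketch (not part of this commit), a caller holding a loaded
llama_model could use the new API to test whether the vocabulary defines
llama3's end-of-turn token and remap it by hand; the sampled variable
below is hypothetical, standing for a freshly sampled token id:

    // assumes: const struct llama_model *model and llama_token sampled
    // are already in scope
    llama_token eot = llama_string_to_token(model, "<|eot_id|>");
    if (eot != -1 && sampled == eot) {
        // vocab defines <|eot_id|> and we just sampled it; treat it as
        // end of sequence, mirroring the remap in sampling.cpp below
        sampled = llama_token_eos(model);
    }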
llama.cpp/llama.h (2 additions, 0 deletions)
@@ -979,6 +979,8 @@ extern "C" {
 
     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
 
+    LLAMA_API llama_token llama_string_to_token(const struct llama_model *, const char *);
+
 #ifdef __cplusplus
 }
 #endif
llama.cpp/sampling.cpp (9 additions, 1 deletion)
@@ -382,7 +382,15 @@ llama_token llama_sampling_sample(
         struct llama_context * ctx_cfg,
         const int idx) {
     // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
+    llama_token tok = llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
+
+    // [jart] llama3 <|eot_id|> → <|end_of_text|>
+    // https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2064950897
+    if (tok == llama_string_to_token(llama_get_model(ctx_main), "<|eot_id|>")) {
+        tok = llama_token_eos(llama_get_model(ctx_main));
+    }
+
+    return tok;
 }
 
 llama_token_data_array llama_sampling_probability_distribution(
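
Remapping at the sampling layer, rather than patching the GGUF
vocabulary, keeps model files untouched: when a vocabulary defines
<|eot_id|>, the sampled token is rewritten to the model's end-of-text
token so existing end-of-sequence stop logic fires, and when it
doesn't, llama_string_to_token returns -1, which can never equal a
sampled token id, so other models are unaffected.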
