Commit 133b05e
Sync with llama.cpp upstream
jart committed Jan 10, 2024
1 parent a193fd5 commit 133b05e
Showing 31 changed files with 1,497 additions and 375 deletions.
build/config.mk: 2 changes (1 addition, 1 deletion)
@@ -14,8 +14,8 @@ INSTALL = install

ARFLAGS = rcsD
CCFLAGS = -g -O3 -fexceptions
CPPFLAGS_ = -iquote. -mcosmo
TARGET_ARCH = -Xx86_64-mssse3
CPPFLAGS_ = -iquote. -mcosmo -DGGML_MULTIPLATFORM

TMPDIR = o//tmp
IGNORE := $(shell mkdir -p $(TMPDIR))
llama.cpp/BUILD.mk: 2 changes (2 additions, 0 deletions)
@@ -24,6 +24,8 @@ include llama.cpp/main/BUILD.mk
include llama.cpp/quantize/BUILD.mk
include llama.cpp/perplexity/BUILD.mk

$(LLAMA_CPP_OBJS): private CCFLAGS += -DGGML_MULTIPLATFORM

.PHONY: o/$(MODE)/llama.cpp
o/$(MODE)/llama.cpp: \
o/$(MODE)/llama.cpp/main \
llama.cpp/README.llamafile: 4 changes (2 additions, 2 deletions)
@@ -9,8 +9,8 @@ LICENSE
ORIGIN

https://github.com/ggerganov/llama.cpp/pull/4406/
b3a7c20b5c035250257d2b62851c379b159c899a
2024-01-04
4f56458d34cb13dcbf69aca650e9bf77d5497e6f
2024-01-10

LOCAL MODIFICATIONS

llama.cpp/common.cpp: 29 changes (26 additions, 3 deletions)
@@ -235,6 +235,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_ctx = std::stoi(argv[i]);
} else if (arg == "--grp-attn-n" || arg == "-gan") {
if (++i >= argc) {
invalid_param = true;
break;
}

params.grp_attn_n = std::stoi(argv[i]);
} else if (arg == "--grp-attn-w" || arg == "-gaw") {
if (++i >= argc) {
invalid_param = true;
break;
}

params.grp_attn_w = std::stoi(argv[i]);
} else if (arg == "--rope-freq-base") {
if (++i >= argc) {
invalid_param = true;
@@ -662,7 +676,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
std::stringstream ss(argv[i]);
llama_token key;
char sign = 0;
char sign;
std::string value_str;
try {
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
@@ -817,6 +831,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n");
printf(" -i, --interactive run in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
@@ -908,16 +923,24 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --numa attempt optimizations that help on some NUMA systems\n");
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
printf(" -ngld N, --n-gpu-layers-draft N\n");
printf(" number of layers to store in VRAM for the draft model\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
#ifdef GGML_USE_CUBLAS
printf(" -nommq, --no-mul-mat-q\n");
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS
#endif
printf(" -gan N, --grp-attn-n N\n");
printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
printf(" -gaw N, --grp-attn-w N\n");
printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
printf(" --verbose-prompt print prompt before generation\n");
printf(" -dkvc, --dump-kv-cache\n");
printf(" verbose print of the KV cache\n");
@@ -934,7 +957,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
printf(" draft model for speculative decoding\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
@@ -1327,7 +1350,7 @@ std::string get_sortable_timestamp() {

const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
current_time.time_since_epoch() % 1000000000).count();
char timestamp_ns[21];
char timestamp_ns[11];
snprintf(timestamp_ns, 11, "%09" PRId64, ns);

return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
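
A note on the get_sortable_timestamp() hunk above: the nanosecond component is reduced modulo 1000000000 before it is formatted, so "%09" PRId64 always emits exactly nine digits. The standalone sketch below (illustrative only, not part of the commit) shows why an 11-byte buffer holds the formatted digits plus the terminating NUL.

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // Any epoch value reduced modulo 1e9 lies in [0, 999999999].
        const int64_t ns = INT64_C(1704931200123456789) % 1000000000;
        char timestamp_ns[11];  // nine digits plus NUL fit with one byte to spare
        int n = std::snprintf(timestamp_ns, sizeof(timestamp_ns), "%09" PRId64, ns);
        std::printf("%d chars -> %s\n", n, timestamp_ns);  // 9 chars -> 123456789
        return 0;
    }
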
llama.cpp/common.h: 3 changes (3 additions, 0 deletions)
@@ -74,6 +74,8 @@ struct gpt_params {
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
@@ -254,3 +256,4 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

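The two new gpt_params fields above hold the group-attention ("self-extend") settings fed by the -gan/-gaw flags added in common.cpp. Below is a minimal sketch of how a caller might sanity-check them; the struct and helper are hypothetical, and the requirement that the width be a multiple of the factor mirrors the check used by the example programs rather than anything introduced by this commit.

    #include <cstdint>
    #include <stdexcept>

    struct group_attn_params {
        int32_t grp_attn_n = 1;    // group-attention factor; 1 leaves the feature disabled
        int32_t grp_attn_w = 512;  // group-attention width
    };

    // Hypothetical validation helper, not present in the commit.
    void validate_group_attn(const group_attn_params & p) {
        if (p.grp_attn_n < 1) {
            throw std::invalid_argument("grp_attn_n must be positive");
        }
        if (p.grp_attn_n > 1 && p.grp_attn_w % p.grp_attn_n != 0) {
            throw std::invalid_argument("grp_attn_w should be a multiple of grp_attn_n");
        }
    }
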
llama.cpp/ggml-backend-impl.h: 2 changes (1 addition, 1 deletion)
@@ -4,7 +4,6 @@

// ggml-backend internal header

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
@@ -17,6 +16,7 @@ extern "C" {
// buffer type
typedef void * ggml_backend_buffer_type_context_t;

struct ggml_backend_buffer_type_i {
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
llama.cpp/ggml-backend.c: 9 changes (6 additions, 3 deletions)
@@ -201,11 +201,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
ggml_backend_synchronize(backend);
}

void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
backend->iface.graph_compute(backend, cgraph);
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
if (!backend->iface.graph_compute(backend, cgraph)) {
return false;
}

// TODO: optional sync
ggml_backend_synchronize(backend);
return true;
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -283,7 +286,7 @@ GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void *
GGML_UNUSED(user_data);
}

static void ggml_backend_registry_init(void) {
GGML_CALL static void ggml_backend_registry_init(void) {
static bool initialized = false;

if (initialized) {
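
Because ggml_backend_graph_compute() now returns a bool (see the ggml-backend.c hunk above and the matching ggml-backend.h change below), callers can propagate failure instead of assuming success. A hypothetical caller sketch, assuming `backend` and `graph` were created and allocated elsewhere:

    #include <cstdio>
    #include "ggml-backend.h"

    bool run_graph(ggml_backend_t backend, struct ggml_cgraph * graph) {
        if (!ggml_backend_graph_compute(backend, graph)) {
            std::fprintf(stderr, "graph compute failed on this backend\n");
            return false;
        }
        // On success the implementation above has already synchronized the backend.
        return true;
    }
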
llama.cpp/ggml-backend.h: 4 changes (2 additions, 2 deletions)
@@ -13,7 +13,6 @@ extern "C" {
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
struct ggml_backend_api;

//
// Backend buffer
@@ -61,7 +60,7 @@ extern "C" {

GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);

// tensor copy between different backends
@@ -188,6 +187,7 @@ extern "C" {
//
// dynamic shared object api
//
struct ggml_backend_api;
const struct ggml_backend_api *ggml_backend_api(void);

#ifdef __cplusplus
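
The "dynamic shared object api" block now keeps the forward declaration of struct ggml_backend_api next to its accessor. The sketch below is a hypothetical host-side loader for such a module; the use of plain dlopen()/dlsym() and the assumption that the exported symbol is named "ggml_backend_api" are illustrative guesses, not a description of llamafile's actual loading code.

    #include <dlfcn.h>
    #include <cstdio>
    #include "ggml-backend.h"

    // Hypothetical loader: resolve the module's ggml_backend_api() accessor
    // and return the table it exposes, or nullptr on any failure.
    const struct ggml_backend_api * load_backend_api(const char * so_path) {
        void * handle = dlopen(so_path, RTLD_NOW);
        if (!handle) {
            std::fprintf(stderr, "dlopen(%s) failed: %s\n", so_path, dlerror());
            return nullptr;
        }
        typedef const struct ggml_backend_api * (*api_fn)(void);
        api_fn get_api = reinterpret_cast<api_fn>(dlsym(handle, "ggml_backend_api"));
        if (!get_api) {
            std::fprintf(stderr, "symbol ggml_backend_api not found\n");
            return nullptr;
        }
        return get_api();
    }
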
