Commit 133b05e
Sync with llama.cpp upstream
jart committed Jan 10, 2024
1 parent a193fd5 commit 133b05e
Showing 31 changed files with 1,497 additions and 375 deletions.
build/config.mk: 2 changes (1 addition, 1 deletion)
@@ -14,8 +14,8 @@ INSTALL = install

ARFLAGS = rcsD
CCFLAGS = -g -O3 -fexceptions
CPPFLAGS_ = -iquote. -mcosmo
TARGET_ARCH = -Xx86_64-mssse3
CPPFLAGS_ = -iquote. -mcosmo -DGGML_MULTIPLATFORM

TMPDIR = o//tmp
IGNORE := $(shell mkdir -p $(TMPDIR))
llama.cpp/BUILD.mk: 2 changes (2 additions, 0 deletions)
@@ -24,6 +24,8 @@ include llama.cpp/main/BUILD.mk
include llama.cpp/quantize/BUILD.mk
include llama.cpp/perplexity/BUILD.mk

$(LLAMA_CPP_OBJS): private CCFLAGS += -DGGML_MULTIPLATFORM

.PHONY: o/$(MODE)/llama.cpp
o/$(MODE)/llama.cpp: \
o/$(MODE)/llama.cpp/main \
llama.cpp/README.llamafile: 4 changes (2 additions, 2 deletions)
@@ -9,8 +9,8 @@ LICENSE
ORIGIN

https://github.com/ggerganov/llama.cpp/pull/4406/
b3a7c20b5c035250257d2b62851c379b159c899a
2024-01-04
4f56458d34cb13dcbf69aca650e9bf77d5497e6f
2024-01-10

LOCAL MODIFICATIONS

llama.cpp/common.cpp: 29 changes (26 additions, 3 deletions)
@@ -235,6 +235,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.n_ctx = std::stoi(argv[i]);
} else if (arg == "--grp-attn-n" || arg == "-gan") {
if (++i >= argc) {
invalid_param = true;
break;
}

params.grp_attn_n = std::stoi(argv[i]);
} else if (arg == "--grp-attn-w" || arg == "-gaw") {
if (++i >= argc) {
invalid_param = true;
break;
}

params.grp_attn_w = std::stoi(argv[i]);
} else if (arg == "--rope-freq-base") {
if (++i >= argc) {
invalid_param = true;
@@ -662,7 +676,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
std::stringstream ss(argv[i]);
llama_token key;
char sign = 0;
char sign;
std::string value_str;
try {
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
@@ -817,6 +831,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n");
printf(" -i, --interactive run in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
@@ -908,16 +923,24 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --numa attempt optimizations that help on some NUMA systems\n");
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
printf(" -ngld N, --n-gpu-layers-draft N\n");
printf(" number of layers to store in VRAM for the draft model\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
#ifdef GGML_USE_CUBLAS
printf(" -nommq, --no-mul-mat-q\n");
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS
#endif
printf(" -gan N, --grp-attn-n N\n");
printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
printf(" -gaw N, --grp-attn-w N\n");
printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
printf(" --verbose-prompt print prompt before generation\n");
printf(" -dkvc, --dump-kv-cache\n");
printf(" verbose print of the KV cache\n");
@@ -934,7 +957,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
printf(" draft model for speculative decoding\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
@@ -1327,7 +1350,7 @@ std::string get_sortable_timestamp() {

const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
current_time.time_since_epoch() % 1000000000).count();
char timestamp_ns[21];
char timestamp_ns[11];
snprintf(timestamp_ns, 11, "%09" PRId64, ns);

return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
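
A note on the get_sortable_timestamp() hunk above: the nanosecond component is reduced modulo 1000000000 before it is formatted, so "%09" PRId64 always emits exactly nine digits. The standalone sketch below (illustrative only, not part of the commit) shows why an 11-byte buffer holds the formatted digits plus the terminating NUL.

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // Any epoch value reduced modulo 1e9 lies in [0, 999999999].
        const int64_t ns = INT64_C(1704931200123456789) % 1000000000;
        char timestamp_ns[11];  // nine digits plus NUL fit with one byte to spare
        int n = std::snprintf(timestamp_ns, sizeof(timestamp_ns), "%09" PRId64, ns);
        std::printf("%d chars -> %s\n", n, timestamp_ns);  // 9 chars -> 123456789
        return 0;
    }
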
llama.cpp/common.h: 3 changes (3 additions, 0 deletions)
@@ -74,6 +74,8 @@ struct gpt_params {
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
@@ -254,3 +256,4 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

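The two new gpt_params fields above hold the group-attention ("self-extend") settings fed by the -gan/-gaw flags added in common.cpp. Below is a minimal sketch of how a caller might sanity-check them; the struct and helper are hypothetical, and the requirement that the width be a multiple of the factor mirrors the check used by the example programs rather than anything introduced by this commit.

    #include <cstdint>
    #include <stdexcept>

    struct group_attn_params {
        int32_t grp_attn_n = 1;    // group-attention factor; 1 leaves the feature disabled
        int32_t grp_attn_w = 512;  // group-attention width
    };

    // Hypothetical validation helper, not present in the commit.
    void validate_group_attn(const group_attn_params & p) {
        if (p.grp_attn_n < 1) {
            throw std::invalid_argument("grp_attn_n must be positive");
        }
        if (p.grp_attn_n > 1 && p.grp_attn_w % p.grp_attn_n != 0) {
            throw std::invalid_argument("grp_attn_w should be a multiple of grp_attn_n");
        }
    }
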
llama.cpp/ggml-backend-impl.h: 2 changes (1 addition, 1 deletion)
@@ -4,7 +4,6 @@

// ggml-backend internal header

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
@@ -17,6 +16,7 @@ extern "C" {
// buffer type
typedef void * ggml_backend_buffer_type_context_t;

struct ggml_backend_buffer_type_i {
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
llama.cpp/ggml-backend.c: 9 changes (6 additions, 3 deletions)
@@ -201,11 +201,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
ggml_backend_synchronize(backend);
}

void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
backend->iface.graph_compute(backend, cgraph);
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
if (!backend->iface.graph_compute(backend, cgraph)) {
return false;
}

// TODO: optional sync
ggml_backend_synchronize(backend);
return true;
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -283,7 +286,7 @@ GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void *
GGML_UNUSED(user_data);
}

static void ggml_backend_registry_init(void) {
GGML_CALL static void ggml_backend_registry_init(void) {
static bool initialized = false;

if (initialized) {
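
Because ggml_backend_graph_compute() now returns a bool (see the ggml-backend.c hunk above and the matching ggml-backend.h change below), callers can propagate failure instead of assuming success. A hypothetical caller sketch, assuming `backend` and `graph` were created and allocated elsewhere:

    #include <cstdio>
    #include "ggml-backend.h"

    bool run_graph(ggml_backend_t backend, struct ggml_cgraph * graph) {
        if (!ggml_backend_graph_compute(backend, graph)) {
            std::fprintf(stderr, "graph compute failed on this backend\n");
            return false;
        }
        // On success the implementation above has already synchronized the backend.
        return true;
    }
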
llama.cpp/ggml-backend.h: 4 changes (2 additions, 2 deletions)
@@ -13,7 +13,6 @@ extern "C" {
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
struct ggml_backend_api;

//
// Backend buffer
@@ -61,7 +60,7 @@ extern "C" {

GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);

// tensor copy between different backends
@@ -188,6 +187,7 @@ extern "C" {
//
// dynamic shared object api
//
struct ggml_backend_api;
const struct ggml_backend_api *ggml_backend_api(void);

#ifdef __cplusplus
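
The "dynamic shared object api" block now keeps the forward declaration of struct ggml_backend_api next to its accessor. The sketch below is a hypothetical host-side loader for such a module; the use of plain dlopen()/dlsym() and the assumption that the exported symbol is named "ggml_backend_api" are illustrative guesses, not a description of llamafile's actual loading code.

    #include <dlfcn.h>
    #include <cstdio>
    #include "ggml-backend.h"

    // Hypothetical loader: resolve the module's ggml_backend_api() accessor
    // and return the table it exposes, or nullptr on any failure.
    const struct ggml_backend_api * load_backend_api(const char * so_path) {
        void * handle = dlopen(so_path, RTLD_NOW);
        if (!handle) {
            std::fprintf(stderr, "dlopen(%s) failed: %s\n", so_path, dlerror());
            return nullptr;
        }
        typedef const struct ggml_backend_api * (*api_fn)(void);
        api_fn get_api = reinterpret_cast<api_fn>(dlsym(handle, "ggml_backend_api"));
        if (!get_api) {
            std::fprintf(stderr, "symbol ggml_backend_api not found\n");
            return nullptr;
        }
        return get_api();
    }
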
