From 4616816aacba97ff3849e25f9f0fc814f21b9c7a Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Sat, 6 Jan 2024 06:32:07 -0800 Subject: [PATCH] Fix support for multiple GPUs This change uses our new `__ms_abi__` trick to remove the ugly workarounds that we needed earlier to get ggml-backend to work --- llama.cpp/ggml-backend-impl.h | 95 ++++--- llama.cpp/ggml-backend.c | 111 +++++--- llama.cpp/ggml-backend.h | 31 +-- llama.cpp/ggml-cuda.cu | 460 ++++------------------------------ llama.cpp/ggml-cuda.h | 2 +- llama.cpp/ggml-metal.h | 1 + llama.cpp/ggml-metal.m | 335 +++++-------------------- llama.cpp/ggml.c | 32 +-- llama.cpp/ggml.h | 38 +-- llama.cpp/llava/clip.cpp | 1 - llamafile/cuda.c | 18 +- llamafile/metal.c | 17 +- 12 files changed, 324 insertions(+), 817 deletions(-) diff --git a/llama.cpp/ggml-backend-impl.h b/llama.cpp/ggml-backend-impl.h index 319ededb11..8859be9a25 100644 --- a/llama.cpp/ggml-backend-impl.h +++ b/llama.cpp/ggml-backend-impl.h @@ -1,7 +1,10 @@ +// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi #pragma once // ggml-backend internal header +#include "ggml.h" #include "ggml-backend.h" #ifdef __cplusplus @@ -15,13 +18,13 @@ extern "C" { // buffer type typedef void * ggml_backend_buffer_type_context_t; struct ggml_backend_buffer_type_i { - ggml_backend_buffer_t (*GGML_BACKEND_ABI alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); - size_t (*GGML_BACKEND_ABI get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*GGML_BACKEND_ABI get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding - bool (*GGML_BACKEND_ABI supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + ggml_backend_buffer_t (*GGML_ABI alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); + size_t (*GGML_ABI get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment + size_t (*GGML_ABI get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + bool (*GGML_ABI supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend // check if tensor data is in host memory // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) - bool (*GGML_BACKEND_ABI is_host) (ggml_backend_buffer_type_t buft); + bool (*GGML_ABI is_host) (ggml_backend_buffer_type_t buft); }; struct ggml_backend_buffer_type { @@ -33,16 +36,16 @@ extern "C" { typedef void * ggml_backend_buffer_context_t; struct ggml_backend_buffer_i { - void (*GGML_BACKEND_ABI free_buffer) (ggml_backend_buffer_t buffer); - //void (*GGML_BACKEND_ABI reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras - void * (*GGML_BACKEND_ABI get_base) (ggml_backend_buffer_t buffer); - void (*GGML_BACKEND_ABI init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*GGML_BACKEND_ABI set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*GGML_BACKEND_ABI get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + void (*GGML_ABI free_buffer) (ggml_backend_buffer_t buffer); + //void (*GGML_ABI reset) 
(ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras + void * (*GGML_ABI get_base) (ggml_backend_buffer_t buffer); + void (*GGML_ABI init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*GGML_ABI set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*GGML_ABI get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // (optional) copy tensor between different buffer-type, allow for single-copy tranfers - void (*GGML_BACKEND_ABI cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*GGML_BACKEND_ABI cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*GGML_BACKEND_ABI clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*GGML_ABI cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*GGML_ABI cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*GGML_ABI clear) (ggml_backend_buffer_t buffer, uint8_t value); }; struct ggml_backend_buffer { @@ -56,7 +59,7 @@ extern "C" { ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, ggml_backend_buffer_context_t context, - size_t size); + size_t size) GGML_ABI; // @@ -66,33 +69,33 @@ extern "C" { typedef void * ggml_backend_context_t; struct ggml_backend_i { - const char * (*GGML_BACKEND_ABI get_name)(ggml_backend_t backend); + const char * (*GGML_ABI get_name)(ggml_backend_t backend); - void (*GGML_BACKEND_ABI free)(ggml_backend_t backend); + void (*GGML_ABI free)(ggml_backend_t backend); // buffer allocation - ggml_backend_buffer_type_t (*GGML_BACKEND_ABI get_default_buffer_type)(ggml_backend_t backend); + ggml_backend_buffer_type_t (*GGML_ABI get_default_buffer_type)(ggml_backend_t backend); // (optional) asynchroneous tensor data access - void (*GGML_BACKEND_ABI set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*GGML_BACKEND_ABI get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + void (*GGML_ABI set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*GGML_ABI get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // (optional) asynchroneous tensor copy - void (*GGML_BACKEND_ABI cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*GGML_BACKEND_ABI cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*GGML_ABI cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*GGML_ABI cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*GGML_BACKEND_ABI synchronize)(ggml_backend_t backend); + void (*GGML_ABI synchronize)(ggml_backend_t backend); // compute graph with a plan - ggml_backend_graph_plan_t (*GGML_BACKEND_ABI graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - void (*GGML_BACKEND_ABI graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - void 
(*GGML_BACKEND_ABI graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + ggml_backend_graph_plan_t (*GGML_ABI graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + void (*GGML_ABI graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + void (*GGML_ABI graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan - void (*GGML_BACKEND_ABI graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + void (*GGML_ABI graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation - bool (*GGML_BACKEND_ABI supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + bool (*GGML_ABI supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); }; struct ggml_backend { @@ -110,6 +113,42 @@ extern "C" { void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); + // + // GGML Backend API + // + // This struct includes all functions that a backend module needs + // the application to define. + // + + struct ggml_backend_api { + void (*free)(void *); + void *(*malloc)(size_t); + typeof(ggml_backend_buffer_init) *GGML_ABI ggml_backend_buffer_init; + typeof(ggml_backend_cpu_buffer_from_ptr) *GGML_ABI ggml_backend_cpu_buffer_from_ptr; + typeof(ggml_backend_cpu_buffer_type) *GGML_ABI ggml_backend_cpu_buffer_type; + typeof(ggml_backend_buft_get_alloc_size) *GGML_ABI ggml_backend_buft_get_alloc_size; + typeof(ggml_backend_buft_alloc_buffer) *GGML_ABI ggml_backend_buft_alloc_buffer; + typeof(ggml_backend_is_cpu) *GGML_ABI ggml_backend_is_cpu; + typeof(ggml_backend_tensor_get) *GGML_ABI ggml_backend_tensor_get; + typeof(ggml_backend_tensor_set) *GGML_ABI ggml_backend_tensor_set; + typeof(ggml_is_quantized) *GGML_ABI ggml_is_quantized; + typeof(ggml_type_size) *GGML_ABI ggml_type_size; + typeof(ggml_blck_size) *GGML_ABI ggml_blck_size; + typeof(ggml_is_transposed) *GGML_ABI ggml_is_transposed; + typeof(ggml_nbytes) *GGML_ABI ggml_nbytes; + typeof(ggml_get_unary_op) *GGML_ABI ggml_get_unary_op; + typeof(ggml_nelements) *GGML_ABI ggml_nelements; + typeof(ggml_nrows) *GGML_ABI ggml_nrows; + typeof(ggml_is_permuted) *GGML_ABI ggml_is_permuted; + typeof(ggml_is_contiguous) *GGML_ABI ggml_is_contiguous; + typeof(ggml_op_name) *GGML_ABI ggml_op_name; + typeof(ggml_type_name) *GGML_ABI ggml_type_name; + typeof(ggml_element_size) *GGML_ABI ggml_element_size; + typeof(ggml_row_size) *GGML_ABI ggml_row_size; + typeof(ggml_rope_yarn_corr_dims) *GGML_ABI ggml_rope_yarn_corr_dims; + typeof(ggml_op_desc) *GGML_ABI ggml_op_desc; + }; + #ifdef __cplusplus } #endif diff --git a/llama.cpp/ggml-backend.c b/llama.cpp/ggml-backend.c index 87b8f26359..d4b37e937a 100644 --- a/llama.cpp/ggml-backend.c +++ b/llama.cpp/ggml-backend.c @@ -3,7 +3,9 @@ #include "ggml-backend-impl.h" #include "ggml-alloc.h" +#include "llama.cpp/ggml-metal.h" #include "ggml-impl.h" +#include "libc/thread/tls.h" #include #include @@ -11,6 +13,7 @@ #include #include #include +#include #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) @@ -18,7 +21,7 @@ // backend buffer type -ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_ABI ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } @@ -26,7 +29,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) { return buft->iface.get_alignment(buft); } -size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { +GGML_ABI size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { // get_alloc_size is optional, defaults to ggml_nbytes if (buft->iface.get_alloc_size) { return buft->iface.get_alloc_size(buft, tensor); @@ -47,7 +50,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { // backend buffer -ggml_backend_buffer_t ggml_backend_buffer_init( +GGML_ABI ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, ggml_backend_buffer_context_t context, @@ -74,8 +77,7 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer->iface.free_buffer != NULL) { buffer->iface.free_buffer(buffer); } - // TODO(llama.cpp): delete this file - // free(buffer); + free(buffer); } size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { @@ -90,7 +92,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { return base; } -void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +GGML_ABI void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { // init_tensor is optional if (buffer->iface.init_tensor) { buffer->iface.init_tensor(buffer, tensor); @@ -160,7 +162,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten backend->iface.get_tensor_async(backend, tensor, data, offset, size); } -void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_ABI void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); @@ -168,7 +170,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size); } -void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_ABI void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); @@ -274,6 +276,13 @@ static size_t ggml_backend_registry_count = 0; static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data); +ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { + return ggml_backend_metal_init(); + + GGML_UNUSED(params); + GGML_UNUSED(user_data); +} + static void ggml_backend_registry_init(void) { static bool initialized = false; @@ -284,18 +293,11 @@ static void ggml_backend_registry_init(void) { initialized = true; 
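// NOTE: the registry below registers the CPU and Metal backends up front and
// then calls ggml_backend_cuda_reg_devices(), which is what adds one registry
// entry per visible GPU. That function's body falls outside the hunks shown
// here; a minimal sketch of what it does in upstream llama.cpp, using only
// symbols that appear elsewhere in this patch (ggml_cuda_get_device_count,
// ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type), might look like:

int ggml_backend_cuda_reg_devices(void) {
    int device_count = ggml_cuda_get_device_count();
    for (int i = 0; i < device_count; i++) {
        char name[128];
        // one registry entry per device, e.g. "CUDA0", "CUDA1", ...
        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
        ggml_backend_register(name, ggml_backend_reg_cuda_init,
                              ggml_backend_cuda_buffer_type(i),
                              (void *)(intptr_t)i);
    }
    return device_count;
}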
ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL); + ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); // add forward decls here to avoid including the backend headers - if (llamafile_gpu_supported() == LLAMAFILE_GPU_NVIDIA) { extern int ggml_backend_cuda_reg_devices(void); ggml_backend_cuda_reg_devices(); - } - - if (llamafile_gpu_supported() == LLAMAFILE_GPU_APPLE) { - extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); - extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); - ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); - } } void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { @@ -393,39 +395,39 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { // backend CPU -GGML_BACKEND_ABI static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_ABI static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *)buffer->context; } -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_ABI static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); } -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_ABI static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); GGML_UNUSED(buffer); } -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_ABI static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); GGML_UNUSED(buffer); } -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +GGML_ABI static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); GGML_UNUSED(buffer); } -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +GGML_ABI static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); GGML_UNUSED(buffer); } -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_ABI static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { memset(buffer->context, value, buffer->size); } @@ -454,7 +456,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 -GGML_BACKEND_ABI static ggml_backend_buffer_t 
ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_ABI static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? @@ -463,25 +465,25 @@ GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); } -GGML_BACKEND_ABI static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_ABI static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return TENSOR_ALIGNMENT; GGML_UNUSED(buft); } -GGML_BACKEND_ABI static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { +GGML_ABI static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_cpu(backend); GGML_UNUSED(buft); } -GGML_BACKEND_ABI static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +GGML_ABI static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; GGML_UNUSED(buft); } -ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { +GGML_ABI ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, @@ -545,20 +547,20 @@ struct ggml_backend_cpu_context { size_t work_size; }; -GGML_BACKEND_ABI static const char * ggml_backend_cpu_name(ggml_backend_t backend) { +GGML_ABI static const char * ggml_backend_cpu_name(ggml_backend_t backend) { return "CPU"; GGML_UNUSED(backend); } -GGML_BACKEND_ABI static void ggml_backend_cpu_free(ggml_backend_t backend) { +GGML_ABI static void ggml_backend_cpu_free(ggml_backend_t backend) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; free(cpu_ctx->work_data); free(cpu_ctx); free(backend); } -GGML_BACKEND_ABI static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { +GGML_ABI static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_cpu_buffer_type(); GGML_UNUSED(backend); @@ -569,7 +571,7 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; -GGML_BACKEND_ABI static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_ABI static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); @@ -584,7 +586,7 @@ GGML_BACKEND_ABI static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_cr return cpu_plan; } -GGML_BACKEND_ABI static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +GGML_ABI static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; free(cpu_plan->cplan.work_data); @@ -593,7 +595,7 @@ 
GGML_BACKEND_ABI static void ggml_backend_cpu_graph_plan_free(ggml_backend_t bac GGML_UNUSED(backend); } -GGML_BACKEND_ABI static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +GGML_ABI static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); @@ -601,7 +603,7 @@ GGML_BACKEND_ABI static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t GGML_UNUSED(backend); } -GGML_BACKEND_ABI static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_ABI static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); @@ -617,7 +619,7 @@ GGML_BACKEND_ABI static void ggml_backend_cpu_graph_compute(ggml_backend_t backe ggml_graph_compute(cgraph, &cplan); } -GGML_BACKEND_ABI static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +GGML_ABI static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_MUL_MAT: return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; @@ -660,7 +662,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { return cpu_backend; } -bool ggml_backend_is_cpu(ggml_backend_t backend) { +GGML_ABI bool ggml_backend_is_cpu(ggml_backend_t backend) { return backend->iface.get_name == ggml_backend_cpu_name; } @@ -671,7 +673,7 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { ctx->n_threads = n_threads; } -ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { +GGML_ABI ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); } @@ -1429,3 +1431,36 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t ggml_backend_graph_copy_free(copy); } + +static const struct ggml_backend_api kGgmlBackendApi = { + free, + malloc, + ggml_backend_buffer_init, + ggml_backend_cpu_buffer_from_ptr, + ggml_backend_cpu_buffer_type, + ggml_backend_buft_get_alloc_size, + ggml_backend_buft_alloc_buffer, + ggml_backend_is_cpu, + ggml_backend_tensor_get, + ggml_backend_tensor_set, + ggml_is_quantized, + ggml_type_size, + ggml_blck_size, + ggml_is_transposed, + ggml_nbytes, + ggml_get_unary_op, + ggml_nelements, + ggml_nrows, + ggml_is_permuted, + ggml_is_contiguous, + ggml_op_name, + ggml_type_name, + ggml_element_size, + ggml_row_size, + ggml_rope_yarn_corr_dims, + ggml_op_desc, +}; + +const struct ggml_backend_api *ggml_backend_api(void) { + return &kGgmlBackendApi; +} diff --git a/llama.cpp/ggml-backend.h b/llama.cpp/ggml-backend.h index 6ad670c7f2..f6afad623f 100644 --- a/llama.cpp/ggml-backend.h +++ b/llama.cpp/ggml-backend.h @@ -1,14 +1,10 @@ +// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi #pragma once #include "ggml.h" #include "ggml-alloc.h" -#if defined(_WIN32) -#define GGML_BACKEND_ABI -#else -#define GGML_BACKEND_ABI __attribute__((__ms_abi__)) -#endif - #ifdef __cplusplus 
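// NOTE: the hunk above deletes the per-header GGML_BACKEND_ABI macro; the
// cross-module entry points are now tagged with a single GGML_ABI macro that
// presumably comes from ggml.h (ggml.h is also touched by this patch).
// Assuming it mirrors the definition deleted above, it would read as follows:
// a no-op on Windows, where the native calling convention is already the
// Microsoft one, and __ms_abi__ everywhere else, so the host binary and a
// natively compiled GPU module agree on how arguments are passed across the
// boundary.

#if defined(_WIN32)
#define GGML_ABI
#else
#define GGML_ABI __attribute__((__ms_abi__))
#endif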
extern "C" { #endif @@ -17,15 +13,16 @@ extern "C" { typedef struct ggml_backend_buffer * ggml_backend_buffer_t; typedef struct ggml_backend * ggml_backend_t; typedef void * ggml_backend_graph_plan_t; + struct ggml_backend_api; // // Backend buffer // // buffer type - GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) GGML_ABI; GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) GGML_ABI; GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); @@ -33,7 +30,7 @@ extern "C" { GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) GGML_ABI; GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); @@ -55,8 +52,8 @@ extern "C" { GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) GGML_ABI; + GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) GGML_ABI; GGML_API void ggml_backend_synchronize(ggml_backend_t backend); @@ -77,13 +74,13 @@ extern "C" { GGML_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); + GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend) GGML_ABI; GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); // Create a backend buffer from an existing pointer - GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) GGML_ABI; - GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) GGML_ABI; #ifdef GGML_USE_CPU_HBM GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); @@ -179,7 +176,7 @@ extern "C" { GGML_API struct ggml_backend_graph_copy 
ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); - typedef bool (*GGML_BACKEND_ABI ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); + typedef bool (*GGML_ABI ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); @@ -188,6 +185,10 @@ extern "C" { GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + // + // dynamic shared object api + // + const struct ggml_backend_api *ggml_backend_api(void); #ifdef __cplusplus } diff --git a/llama.cpp/ggml-cuda.cu b/llama.cpp/ggml-cuda.cu index 86075d9d43..583c884945 100644 --- a/llama.cpp/ggml-cuda.cu +++ b/llama.cpp/ggml-cuda.cu @@ -242,7 +242,6 @@ #undef MIN #undef MAX - #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -250,296 +249,32 @@ #define M_PI 3.14159265358979323846 #endif -//////////////////////////////////////////////////////////////////////////////// -// BEGIN: COPIED FROM GGML.C - -#define ggml_is_quantized ggml_is_quantized_ -static bool ggml_is_quantized(enum ggml_type type) { - switch (type) { - case GGML_TYPE_I8: - return false; - case GGML_TYPE_I16: - return false; - case GGML_TYPE_I32: - return false; - case GGML_TYPE_F32: - return false; - case GGML_TYPE_F16: - return false; - case GGML_TYPE_Q4_0: - return true; - case GGML_TYPE_Q4_1: - return true; - case GGML_TYPE_Q5_0: - return true; - case GGML_TYPE_Q5_1: - return true; - case GGML_TYPE_Q8_0: - return true; - case GGML_TYPE_Q8_1: - return true; - case GGML_TYPE_Q2_K: - return true; - case GGML_TYPE_Q3_K: - return true; - case GGML_TYPE_Q4_K: - return true; - case GGML_TYPE_Q5_K: - return true; - case GGML_TYPE_Q6_K: - return true; - case GGML_TYPE_Q8_K: - return true; - default: - return false; - } -} - -#define ggml_type_size ggml_type_size_ -static size_t ggml_type_size(enum ggml_type type); - -#define ggml_blck_size ggml_blck_size_ -static int ggml_blck_size(enum ggml_type type); - -#define ggml_is_transposed ggml_is_transposed_ -static bool ggml_is_transposed(const struct ggml_tensor * tensor) { - return tensor->nb[0] > tensor->nb[1]; -} - -#define ggml_nbytes ggml_nbytes_ -static size_t ggml_nbytes(const struct ggml_tensor * tensor) { - size_t nbytes; - size_t blck_size = ggml_blck_size(tensor->type); - if (blck_size == 1) { - nbytes = ggml_type_size(tensor->type); - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; - } - } - else { - nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; - for (int i = 1; i < GGML_MAX_DIMS; ++i) { - nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; - } - } - return nbytes; -} - -static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { - return ((const int32_t *)(tensor->op_params))[i]; -} - -#define ggml_get_unary_op ggml_get_unary_op_ -static enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { - return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); -} - -#define ggml_nelements ggml_nelements_ -static int64_t 
ggml_nelements(const struct ggml_tensor * tensor) { - return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; -} - -#define ggml_nrows ggml_nrows_ -static int64_t ggml_nrows(const struct ggml_tensor * tensor) { - return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; -} - -#define ggml_is_permuted ggml_is_permuted_ -static bool ggml_is_permuted(const struct ggml_tensor * tensor) { - return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; -} - -#define ggml_is_contiguous ggml_is_contiguous_ -static bool ggml_is_contiguous(const struct ggml_tensor * tensor) { - return - tensor->nb[0] == ggml_type_size(tensor->type) && - tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && - tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && - tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; -} - -#define ggml_op_name ggml_op_name_ -static const char * ggml_op_name(enum ggml_op op) { - return "REDACTED!GGML_OP_NAME[op]"; -} - -#define ggml_type_name ggml_type_name_ -static const char * ggml_type_name(enum ggml_type type) { - return "REDACTED!GGML_TYPE_NAME[type]"; -} - -#define ggml_element_size ggml_element_size_ -static size_t ggml_element_size(const struct ggml_tensor * tensor) { - return ggml_type_size(tensor->type); -} - -#define ggml_row_size ggml_row_size_ -static size_t ggml_row_size(enum ggml_type type, int64_t ne) { - return ggml_type_size(type)*ne/ggml_blck_size(type); -} - -static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) { - return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); -} - -#define ggml_rope_yarn_corr_dims ggml_rope_yarn_corr_dims_ -static void ggml_rope_yarn_corr_dims( - int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] -) { - // start and end correction dims - dims[0] = MAX(0, floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base))); - dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base))); -} - -// END: COPIED FROM GGML.C -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// BEGIN: COPIED FROM GGML-BACKEND.C - -#define ggml_backend_buffer_init ggml_backend_buffer_init_ -static ggml_backend_buffer_t ggml_backend_buffer_init( - ggml_backend_buffer_type_t buft, - struct ggml_backend_buffer_i iface, - ggml_backend_buffer_context_t context, - size_t size) { - ggml_backend_buffer_t buffer = (ggml_backend_buffer_t)malloc(sizeof(struct ggml_backend_buffer)); - memset(buffer, 0, sizeof(*buffer)); - buffer->iface = iface; - buffer->buft = buft; - buffer->context = context; - buffer->size = size; - return buffer; -} - -#define ggml_backend_cpu_buffer_type ggml_backend_cpu_buffer_type_ -static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); - -GGML_BACKEND_ABI static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer); -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); 
-GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - -// for buffers from ptr, free is not called -static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { - /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed - /* .get_base = */ ggml_backend_cpu_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required - /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, -}; - -#define ggml_backend_cpu_buffer_from_ptr ggml_backend_cpu_buffer_from_ptr_ -static ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) { - return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size); -} - -GGML_BACKEND_ABI static const char * ggml_backend_cpu_name(ggml_backend_t backend) { - return "CPU"; - GGML_UNUSED(backend); +static const struct ggml_backend_api *g_backend; +#define ggml_is_quantized g_backend->ggml_is_quantized +#define ggml_type_size g_backend->ggml_type_size +#define ggml_blck_size g_backend->ggml_blck_size +#define ggml_is_transposed g_backend->ggml_is_transposed +#define ggml_nbytes g_backend->ggml_nbytes +#define ggml_get_unary_op g_backend->ggml_get_unary_op +#define ggml_nelements g_backend->ggml_nelements +#define ggml_nrows g_backend->ggml_nrows +#define ggml_is_permuted g_backend->ggml_is_permuted +#define ggml_is_contiguous g_backend->ggml_is_contiguous +#define ggml_op_name g_backend->ggml_op_name +#define ggml_type_name g_backend->ggml_type_name +#define ggml_element_size g_backend->ggml_element_size +#define ggml_row_size g_backend->ggml_row_size +#define ggml_element_size g_backend->ggml_element_size +#define ggml_element_size g_backend->ggml_element_size +#define ggml_rope_yarn_corr_dims g_backend->ggml_rope_yarn_corr_dims +#define ggml_backend_buft_alloc_buffer g_backend->ggml_backend_buft_alloc_buffer +#define ggml_backend_buft_get_alloc_size g_backend->ggml_backend_buft_get_alloc_size +#define ggml_backend_cpu_buffer_type g_backend->ggml_backend_cpu_buffer_type + +void ggml_cuda_link(const struct ggml_backend_api *backend_api) { + g_backend = backend_api; } -#define ggml_backend_is_cpu ggml_backend_is_cpu_ -static bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_cpu_name; -} - -static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 - -GGML_BACKEND_ABI static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return TENSOR_ALIGNMENT; - GGML_UNUSED(buft); -} - -GGML_BACKEND_ABI static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cpu(backend); - GGML_UNUSED(buft); -} - -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { - fprintf(stderr, "WARNING: using untested foreign free due to ggml backend\n"); - free(buffer->context); - GGML_UNUSED(buffer); -} - -GGML_BACKEND_ABI static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *)buffer->context; -} - -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * 
data, size_t offset, size_t size) { - memcpy((char *)tensor->data + offset, data, size); - GGML_UNUSED(buffer); -} - -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - memcpy(data, (const char *)tensor->data + offset, size); - GGML_UNUSED(buffer); -} - -#define ggml_backend_tensor_get ggml_backend_tensor_get_ -static void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size); -} - -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); - GGML_UNUSED(buffer); -} - -#define ggml_backend_tensor_set ggml_backend_tensor_set_ -static void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size); -} - -GGML_BACKEND_ABI static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); - GGML_UNUSED(buffer); -} - -static struct ggml_backend_buffer_i cpu_backend_buffer_i = { - /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, - /* .get_base = */ ggml_backend_cpu_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required - /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, -}; - -GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned - void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? 
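// NOTE: with the g_backend redirection introduced above, a call such as
// ggml_nbytes(t) inside ggml-cuda.cu now expands to g_backend->ggml_nbytes(t),
// so the GPU module no longer needs its own copies of these helpers (hence the
// deletions here). The host is expected to load the module and hand it the
// table returned by ggml_backend_api() before anything else runs. A minimal
// sketch of that host-side flow, using plain POSIX dlopen()/dlsym() purely for
// illustration (the actual loader lives in llamafile/cuda.c, which is not part
// of these hunks and may work differently):

#include <dlfcn.h>
#include <stdbool.h>

#include "ggml-backend-impl.h"  // struct ggml_backend_api, ggml_backend_api()

static bool link_gpu_module(const char *path) {
    void *lib = dlopen(path, RTLD_LAZY);
    if (!lib) {
        return false;
    }
    // ggml_cuda_link() is the entry point this patch adds to ggml-cuda.cu.
    void (*link_fn)(const struct ggml_backend_api *) =
        (void (*)(const struct ggml_backend_api *))dlsym(lib, "ggml_cuda_link");
    if (!link_fn) {
        return false;
    }
    link_fn(ggml_backend_api());  // the module stores this in its g_backend
    return true;
}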
- return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); -} - -static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = { - /* .iface = */ { - /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, - }, - /* .context = */ NULL, - }; - return &ggml_backend_buffer_type_cpu; -} - -#define ggml_backend_buft_get_alloc_size ggml_backend_buft_get_alloc_size_ -static size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) { - // get_alloc_size is optional, defaults to ggml_nbytes - if (buft->iface.get_alloc_size) { - return buft->iface.get_alloc_size(buft, tensor); - } - return ggml_nbytes(tensor); -} - -// END: COPIED FROM GGML-BACKEND.C -//////////////////////////////////////////////////////////////////////////////// - #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define CC_VOLTA 700 #define CC_OFFSET_AMD 1000000 @@ -7229,7 +6964,7 @@ bool ggml_cublas_loaded(void) { return g_cublas_loaded; } -void ggml_init_cublas() { +void ggml_init_cublas(void) { static bool initialized = false; if (!initialized) { @@ -10069,18 +9804,18 @@ struct ggml_backend_buffer_context_cuda { } }; -GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_ABI static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; CUDA_CHECK(cudaFree(ctx->dev_ptr)); delete ctx; } -GGML_BACKEND_ABI static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_ABI static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; return ctx->dev_ptr; } -GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +GGML_ABI static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { @@ -10114,7 +9849,7 @@ GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_b UNUSED(buffer); } -GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_ABI static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; @@ -10125,7 +9860,7 @@ GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_bu CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); } -GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_ABI static void 
ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; @@ -10136,7 +9871,7 @@ GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_bu CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); } -GGML_BACKEND_ABI static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_ABI static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; ggml_cuda_set_device(ctx->device); @@ -10158,7 +9893,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { // cuda buffer type -GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_ABI static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { int device = (int) (intptr_t) buft->context; ggml_cuda_set_device(device); @@ -10170,16 +9905,16 @@ GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_allo ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr); - return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size); + return g_backend->ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size); } -GGML_BACKEND_ABI static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_ABI static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; UNUSED(buft); } -GGML_BACKEND_ABI static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { +GGML_ABI static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { int64_t row_low = 0; int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; @@ -10199,7 +9934,7 @@ GGML_BACKEND_ABI static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml UNUSED(buft); } -GGML_BACKEND_ABI static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { +GGML_ABI static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_cuda(backend); UNUSED(buft); @@ -10233,21 +9968,20 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { // host buffer type -GGML_BACKEND_ABI static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_ABI static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_cuda_host_free(buffer->context); } -GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +GGML_ABI static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { void * ptr = ggml_cuda_host_malloc(size); if (ptr == nullptr) { // fallback to cpu buffer - // return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); - GGML_ASSERT(!"ran out of gpu memory"); + return 
ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); } // FIXME: this is a hack to avoid having to implement a new buffer type - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + ggml_backend_buffer_t buffer = g_backend->ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; @@ -10275,26 +10009,26 @@ struct ggml_backend_context_cuda { int device; }; -GGML_BACKEND_ABI static const char * ggml_backend_cuda_name(ggml_backend_t backend) { +GGML_ABI static const char * ggml_backend_cuda_name(ggml_backend_t backend) { return GGML_CUDA_NAME; UNUSED(backend); } -GGML_BACKEND_ABI static void ggml_backend_cuda_free(ggml_backend_t backend) { +GGML_ABI static void ggml_backend_cuda_free(ggml_backend_t backend) { ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; delete cuda_ctx; delete backend; } -GGML_BACKEND_ABI static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { +GGML_ABI static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; return ggml_backend_cuda_buffer_type(cuda_ctx->device); } -GGML_BACKEND_ABI static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_ABI static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); @@ -10303,7 +10037,7 @@ GGML_BACKEND_ABI static void ggml_backend_cuda_set_tensor_async(ggml_backend_t b CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); } -GGML_BACKEND_ABI static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_ABI static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); @@ -10312,7 +10046,7 @@ GGML_BACKEND_ABI static void ggml_backend_cuda_get_tensor_async(ggml_backend_t b CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); } -GGML_BACKEND_ABI static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { +GGML_ABI static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0])); @@ -10320,7 +10054,7 @@ GGML_BACKEND_ABI static void ggml_backend_cuda_synchronize(ggml_backend_t backen UNUSED(backend); } -GGML_BACKEND_ABI static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_ABI static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { 
GGML_ASSERT(!"not implemented"); return nullptr; @@ -10329,21 +10063,21 @@ GGML_BACKEND_ABI static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_c UNUSED(cgraph); } -GGML_BACKEND_ABI static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +GGML_ABI static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(!"not implemented"); UNUSED(backend); UNUSED(plan); } -GGML_BACKEND_ABI static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +GGML_ABI static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(!"not implemented"); UNUSED(backend); UNUSED(plan); } -GGML_BACKEND_ABI static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_ABI static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; ggml_cuda_set_main_device(cuda_ctx->device); @@ -10402,7 +10136,7 @@ GGML_BACKEND_ABI static void ggml_backend_cuda_graph_compute(ggml_backend_t back UNUSED(backend); } -GGML_BACKEND_ABI static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { +GGML_ABI static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) { switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -10530,8 +10264,6 @@ static ggml_backend_i cuda_backend_i = { }; ggml_backend_t ggml_backend_cuda_init(int device) { - ggml_init_cublas(); // TODO: remove from ggml.c - if (device < 0 || device >= ggml_cuda_get_device_count()) { fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); return nullptr; @@ -10562,93 +10294,3 @@ ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) UNUSED(params); } - -static int ggml_blck_size(enum ggml_type type) { - switch (type) { - case GGML_TYPE_I8: - return 1; - case GGML_TYPE_I16: - return 1; - case GGML_TYPE_I32: - return 1; - case GGML_TYPE_F32: - return 1; - case GGML_TYPE_F16: - return 1; - case GGML_TYPE_Q4_0: - return QK4_0; - case GGML_TYPE_Q4_1: - return QK4_1; - case GGML_TYPE_Q5_0: - return QK5_0; - case GGML_TYPE_Q5_1: - return QK5_1; - case GGML_TYPE_Q8_0: - return QK8_0; - case GGML_TYPE_Q8_1: - return QK8_1; - case GGML_TYPE_Q2_K: - return QK_K; - case GGML_TYPE_Q3_K: - return QK_K; - case GGML_TYPE_Q4_K: - return QK_K; - case GGML_TYPE_Q5_K: - return QK_K; - case GGML_TYPE_Q6_K: - return QK_K; - case GGML_TYPE_Q8_K: - return QK_K; - default: - return 0; - } -} - -// This is only used for intermediate quantization and dot products -typedef struct { - float d; // delta - int8_t qs[QK_K]; // quants - int16_t bsums[QK_K/16]; // sum of quants in groups of 16 -} block_q8_K; -static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); - -static size_t ggml_type_size(enum ggml_type type) { - switch (type) { - case GGML_TYPE_I8: - return sizeof(int8_t); - case GGML_TYPE_I16: - return sizeof(int16_t); - case GGML_TYPE_I32: - return sizeof(int32_t); - case GGML_TYPE_F32: - return sizeof(float); - case GGML_TYPE_F16: - return sizeof(ggml_fp16_t); - case GGML_TYPE_Q4_0: - return sizeof(block_q4_0); - case GGML_TYPE_Q4_1: - return sizeof(block_q4_1); - case GGML_TYPE_Q5_0: - return sizeof(block_q5_0); - case GGML_TYPE_Q5_1: - return 
sizeof(block_q5_1); - case GGML_TYPE_Q8_0: - return sizeof(block_q8_0); - case GGML_TYPE_Q8_1: - return sizeof(block_q8_1); - case GGML_TYPE_Q2_K: - return sizeof(block_q2_K); - case GGML_TYPE_Q3_K: - return sizeof(block_q3_K); - case GGML_TYPE_Q4_K: - return sizeof(block_q4_K); - case GGML_TYPE_Q5_K: - return sizeof(block_q5_K); - case GGML_TYPE_Q6_K: - return sizeof(block_q6_K); - case GGML_TYPE_Q8_K: - return sizeof(block_q8_K); - default: - return 0; - } -} diff --git a/llama.cpp/ggml-cuda.h b/llama.cpp/ggml-cuda.h index 23a67561b8..a2e4b0ef9e 100644 --- a/llama.cpp/ggml-cuda.h +++ b/llama.cpp/ggml-cuda.h @@ -14,8 +14,8 @@ extern "C" { // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`. GGML_API void ggml_init_cublas(void); -GGML_API void ggml_cuda_disable(void); GGML_API bool ggml_cuda_supported(void); +GGML_API void ggml_cuda_link(const struct ggml_backend_api *); // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`. GGML_API bool ggml_cublas_loaded(void); diff --git a/llama.cpp/ggml-metal.h b/llama.cpp/ggml-metal.h index db09b61137..642ffc9a81 100644 --- a/llama.cpp/ggml-metal.h +++ b/llama.cpp/ggml-metal.h @@ -44,6 +44,7 @@ extern "C" { struct ggml_metal_context; bool ggml_metal_supported(void); +void ggml_metal_link(const struct ggml_backend_api *); void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); // number of command buffers to use diff --git a/llama.cpp/ggml-metal.m b/llama.cpp/ggml-metal.m index 9db8d3b084..53c344521d 100644 --- a/llama.cpp/ggml-metal.m +++ b/llama.cpp/ggml-metal.m @@ -3,55 +3,13 @@ #import "ggml-metal.h" #import "ggml-quants.h" -#import "ggml-backend-impl.h" #import "ggml.h" +#import "ggml-backend-impl.h" #import #import -//////////////////////////////////////////////////////////////////////////////// -// BEGIN: COPIED FROM GGML-BACKEND.C - -#define ggml_backend_buffer_init ggml_backend_buffer_init_ -static ggml_backend_buffer_t ggml_backend_buffer_init( - ggml_backend_buffer_type_t buft, - struct ggml_backend_buffer_i iface, - ggml_backend_buffer_context_t context, - size_t size) { - ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); - (*buffer) = (struct ggml_backend_buffer) { - /* .interface = */ iface, - /* .buft = */ buft, - /* .context = */ context, - /* .size = */ size, - }; - return buffer; -} - -GGML_BACKEND_ABI static const char * ggml_backend_cpu_name(ggml_backend_t backend) { - return "CPU"; - GGML_UNUSED(backend); -} - -#define ggml_backend_is_cpu ggml_backend_is_cpu_ -static bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_cpu_name; -} - -#define ggml_backend_tensor_get ggml_backend_tensor_get_ -static void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size); -} - -#define ggml_backend_tensor_set ggml_backend_tensor_set_ -static void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size); -} - -// END: COPIED FROM GGML-BACKEND.C -//////////////////////////////////////////////////////////////////////////////// - #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -71,195 +29,20 @@ static void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * da #define GGML_MAX_CONCUR (2*GGML_DEFAULT_GRAPH_SIZE) -//////////////////////////////////////////////////////////////////////////////// -// BEGIN: COPIED FROM GGML.C - -#define ggml_is_quantized ggml_is_quantized_ -static bool ggml_is_quantized(enum ggml_type type) { - switch (type) { - case GGML_TYPE_I8: - return false; - case GGML_TYPE_I16: - return false; - case GGML_TYPE_I32: - return false; - case GGML_TYPE_F32: - return false; - case GGML_TYPE_F16: - return false; - case GGML_TYPE_Q4_0: - return true; - case GGML_TYPE_Q4_1: - return true; - case GGML_TYPE_Q5_0: - return true; - case GGML_TYPE_Q5_1: - return true; - case GGML_TYPE_Q8_0: - return true; - case GGML_TYPE_Q8_1: - return true; - case GGML_TYPE_Q2_K: - return true; - case GGML_TYPE_Q3_K: - return true; - case GGML_TYPE_Q4_K: - return true; - case GGML_TYPE_Q5_K: - return true; - case GGML_TYPE_Q6_K: - return true; - case GGML_TYPE_Q8_K: - return true; - default: - return false; - } -} - -#define ggml_type_size ggml_type_size_ -static size_t ggml_type_size(enum ggml_type type) { - switch (type) { - case GGML_TYPE_I8: - return sizeof(int8_t); - case GGML_TYPE_I16: - return sizeof(int16_t); - case GGML_TYPE_I32: - return sizeof(int32_t); - case GGML_TYPE_F32: - return sizeof(float); - case GGML_TYPE_F16: - return sizeof(ggml_fp16_t); - case GGML_TYPE_Q4_0: - return sizeof(block_q4_0); - case GGML_TYPE_Q4_1: - return sizeof(block_q4_1); - case GGML_TYPE_Q5_0: - return sizeof(block_q5_0); - case GGML_TYPE_Q5_1: - return sizeof(block_q5_1); - case GGML_TYPE_Q8_0: - return sizeof(block_q8_0); - case GGML_TYPE_Q8_1: - return sizeof(block_q8_1); - case GGML_TYPE_Q2_K: - return sizeof(block_q2_K); - case GGML_TYPE_Q3_K: - return sizeof(block_q3_K); - case GGML_TYPE_Q4_K: - return sizeof(block_q4_K); - case GGML_TYPE_Q5_K: - return sizeof(block_q5_K); - case GGML_TYPE_Q6_K: - return sizeof(block_q6_K); - case GGML_TYPE_Q8_K: - return sizeof(block_q8_K); - default: - return 0; - } -} - -#define ggml_blck_size ggml_blck_size_ -static int ggml_blck_size(enum ggml_type type) { - switch (type) { - case GGML_TYPE_I8: - return 1; - case GGML_TYPE_I16: - return 1; - case GGML_TYPE_I32: - return 1; - case GGML_TYPE_F32: - return 1; - case GGML_TYPE_F16: - return 1; - case GGML_TYPE_Q4_0: - return QK4_0; - case GGML_TYPE_Q4_1: - return QK4_1; - case GGML_TYPE_Q5_0: - return QK5_0; - case GGML_TYPE_Q5_1: - return QK5_1; - case GGML_TYPE_Q8_0: - return QK8_0; - case GGML_TYPE_Q8_1: - return QK8_1; - case GGML_TYPE_Q2_K: - return QK_K; - case GGML_TYPE_Q3_K: - return QK_K; - case GGML_TYPE_Q4_K: - return QK_K; - case GGML_TYPE_Q5_K: - return QK_K; - case GGML_TYPE_Q6_K: - return QK_K; - case GGML_TYPE_Q8_K: - return QK_K; - default: - return 0; - } -} - -#define ggml_is_transposed ggml_is_transposed_ -static bool ggml_is_transposed(const struct ggml_tensor * tensor) { - return tensor->nb[0] > tensor->nb[1]; -} - -#define ggml_nbytes ggml_nbytes_ -static size_t ggml_nbytes(const struct ggml_tensor * tensor) { - size_t nbytes; - size_t blck_size = ggml_blck_size(tensor->type); - if (blck_size == 1) { - nbytes = ggml_type_size(tensor->type); - for (int i = 0; i < GGML_MAX_DIMS; ++i) { - nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; - } - } - else { - nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; - for (int i = 1; i < GGML_MAX_DIMS; ++i) { - nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; - } - } - return nbytes; -} - -static int32_t 
ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { - return ((const int32_t *)(tensor->op_params))[i]; -} - -#define ggml_get_unary_op ggml_get_unary_op_ -static enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { - return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); -} - -#define ggml_nelements ggml_nelements_ -static int64_t ggml_nelements(const struct ggml_tensor * tensor) { - return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; -} - -#define ggml_nrows ggml_nrows_ -static int64_t ggml_nrows(const struct ggml_tensor * tensor) { - return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; -} - -#define ggml_is_contiguous ggml_is_contiguous_ -static bool ggml_is_contiguous(const struct ggml_tensor * tensor) { - return - tensor->nb[0] == ggml_type_size(tensor->type) && - tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && - tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && - tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; -} - -#define ggml_op_name ggml_op_name_ -static const char * ggml_op_name(enum ggml_op op) { - return "REDACTED!GGML_OP_NAME[op]"; -} - -#define ggml_op_desc ggml_op_desc_ -static const char * ggml_op_desc(const struct ggml_tensor * t) { - return "REDACTED!GGML_OP_DESC[t]"; +static const struct ggml_backend_api *g_backend; +#define ggml_type_size g_backend->ggml_type_size +#define ggml_blck_size g_backend->ggml_blck_size +#define ggml_is_transposed g_backend->ggml_is_transposed +#define ggml_nbytes g_backend->ggml_nbytes +#define ggml_get_unary_op g_backend->ggml_get_unary_op +#define ggml_nelements g_backend->ggml_nelements +#define ggml_nrows g_backend->ggml_nrows +#define ggml_is_contiguous g_backend->ggml_is_contiguous +#define ggml_op_name g_backend->ggml_op_name +#define ggml_op_desc g_backend->ggml_op_desc + +void ggml_metal_link(const struct ggml_backend_api *backend_api) { + g_backend = backend_api; } // END: COPIED FROM GGML.C @@ -438,7 +221,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ if (len < 128) { ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data); } else { - char* buffer2 = malloc(len+1); + char* buffer2 = g_backend->malloc(len+1); va_end(args); va_start(args, format); vsnprintf(buffer2, len+1, format, args); @@ -471,7 +254,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]); // Configure context - struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + struct ggml_metal_context * ctx = g_backend->malloc(sizeof(struct ggml_metal_context)); ctx->device = device; ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->queue = [ctx->device newCommandQueue]; @@ -826,7 +609,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { dispatch_release(ctx->d_queue); - free(ctx); + g_backend->free(ctx); } void * ggml_metal_host_malloc(size_t n) { @@ -1757,7 +1540,7 @@ void ggml_metal_graph_compute( !ggml_is_transposed(src1) && src1t == GGML_TYPE_F32 && ne00 % 32 == 0 && ne00 >= 64 && - (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { + (ne11 > ne11_mm_min || (g_backend->ggml_is_quantized(src0t) && ne12 > 1))) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); switch (src0->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; @@ -1891,7 +1674,7 @@ void 
ggml_metal_graph_compute( } }; - if (ggml_is_quantized(src0t)) { + if (g_backend->ggml_is_quantized(src0t)) { GGML_ASSERT(ne00 >= nth0*nth1); } @@ -2131,7 +1914,7 @@ void ggml_metal_graph_compute( } }; - if (ggml_is_quantized(src2t)) { + if (g_backend->ggml_is_quantized(src2t)) { GGML_ASSERT(ne20 >= nth0*nth1); } @@ -2672,13 +2455,13 @@ static void ggml_backend_metal_free_device(void) { } } -GGML_BACKEND_ABI static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { +GGML_ABI static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; return ctx->all_data; } -GGML_BACKEND_ABI static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { +GGML_ABI static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; for (int i = 0; i < ctx->n_buffers; i++) { @@ -2690,34 +2473,38 @@ GGML_BACKEND_ABI static void ggml_backend_metal_buffer_free_buffer(ggml_backend_ free(ctx->all_data); } - free(ctx); + g_backend->free(ctx); } -GGML_BACKEND_ABI static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +GGML_ABI static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); UNUSED(buffer); } -GGML_BACKEND_ABI static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { +GGML_ABI static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { memcpy(data, (const char *)tensor->data + offset, size); UNUSED(buffer); } -GGML_BACKEND_ABI static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); +GGML_ABI static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { + printf("START g_backend->ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));\n"); + g_backend->ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + printf(" END g_backend->ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));\n"); UNUSED(buffer); } -GGML_BACKEND_ABI static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); +GGML_ABI static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { + printf("START g_backend->ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));\n"); + g_backend->ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); + printf(" END g_backend->ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));\n"); UNUSED(buffer); } -GGML_BACKEND_ABI static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { +GGML_ABI static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t 
value) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; memset(ctx->all_data, value, ctx->all_size); @@ -2736,8 +2523,8 @@ GGML_BACKEND_ABI static void ggml_backend_metal_buffer_clear(ggml_backend_buffer // default buffer type -GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); +GGML_ABI static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + struct ggml_backend_metal_buffer_context * ctx = g_backend->malloc(sizeof(struct ggml_backend_metal_buffer_context)); const size_t size_page = sysconf(_SC_PAGESIZE); @@ -2762,7 +2549,7 @@ GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_metal_buffer_type_all if (ctx->buffers[0].metal == nil) { GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); - free(ctx); + g_backend->free(ctx); ggml_backend_metal_free_device(); return NULL; } @@ -2785,21 +2572,25 @@ GGML_BACKEND_ABI static ggml_backend_buffer_t ggml_backend_metal_buffer_type_all #endif - return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); + printf("START ggml_backend_buffer_t x = g_backend->ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);\n"); + ggml_backend_buffer_t x = g_backend->ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); + printf(" END ggml_backend_buffer_t x = g_backend->ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);\n"); + + return x; } -GGML_BACKEND_ABI static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_ABI static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 32; UNUSED(buft); } -GGML_BACKEND_ABI static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); +GGML_ABI static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return ggml_backend_is_metal(backend) || g_backend->ggml_backend_is_cpu(backend); UNUSED(buft); } -GGML_BACKEND_ABI static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { +GGML_ABI static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; UNUSED(buft); @@ -2823,7 +2614,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { // buffer from ptr ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { - struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); + struct ggml_backend_metal_buffer_context * ctx = g_backend->malloc(sizeof(struct ggml_backend_metal_buffer_context)); ctx->all_data = data; ctx->all_size = size; @@ -2896,36 +2687,41 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); #endif - return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); + printf("hey %p\n", g_backend); + printf("START return 
g_backend->ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);\n"); + // dope + ggml_backend_buffer_t x = g_backend->ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); + printf(" END return g_backend->ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);\n"); + return x; } // backend -GGML_BACKEND_ABI static const char * ggml_backend_metal_name(ggml_backend_t backend) { +GGML_ABI static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; UNUSED(backend); } -GGML_BACKEND_ABI static void ggml_backend_metal_free(ggml_backend_t backend) { +GGML_ABI static void ggml_backend_metal_free(ggml_backend_t backend) { struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; ggml_metal_free(ctx); - free(backend); + g_backend->free(backend); } -GGML_BACKEND_ABI static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { +GGML_ABI static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_metal_buffer_type(); UNUSED(backend); } -GGML_BACKEND_ABI static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +GGML_ABI static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; ggml_metal_graph_compute(metal_ctx, cgraph); } -GGML_BACKEND_ABI static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { +GGML_ABI static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { return ggml_metal_supports_op(op); UNUSED(backend); @@ -2954,7 +2750,7 @@ ggml_backend_t ggml_backend_metal_init(void) { return NULL; } - ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); + ggml_backend_t metal_backend = g_backend->malloc(sizeof(struct ggml_backend)); *metal_backend = (struct ggml_backend) { /* .interface = */ metal_backend_i, @@ -2983,12 +2779,3 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } - -ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning - -ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { - return ggml_backend_metal_init(); - - GGML_UNUSED(params); - GGML_UNUSED(user_data); -} diff --git a/llama.cpp/ggml.c b/llama.cpp/ggml.c index 7b6ec9e44c..f6e4187d29 100644 --- a/llama.cpp/ggml.c +++ b/llama.cpp/ggml.c @@ -2003,19 +2003,19 @@ void ggml_print_objects(const struct ggml_context * ctx) { GGML_PRINT("%s: --- end ---\n", __func__); } -int64_t ggml_nelements(const struct ggml_tensor * tensor) { +GGML_ABI int64_t ggml_nelements(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -int64_t ggml_nrows(const struct ggml_tensor * tensor) { +GGML_ABI int64_t ggml_nrows(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -size_t ggml_nbytes(const struct ggml_tensor * tensor) { +GGML_ABI size_t ggml_nbytes(const struct ggml_tensor * 
tensor) { size_t nbytes; size_t blck_size = ggml_blck_size(tensor->type); if (blck_size == 1) { @@ -2038,15 +2038,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } -int ggml_blck_size(enum ggml_type type) { +GGML_ABI int ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; } -size_t ggml_type_size(enum ggml_type type) { +GGML_ABI size_t ggml_type_size(enum ggml_type type) { return type_traits[type].type_size; } -size_t ggml_row_size(enum ggml_type type, int64_t ne) { +GGML_ABI size_t ggml_row_size(enum ggml_type type, int64_t ne) { assert(ne % ggml_blck_size(type) == 0); return ggml_type_size(type)*ne/ggml_blck_size(type); } @@ -2055,15 +2055,15 @@ double ggml_type_sizef(enum ggml_type type) { return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; } -const char * ggml_type_name(enum ggml_type type) { +GGML_ABI const char * ggml_type_name(enum ggml_type type) { return type_traits[type].type_name; } -bool ggml_is_quantized(enum ggml_type type) { +GGML_ABI bool ggml_is_quantized(enum ggml_type type) { return type_traits[type].is_quantized; } -const char * ggml_op_name(enum ggml_op op) { +GGML_ABI const char * ggml_op_name(enum ggml_op op) { return GGML_OP_NAME[op]; } @@ -2075,7 +2075,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) { return GGML_UNARY_OP_NAME[op]; } -const char * ggml_op_desc(const struct ggml_tensor * t) { +GGML_ABI const char * ggml_op_desc(const struct ggml_tensor * t) { if (t->op == GGML_OP_UNARY) { enum ggml_unary_op uop = ggml_get_unary_op(t); return ggml_unary_op_name(uop); @@ -2085,7 +2085,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) { } } -size_t ggml_element_size(const struct ggml_tensor * tensor) { +GGML_ABI size_t ggml_element_size(const struct ggml_tensor * tensor) { return ggml_type_size(tensor->type); } @@ -2165,11 +2165,11 @@ size_t ggml_tensor_overhead(void) { return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE; } -bool ggml_is_transposed(const struct ggml_tensor * tensor) { +GGML_ABI bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } -bool ggml_is_contiguous(const struct ggml_tensor * tensor) { +GGML_ABI bool ggml_is_contiguous(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return @@ -2188,7 +2188,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -bool ggml_is_permuted(const struct ggml_tensor * tensor) { +GGML_ABI bool ggml_is_permuted(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; @@ -3085,7 +3085,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) { return (float *)(tensor->data); } -enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { +GGML_ABI enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { GGML_ASSERT(tensor->op == GGML_OP_UNARY); return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); } @@ -11617,7 +11617,7 @@ static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, fl return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); } -void ggml_rope_yarn_corr_dims( +GGML_ABI void ggml_rope_yarn_corr_dims( int n_dims, int n_orig_ctx, float 
freq_base, float beta_fast, float beta_slow, float dims[2] ) { // start and end correction dims diff --git a/llama.cpp/ggml.h b/llama.cpp/ggml.h index accc942c2b..c416821e2c 100644 --- a/llama.cpp/ggml.h +++ b/llama.cpp/ggml.h @@ -175,6 +175,12 @@ // // +#if defined(_WIN32) +# define GGML_ABI +#else +# define GGML_ABI __attribute__((__ms_abi__)) +#endif + #ifdef GGML_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef GGML_BUILD @@ -646,36 +652,36 @@ extern "C" { GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_objects(const struct ggml_context * ctx); - GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); - GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); - GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor) GGML_ABI; + GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor) GGML_ABI; + GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor) GGML_ABI; GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN - GGML_API int ggml_blck_size(enum ggml_type type); - GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block - GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row + GGML_API int ggml_blck_size(enum ggml_type type) GGML_ABI; + GGML_API size_t ggml_type_size(enum ggml_type type) GGML_ABI; // size in bytes for all elements in a block + GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne) GGML_ABI; // size in bytes for all elements in a row GGML_DEPRECATED( GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float "use ggml_row_size() instead"); - GGML_API const char * ggml_type_name(enum ggml_type type); - GGML_API const char * ggml_op_name (enum ggml_op op); + GGML_API const char * ggml_type_name(enum ggml_type type) GGML_ABI; + GGML_API const char * ggml_op_name (enum ggml_op op) GGML_ABI; GGML_API const char * ggml_op_symbol(enum ggml_op op); GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); - GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name + GGML_API const char * ggml_op_desc(const struct ggml_tensor * t) GGML_ABI; // unary or op name - GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); + GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor) GGML_ABI; - GGML_API bool ggml_is_quantized(enum ggml_type type); + GGML_API bool ggml_is_quantized(enum ggml_type type) GGML_ABI; // TODO: temporary until model loading of ggml examples is refactored GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); - GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); - GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); - GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor) GGML_ABI; + GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor) GGML_ABI; + GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor) GGML_ABI; GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); @@ -767,7 +773,7 @@ extern "C" { GGML_API void * ggml_get_data 
(const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); - GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) GGML_ABI; GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); @@ -1417,7 +1423,7 @@ extern "C" { // compute correction dims for YaRN RoPE scaling void ggml_rope_yarn_corr_dims( - int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]) GGML_ABI; // xPos RoPE, in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( diff --git a/llama.cpp/llava/clip.cpp b/llama.cpp/llava/clip.cpp index 1ca7351bfa..43b7e0e9af 100644 --- a/llama.cpp/llava/clip.cpp +++ b/llama.cpp/llava/clip.cpp @@ -23,7 +23,6 @@ #include "llama.cpp/ggml-cuda.h" #include "llama.cpp/ggml-metal.h" #include "llama.cpp/ggml-alloc.h" -#include "llama.cpp/ggml-backend.h" #define STB_IMAGE_IMPLEMENTATION #include "llama.cpp/stb_image.h" diff --git a/llamafile/cuda.c b/llamafile/cuda.c index efe94bed8e..bb137ac543 100644 --- a/llamafile/cuda.c +++ b/llamafile/cuda.c @@ -50,6 +50,7 @@ __static_yoink("llama.cpp/ggml-backend-impl.h"); #define THESTRING(x) #x #define STRINGIFY(x) THESTRING(x) #define WIND_ONLY(x) (!IsWindows() ? "-DIGNORE" STRINGIFY(__COUNTER__) : x) +#define ARMS_ONLY(x) (!IsAarch64() ? "-DIGNORE" STRINGIFY(__COUNTER__) : x) #define BLAS_ONLY(x) (FLAG_tinyblas ? "-DIGNORE" STRINGIFY(__COUNTER__) : x) #define NVCC_LIBS BLAS_ONLY("-lcublas"), "-lcuda" @@ -59,7 +60,9 @@ __static_yoink("llama.cpp/ggml-backend-impl.h"); "-use_fast_math", \ "--compiler-options", \ (!IsWindows() \ - ? "-fPIC -O3 -march=native -mtune=native" \ + ? (!IsAarch64() \ + ? 
"-fPIC -O3 -march=native -mtune=native" \ + : "-fPIC -O3 -march=native -mtune=native -ffixed-x28") \ : "/nologo /EHsc /O2 /GR /MT"), \ "-DNDEBUG", \ "-DGGML_BUILD=1", \ @@ -92,9 +95,9 @@ static const struct Source { }; static struct Cuda { - bool disabled; bool supported; atomic_uint once; + typeof(ggml_cuda_link) *ggml_cuda_link; typeof(ggml_init_cublas) *init; typeof(ggml_cublas_loaded) *loaded; typeof(ggml_cuda_host_free) *host_free; @@ -505,6 +508,7 @@ static bool CompileAmd(const char *clangxx, const char *dso, const char *src) { "-D_DLL", "-D_MT", WIND_ONLY("-Xclang"), WIND_ONLY("--dependent-lib=msvcrt"), + ARMS_ONLY("-ffixed-x28"), "-std=gnu++14", "-mllvm", "-amdgpu-early-inline-all=true", "-mllvm", "-amdgpu-function-calls=false", @@ -601,6 +605,7 @@ static bool LinkCudaDso(const char *dso, const char *dir) { // import functions bool ok = true; + ok &= !!(ggml_cuda.ggml_cuda_link = cosmo_dlsym(lib, "ggml_cuda_link")); ok &= !!(ggml_cuda.init = cosmo_dlsym(lib, "ggml_init_cublas")); ok &= !!(ggml_cuda.loaded = cosmo_dlsym(lib, "ggml_cublas_loaded")); ok &= !!(ggml_cuda.host_free = cosmo_dlsym(lib, "ggml_cuda_host_free")); @@ -625,11 +630,12 @@ static bool LinkCudaDso(const char *dso, const char *dir) { ok &= !!(ggml_cuda.backend_host_buffer_type = cosmo_dlsym(lib, "ggml_backend_cuda_host_buffer_type")); ok &= !!(ggml_cuda.backend_init = cosmo_dlsym(lib, "ggml_backend_cuda_init")); if (!ok) { - tinylog(Dlerror(), ": not all symbols could be imported\n", NULL); + tinylog("error: not all cuda symbols could be imported\n", NULL); return false; } // ask the library if actual gpu devices exist + ggml_cuda.ggml_cuda_link(ggml_backend_api()); ggml_cuda.init(); return ggml_cuda.loaded(); } @@ -824,16 +830,12 @@ static bool ImportCudaImpl(void) { } static void ImportCuda(void) { - if (!ggml_cuda.disabled && ImportCudaImpl()) { + if (ImportCudaImpl()) { tinylog("GPU support successfully linked and loaded\n", NULL); ggml_cuda.supported = true; } } -void ggml_cuda_disable(void) { - ggml_cuda.disabled = true; -} - bool ggml_cuda_supported(void) { cosmo_once(&ggml_cuda.once, ImportCuda); return ggml_cuda.supported; diff --git a/llamafile/metal.c b/llamafile/metal.c index 7beba4a872..fa882b7b53 100644 --- a/llamafile/metal.c +++ b/llamafile/metal.c @@ -29,6 +29,7 @@ #include #include #include "llamafile/log.h" +#include "llamafile/llamafile.h" #include "llama.cpp/ggml-metal.h" __static_yoink("llama.cpp/ggml.h"); @@ -42,8 +43,6 @@ __static_yoink("llama.cpp/ggml-backend.h"); __static_yoink("llama.cpp/ggml-metal.metal"); __static_yoink("llama.cpp/ggml-backend-impl.h"); -ggml_backend_t ggml_backend_reg_metal_init(const char *, void *); - static const struct Source { const char *zip; const char *name; @@ -63,6 +62,7 @@ static const struct Source { static struct Metal { bool supported; atomic_uint once; + typeof(ggml_metal_link) *ggml_metal_link; typeof(ggml_metal_add_buffer) *add_buffer; typeof(ggml_metal_free) *free; typeof(ggml_metal_get_concur_list) *get_concur_list; @@ -75,7 +75,6 @@ static struct Metal { typeof(ggml_metal_set_n_cb) *set_n_cb; typeof(ggml_backend_metal_init) *backend_init; typeof(ggml_backend_metal_buffer_type) *backend_buffer_type; - typeof(ggml_backend_reg_metal_init) *backend_reg_init; typeof(ggml_backend_metal_buffer_from_ptr) *backend_buffer_from_ptr; typeof(ggml_backend_is_metal) *backend_is_metal; typeof(ggml_backend_metal_set_n_cb) *backend_set_n_cb; @@ -122,7 +121,7 @@ static bool BuildMetal(const char *dso) { } // determine if we need to build - if (!needs_rebuild) { + if 
(!needs_rebuild || FLAG_recompile) { switch (llamafile_is_file_newer_than(src, dso)) { case -1: return false; @@ -158,6 +157,7 @@ static bool BuildMetal(const char *dso) { "-DNDEBUG", "-fPIC", "-pthread", + "-ffixed-x28", // cosmo's tls register src, "-o", tmpdso, "-framework", "Foundation", @@ -205,6 +205,7 @@ static bool LinkMetal(const char *dso) { // import functions bool ok = true; + ok &= !!(ggml_metal.ggml_metal_link = cosmo_dlsym(lib, "ggml_metal_link")); ok &= !!(ggml_metal.add_buffer = cosmo_dlsym(lib, "ggml_metal_add_buffer")); ok &= !!(ggml_metal.free = cosmo_dlsym(lib, "ggml_metal_free")); ok &= !!(ggml_metal.get_concur_list = cosmo_dlsym(lib, "ggml_metal_get_concur_list")); @@ -217,7 +218,6 @@ static bool LinkMetal(const char *dso) { ok &= !!(ggml_metal.set_n_cb = cosmo_dlsym(lib, "ggml_metal_set_n_cb")); ok &= !!(ggml_metal.backend_init = cosmo_dlsym(lib, "ggml_backend_metal_init")); ok &= !!(ggml_metal.backend_buffer_type = cosmo_dlsym(lib, "ggml_backend_metal_buffer_type")); - ok &= !!(ggml_metal.backend_reg_init = cosmo_dlsym(lib, "ggml_backend_reg_metal_init")); ok &= !!(ggml_metal.backend_buffer_from_ptr = cosmo_dlsym(lib, "ggml_backend_metal_buffer_from_ptr")); ok &= !!(ggml_metal.backend_is_metal = cosmo_dlsym(lib, "ggml_backend_is_metal")); ok &= !!(ggml_metal.backend_set_n_cb = cosmo_dlsym(lib, "ggml_backend_metal_set_n_cb")); @@ -227,6 +227,7 @@ static bool LinkMetal(const char *dso) { } // we're good + ggml_metal.ggml_metal_link(ggml_backend_api()); return true; } @@ -265,7 +266,6 @@ static bool ImportMetalImpl(void) { static void ImportMetal(void) { if (ImportMetalImpl()) { ggml_metal.supported = true; - ggml_metal.backend_init(); tinylog("Apple Metal GPU support successfully loaded\n", NULL); } } @@ -340,11 +340,6 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { return ggml_metal.backend_buffer_type(); } -ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { - if (!ggml_metal_supported()) return 0; - return ggml_metal.backend_reg_init(params, user_data); -} - ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { if (!ggml_metal_supported()) return 0; return ggml_metal.backend_buffer_from_ptr(data, size, max_size);
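
A quick illustrative sketch of the linkage pattern the GPU modules rely on after
this change. The layout of struct ggml_backend_api below is a placeholder (the
real table is not shown in these hunks and carries many more entries, e.g.
ggml_backend_tensor_get/set and ggml_backend_buffer_init); the idea is only that
every pointer handed across the host/DSO boundary is declared with GGML_ABI, so
the main executable and the separately compiled Metal/CUDA module agree on one
calling convention regardless of which compiler built each side.

#include <stddef.h>
#include <stdint.h>

struct ggml_tensor;                      /* opaque to this sketch */

#if defined(_WIN32)
#define GGML_ABI                         /* the MS ABI is already the default */
#else
#define GGML_ABI __attribute__((__ms_abi__))
#endif

/* illustrative subset of the function table shared with the GPU module */
struct ggml_backend_api {
    size_t  (GGML_ABI *ggml_nbytes)   (const struct ggml_tensor *);
    int64_t (GGML_ABI *ggml_nelements)(const struct ggml_tensor *);
    int64_t (GGML_ABI *ggml_nrows)    (const struct ggml_tensor *);
    void  *(*malloc)(size_t);
    void   (*free)(void *);
};

/* DSO side (cf. ggml-metal.m): stash the table once, then forward the ggml
   helpers through it instead of keeping private copies of them */
static const struct ggml_backend_api *g_backend;

void ggml_metal_link(const struct ggml_backend_api *backend_api) {
    g_backend = backend_api;
}

#define ggml_nbytes g_backend->ggml_nbytes

/* host side (cf. llamafile/metal.c and llamafile/cuda.c): right after
   cosmo_dlsym() resolves the symbols, hand the table over before calling
   any other entry point in the module:
       ggml_metal.ggml_metal_link(ggml_backend_api());                     */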