diff --git a/android/src/main/java/com/rnwhisper/WhisperContext.java b/android/src/main/java/com/rnwhisper/WhisperContext.java index cd8889b..17740a4 100644 --- a/android/src/main/java/com/rnwhisper/WhisperContext.java +++ b/android/src/main/java/com/rnwhisper/WhisperContext.java @@ -474,6 +474,8 @@ private int full(int jobId, ReadableMap options, float[] audioData, int audioDat options.hasKey("speedUp") ? options.getBoolean("speedUp") : false, // jboolean translate, options.hasKey("translate") ? options.getBoolean("translate") : false, + // jboolean tdrz_enable, + options.hasKey("tdrzEnable") ? options.getBoolean("tdrzEnable") : false, // jstring language, options.hasKey("language") ? options.getString("language") : "auto", // jstring prompt @@ -645,6 +647,7 @@ protected static native int fullTranscribe( int best_of, boolean speed_up, boolean translate, + boolean tdrz_enable, String language, String prompt, ProgressCallback progressCallback diff --git a/android/src/main/jni.cpp b/android/src/main/jni.cpp index 20e6f02..90e2ea5 100644 --- a/android/src/main/jni.cpp +++ b/android/src/main/jni.cpp @@ -232,6 +232,7 @@ Java_com_rnwhisper_WhisperContext_fullTranscribe( jint best_of, jboolean speed_up, jboolean translate, + jboolean tdrz_enable, jstring language, jstring prompt, jobject progress_callback_instance @@ -256,7 +257,7 @@ Java_com_rnwhisper_WhisperContext_fullTranscribe( params.print_realtime = false; params.print_progress = false; params.print_timestamps = false; - params.print_special = false; + params.print_special = true; params.translate = translate; const char *language_chars = env->GetStringUTFChars(language, nullptr); params.language = language_chars; @@ -265,6 +266,7 @@ Java_com_rnwhisper_WhisperContext_fullTranscribe( params.offset_ms = 0; params.no_context = true; params.single_segment = false; + params.tdrz_enable = tdrz_enable; if (max_len > -1) { params.max_len = max_len; diff --git a/cpp/coreml/whisper-encoder.mm b/cpp/coreml/whisper-encoder.mm index 6cd90ed..499edae 100644 --- a/cpp/coreml/whisper-encoder.mm +++ b/cpp/coreml/whisper-encoder.mm @@ -22,7 +22,13 @@ NSURL * url_model = [NSURL fileURLWithPath: path_model_str]; - const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model error:nil]); + // select which device to run the Core ML model on + MLModelConfiguration *config = [[MLModelConfiguration alloc] init]; + config.computeUnits = MLComputeUnitsCPUAndGPU; + //config.computeUnits = MLComputeUnitsCPUAndNeuralEngine; + //config.computeUnits = MLComputeUnitsAll; + + const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]); if (data == NULL) { return NULL; diff --git a/cpp/ggml-alloc.c b/cpp/ggml-alloc.c new file mode 100644 index 0000000..8c70ea4 --- /dev/null +++ b/cpp/ggml-alloc.c @@ -0,0 +1,633 @@ +#include "ggml-alloc.h" +#include "ggml.h" +#include +#include +#include +#include +#include + +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #include + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include + #include +#endif + + +#define UNUSED(x) (void)(x) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define GGML_MAX_CONCUR (2*GGML_MAX_NODES) + +//#define GGML_ALLOCATOR_DEBUG + +//#define AT_PRINTF printf +#define AT_PRINTF(...) 
((void)0) + +struct hash_node { + struct ggml_tensor * t; + int n_children; + int n_views; +}; + +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) { + size_t h = hash(t); + + // linear probing + size_t i = h; + while (hash_table[i].t != NULL) { + if (hash_table[i].t == t) { + return &hash_table[i]; + } + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // hash table is full + GGML_ASSERT(false); + } + } + + hash_table[i].t = t; + return &hash_table[i]; +} + +// TODO: GGML_PAD ? +static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { + assert(alignment && !(alignment & (alignment - 1))); // power of 2 + size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment; + return offset + align; +} + +struct free_block { + void * addr; + size_t size; +}; + +#define MAX_FREE_BLOCKS 128 + +struct ggml_allocr { + void * data; + size_t size; + size_t alignment; + int n_free_blocks; + struct free_block free_blocks[MAX_FREE_BLOCKS]; + struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; + size_t max_size; + bool measure; + int parse_seq[GGML_MAX_CONCUR]; + int parse_seq_len; + +#ifdef GGML_ALLOCATOR_DEBUG + struct ggml_tensor * allocated_tensors[1024]; +#endif +}; + +#ifdef GGML_ALLOCATOR_DEBUG +static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == NULL) { + alloc->allocated_tensors[i] = tensor; + return; + } + } + GGML_ASSERT(!"out of allocated_tensors"); +} +static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i] == tensor || + (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { + alloc->allocated_tensors[i] = NULL; + return; + } + } + printf("tried to free tensor %s not found\n", tensor->name); + GGML_ASSERT(!"tensor not found"); +} +#endif + +static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + return ggml_nbytes(tensor); + + UNUSED(alloc); +} + +// check if a tensor is allocated by this buffer +static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) { + void * ptr = tensor->data; + return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size; +} + +static bool ggml_is_view(struct ggml_tensor * t) { + return t->view_src != NULL; +} + +void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { +#ifdef GGML_ALLOCATOR_DEBUG + GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources + GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated +#endif + size_t size = ggml_allocr_get_alloc_size(alloc, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + + AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); + + size_t max_avail = 0; + + // find the best fitting free block besides the last block + int best_fit_block = -1; + size_t best_fit_size = SIZE_MAX; + for (int i = 0; i < alloc->n_free_blocks - 1; i++) { + struct free_block * block = &alloc->free_blocks[i]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size && block->size <= best_fit_size) { + best_fit_block = i; + best_fit_size = block->size; + } 
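
The loop above scans every free block except the last one and keeps the smallest block that still fits the aligned request. Below is a minimal standalone sketch of that best-fit selection together with the power-of-two alignment rounding used by aligned_offset(); the block sizes and the 32-byte alignment are made up purely for illustration and are not taken from this diff.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// round an offset up to the next multiple of a power-of-two alignment
// (equivalent to aligned_offset(NULL, offset, alignment) above)
static size_t align_up(size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1))); // power of 2
    return (offset + alignment - 1) & ~(alignment - 1);
}

struct block { size_t size; };

int main(void) {
    // hypothetical free list; the last entry plays the role of the big
    // "rest of the buffer" block that the real allocator keeps as a fallback
    struct block blocks[] = { {96}, {48}, {64}, {1 << 20} };
    const int n = 4;

    const size_t want = align_up(40, 32); // request 40 bytes, 32-byte aligned -> 64

    int    best      = -1;
    size_t best_size = SIZE_MAX;
    for (int i = 0; i < n - 1; i++) {     // best-fit scan, skipping the last block
        if (blocks[i].size >= want && blocks[i].size <= best_size) {
            best      = i;
            best_size = blocks[i].size;
        }
    }
    if (best == -1) best = n - 1;         // fall back to the tail block

    printf("request %zu bytes -> block %d (size %zu)\n", want, best, blocks[best].size);
    return 0;
}

The tail block (the untouched remainder of the buffer) is only used as a last resort, so holes left by earlier frees are reused before the allocator's high-water mark (max_size) has to grow.
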
+ } + + AT_PRINTF("block %d\n", best_fit_block); + + if (best_fit_block == -1) { + // the last block is our last resort + struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; + max_avail = MAX(max_avail, block->size); + if (block->size >= size) { + best_fit_block = alloc->n_free_blocks - 1; + } else { + fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", + __func__, size, max_avail); + GGML_ASSERT(!"not enough space in the buffer"); + return; + } + } + struct free_block * block = &alloc->free_blocks[best_fit_block]; + void * addr = block->addr; + block->addr = (char*)block->addr + size; + block->size -= size; + if (block->size == 0) { + // remove block if empty + alloc->n_free_blocks--; + for (int j = best_fit_block; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + + tensor->data = addr; + +#ifdef GGML_ALLOCATOR_DEBUG + add_allocated_tensor(alloc, tensor); + size_t cur_max = (char*)addr - (char*)alloc->data + size; + if (cur_max > alloc->max_size) { + printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + for (int i = 0; i < 1024; i++) { + if (alloc->allocated_tensors[i]) { + printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0); + } + } + printf("\n"); + } +#endif + + alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size); +} + +// this is a very naive implementation, but for our case the number of free blocks should be very small +static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { + void * ptr = tensor->data; + + if (ggml_allocr_is_own(alloc, tensor) == false) { + // the tensor was not allocated in this buffer + // this can happen because the graph allocator will try to free weights and other tensors from different buffers + // the easiest way to deal with this is just to ignore it + return; + } + + size_t size = ggml_allocr_get_alloc_size(alloc, tensor); + size = aligned_offset(NULL, size, alloc->alignment); + AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks); + +#ifdef GGML_ALLOCATOR_DEBUG + remove_allocated_tensor(alloc, tensor); +#endif + + // see if we can merge with an existing block + for (int i = 0; i < alloc->n_free_blocks; i++) { + struct free_block * block = &alloc->free_blocks[i]; + // check if ptr is at the end of the block + if ((char*)block->addr + block->size == ptr) { + block->size += size; + // check if we can merge with the next block + if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) { + block->size += alloc->free_blocks[i+1].size; + alloc->n_free_blocks--; + for (int j = i+1; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + // check if ptr is at the beginning of the block + if ((char*)ptr + size == block->addr) { + block->addr = ptr; + block->size += size; + // check if we can merge with the previous block + if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) { + alloc->free_blocks[i-1].size += block->size; + alloc->n_free_blocks--; + for (int j = i; j < alloc->n_free_blocks; j++) { + alloc->free_blocks[j] = alloc->free_blocks[j+1]; + } + } + return; + } + } + // otherwise, add a new block + GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); + // insert the new block 
in the correct position to keep the array sorted by address (to make merging blocks faster) + int insert_pos = 0; + while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) { + insert_pos++; + } + // shift all blocks from insert_pos onward to make room for the new block + for (int i = alloc->n_free_blocks; i > insert_pos; i--) { + alloc->free_blocks[i] = alloc->free_blocks[i-1]; + } + // insert the new block + alloc->free_blocks[insert_pos].addr = ptr; + alloc->free_blocks[insert_pos].size = size; + alloc->n_free_blocks++; +} + +void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) { + for (int i = 0; i < n; i++) { + alloc->parse_seq[i] = list[i]; + } + alloc->parse_seq_len = n; +} + +void ggml_allocr_reset(struct ggml_allocr * alloc) { + alloc->n_free_blocks = 1; + size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); + alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; + alloc->free_blocks[0].size = alloc->size - align_offset; +} + +struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); + + *alloc = (struct ggml_allocr){ + /*.data = */ data, + /*.size = */ size, + /*.alignment = */ alignment, + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.hash_table = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ false, + /*.parse_seq = */ {0}, + /*.parse_seq_len = */ 0, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {0}, +#endif + }; + + ggml_allocr_reset(alloc); + + return alloc; +} + +// OS specific functions to allocate and free uncommitted virtual memory +static void * alloc_vmem(size_t size) { +#if defined(_WIN32) + return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS); +#elif defined(_POSIX_MAPPED_FILES) + void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (ptr == MAP_FAILED) { + return NULL; + } + return ptr; +#else + // use a fixed address for other platforms + uintptr_t base_addr = (uintptr_t)-size - 0x100; + return (void *)base_addr; +#endif +} + +static void free_vmem(void * base_addr, size_t size) { +#if defined(_WIN32) + VirtualFree(base_addr, 0, MEM_RELEASE); + UNUSED(size); +#elif defined(_POSIX_MAPPED_FILES) + munmap(base_addr, size); +#else + // nothing to do + UNUSED(base_addr); + UNUSED(size); +#endif +} + +// allocate uncommitted virtual memory to measure the size of the graph +static void alloc_measure_vmem(void ** base_addr, size_t * size) { + // 128GB for 64-bit, 1GB for 32-bit + *size = sizeof(void *) == 4 ? 
1ULL<<30 : 1ULL<<37; + do { + *base_addr = alloc_vmem(*size); + if (*base_addr != NULL) { + AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr); + return; + } + // try again with half the size + *size /= 2; + } while (*size > 0); + + GGML_ASSERT(!"failed to allocate virtual memory for measure buffer"); +} + +static void free_measure_vmem(void * base_addr, size_t size) { + free_vmem(base_addr, size); +} + +struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); + + void * base_addr; + size_t size; + + alloc_measure_vmem(&base_addr, &size); + + *alloc = (struct ggml_allocr){ + /*.data = */ base_addr, + /*.size = */ size, + /*.alignment = */ alignment, + /*.n_free_blocks = */ 0, + /*.free_blocks = */ {{0}}, + /*.hash_table = */ {{0}}, + /*.max_size = */ 0, + /*.measure = */ true, + /*.parse_seq = */ {0}, + /*.parse_seq_len = */ 0, +#ifdef GGML_ALLOCATOR_DEBUG + /*.allocated_tensors = */ {0}, +#endif + }; + + ggml_allocr_reset(alloc); + + return alloc; +} + +void ggml_allocr_free(struct ggml_allocr * alloc) { + if (alloc->measure) { + free_measure_vmem(alloc->data, alloc->size); + } + free(alloc); +} + +bool ggml_allocr_is_measure(struct ggml_allocr * alloc) { + return alloc->measure; +} + +//////////// compute graph allocator + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +static bool ggml_op_can_inplace(enum ggml_op op) { + switch (op) { + case GGML_OP_SCALE: + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_UNARY: + case GGML_OP_ROPE: + case GGML_OP_RMS_NORM: + case GGML_OP_SOFT_MAX: + case GGML_OP_CONT: + return true; + + default: + return false; + } +} + +static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) { + struct hash_node * ht = alloc->hash_table; + if (node->data == NULL) { + if (ggml_is_view(node)) { + assert(node->view_src->data != NULL); + node->data = (char *)node->view_src->data + node->view_offs; + } else { + // see if we can reuse a parent's buffer (inplace) + if (ggml_op_can_inplace(node->op)) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor * parent = node->src[i]; + if (parent == NULL) { + break; + } + + // if the node's data is external, then we cannot re-use it + if (ggml_allocr_is_own(alloc, parent) == false) { + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + continue; + } + + struct hash_node * p_hn = hash_get(ht, parent); + if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = hash_get(ht, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite + // the parent's data that it will need 
later (same layout requirement). the problem is that then + // we cannot free the tensor because the original address of the allocation is lost. + // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views + // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + node->data = parent->data; + return; + } + } + else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); + node->data = parent->data; + return; + } + } + } + } + ggml_allocr_alloc(alloc, node); + } + } +} + +static size_t ggml_allocr_alloc_graph_tensors_n( + struct ggml_allocr * alloc, + struct ggml_cgraph ** graphs, int n_graphs, + struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { + + // reset hash table + struct hash_node * ht = alloc->hash_table; + memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE); + + // count number of children and views + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; + + if (ggml_is_view(node)) { + struct ggml_tensor * view_src = node->view_src; + hash_get(ht, view_src)->n_views += 1; + } + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + hash_get(ht, parent)->n_children += 1; + } + } + } + + // allocate tensors + for (int g = 0; g < n_graphs; g++) { + struct ggml_cgraph * gf = graphs[g]; + AT_PRINTF("####### graph %d/%d\n", g, n_graphs); + // graph inputs are allocated first to ensure that they are not overwritten by each other + if (inputs != NULL && inputs[g] != NULL) { + for (int i = 0; inputs[g][i] != NULL; i++) { + struct ggml_tensor * input = inputs[g][i]; + AT_PRINTF("input: %s\n", input->name); + allocate_node(alloc, input); + } + } + // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers + int last_barrier_pos = 0; + int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes; + + for (int ind = 0; ind < n_nodes; ind++) { + // allocate a node if there is no parse_seq or this is not a barrier + if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) { + int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind; + struct ggml_tensor * node = gf->nodes[i]; + + // allocate parents (leafs) + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + allocate_node(alloc, parent); + } + + // allocate node + allocate_node(alloc, node); + + AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); + } + } + AT_PRINTF("\n"); + } + + // update parents + // update immediately if there is no parse_seq + // update only at barriers if there is parse_seq + if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) { + int update_start = alloc->parse_seq_len ? last_barrier_pos : ind; + int update_end = alloc->parse_seq_len ? ind : ind + 1; + for (int i = update_start; i < update_end; i++) { + int node_i = alloc->parse_seq_len ? 
alloc->parse_seq[i] : i; + struct ggml_tensor * node = gf->nodes[node_i]; + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + struct hash_node * p_hn = hash_get(ht, parent); + p_hn->n_children -= 1; + + //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = hash_get(ht, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { + ggml_allocr_free_tensor(alloc, view_src); + } + } + else { + if (parent->data != node->data) { + ggml_allocr_free_tensor(alloc, parent); + } + } + } + } + } + AT_PRINTF("\n"); + if (alloc->parse_seq_len) { + last_barrier_pos = ind + 1; + } + } + } + // free graph outputs here that wouldn't be freed otherwise because they have no children + if (outputs != NULL && outputs[g] != NULL) { + for (int i = 0; outputs[g][i] != NULL; i++) { + struct ggml_tensor * output = outputs[g][i]; + AT_PRINTF("output: %s\n", output->name); + ggml_allocr_free_tensor(alloc, output); + } + } + } + + return alloc->max_size; +} + +size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { + return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); +} \ No newline at end of file diff --git a/cpp/ggml-alloc.h b/cpp/ggml-alloc.h new file mode 100644 index 0000000..f0d3648 --- /dev/null +++ b/cpp/ggml-alloc.h @@ -0,0 +1,26 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); +GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); + +// tell the allocator to parse nodes following the order described in the list +// you should call this if your graph are optimized to execute out-of-order +GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n); + +GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); +GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); +GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); +GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); +GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); + + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/cpp/ggml.c b/cpp/ggml.c index 4e2695d..2a806c7 100644 --- a/cpp/ggml.c +++ b/cpp/ggml.c @@ -1,9 +1,8 @@ -#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows #include "ggml.h" -#ifdef WSP_GGML_USE_K_QUANTS +#ifdef GGML_USE_K_QUANTS #include "k_quants.h" #endif @@ -25,21 +24,32 @@ #include #include #include +#include -#ifdef WSP_GGML_USE_METAL +#ifdef GGML_USE_METAL #include #endif +// static_assert should be a #define, but if it's not, +// fall back to the _Static_assert C11 keyword. 
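
Stepping back to the new ggml-alloc.h API introduced above: the allocator is normally driven in two passes, a "measure" pass over a throwaway graph to find the worst-case buffer size, then a real pass with a buffer of that size. The sketch below is only an illustration of that pattern; the one-op graph, tensor sizes, 32-byte alignment, and the surrounding ggml calls (ggml_init, ggml_new_tensor_1d, ggml_build_forward, ...) are assumptions based on the ggml API of the same vintage, not part of this diff.

#include "ggml.h"
#include "ggml-alloc.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // tensor data is placed by the allocator, not the context
    };

    // pass 1: measure how much memory the graph needs
    struct ggml_context * ctx0    = ggml_init(ip);
    struct ggml_allocr  * measure = ggml_allocr_new_measure(/*alignment =*/ 32);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1024);
    ggml_allocr_alloc(measure, a);   // graph inputs are placed explicitly
    ggml_allocr_alloc(measure, b);

    struct ggml_cgraph gf = ggml_build_forward(ggml_mul(ctx0, a, b));

    const size_t mem_needed = ggml_allocr_alloc_graph(measure, &gf);
    ggml_allocr_free(measure);
    ggml_free(ctx0);
    printf("graph needs %zu bytes\n", mem_needed);

    // pass 2: allocate a real buffer of that size (integrations typically add a
    // small alignment margin), make a real allocator, and rebuild the same graph
    void * buf = malloc(mem_needed);
    struct ggml_allocr * alloc = ggml_allocr_new(buf, mem_needed, /*alignment =*/ 32);
    // ... rebuild the graph with a fresh context, then ggml_allocr_alloc_graph(alloc, ...) ...
    ggml_allocr_free(alloc);
    free(buf);
    return 0;
}
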
// if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 #ifndef static_assert +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) +#define static_assert(cond, msg) _Static_assert(cond, msg) +#else #define static_assert(cond, msg) struct global_scope_noop_trick #endif +#endif #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) #pragma warning(disable: 4244 4267) + +// disable POSIX deprecation warnigns +// these functions are never going away, anyway +#pragma warning(disable: 4996) #endif #if defined(_WIN32) @@ -49,23 +59,23 @@ typedef volatile LONG atomic_int; typedef atomic_int atomic_bool; -static void atomic_store(atomic_int* ptr, LONG val) { +static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } -static LONG atomic_load(atomic_int* ptr) { +static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); } -static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { +static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) { return InterlockedExchangeAdd(ptr, inc); } -static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { +static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } typedef HANDLE pthread_t; typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { +static int pthread_create(pthread_t * out, void * unused, thread_ret_t(*func)(void *), void * arg) { (void) unused; HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); if (handle == NULL) @@ -77,7 +87,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void return 0; } -static int pthread_join(pthread_t thread, void* unused) { +static int pthread_join(pthread_t thread, void * unused) { (void) unused; return (int) WaitForSingleObject(thread, INFINITE); } @@ -90,12 +100,15 @@ static int sched_yield (void) { #include #include -typedef void* thread_ret_t; +typedef void * thread_ret_t; #include #include #include +#endif +#ifdef GGML_USE_CPU_HBM +#include #endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -111,93 +124,91 @@ typedef void* thread_ret_t; #endif #endif -#ifdef __HAIKU__ -#define static_assert(cond, msg) _Static_assert(cond, msg) -#endif - -/*#define WSP_GGML_PERF*/ -#define WSP_GGML_DEBUG 0 -#define WSP_GGML_GELU_FP16 -#define WSP_GGML_GELU_QUICK_FP16 -#define WSP_GGML_SILU_FP16 +/*#define GGML_PERF*/ +#define GGML_DEBUG 0 +#define GGML_GELU_FP16 +#define GGML_GELU_QUICK_FP16 +#define GGML_SILU_FP16 +// #define GGML_CROSS_ENTROPY_EXP_FP16 +// #define GGML_FLASH_ATTN_EXP_FP16 -#define WSP_GGML_SOFT_MAX_UNROLL 4 -#define WSP_GGML_VEC_DOT_UNROLL 2 +#define GGML_SOFT_MAX_UNROLL 4 +#define GGML_VEC_DOT_UNROLL 2 // // logging // -#if (WSP_GGML_DEBUG >= 1) -#define WSP_GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) #else -#define WSP_GGML_PRINT_DEBUG(...) +#define GGML_PRINT_DEBUG(...) #endif -#if (WSP_GGML_DEBUG >= 5) -#define WSP_GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) #else -#define WSP_GGML_PRINT_DEBUG_5(...) +#define GGML_PRINT_DEBUG_5(...) #endif -#if (WSP_GGML_DEBUG >= 10) -#define WSP_GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) 
printf(__VA_ARGS__) #else -#define WSP_GGML_PRINT_DEBUG_10(...) +#define GGML_PRINT_DEBUG_10(...) #endif -#define WSP_GGML_PRINT(...) printf(__VA_ARGS__) +#define GGML_PRINT(...) printf(__VA_ARGS__) -#ifdef WSP_GGML_USE_ACCELERATE +#ifdef GGML_USE_ACCELERATE // uncomment to use vDSP for soft max computation // note: not sure if it is actually faster -//#define WSP_GGML_SOFT_MAX_ACCELERATE -#endif - -#if UINTPTR_MAX == 0xFFFFFFFF - #define WSP_GGML_MEM_ALIGN 4 -#else - #define WSP_GGML_MEM_ALIGN 16 +//#define GGML_SOFT_MAX_ACCELERATE #endif // // logging // -#if (WSP_GGML_DEBUG >= 1) -#define WSP_GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) #else -#define WSP_GGML_PRINT_DEBUG(...) +#define GGML_PRINT_DEBUG(...) #endif -#if (WSP_GGML_DEBUG >= 5) -#define WSP_GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) #else -#define WSP_GGML_PRINT_DEBUG_5(...) +#define GGML_PRINT_DEBUG_5(...) #endif -#if (WSP_GGML_DEBUG >= 10) -#define WSP_GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) #else -#define WSP_GGML_PRINT_DEBUG_10(...) +#define GGML_PRINT_DEBUG_10(...) #endif -#define WSP_GGML_PRINT(...) printf(__VA_ARGS__) +#define GGML_PRINT(...) printf(__VA_ARGS__) // // end of logging block // #if defined(_MSC_VER) || defined(__MINGW32__) -#define WSP_GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, WSP_GGML_MEM_ALIGN) -#define WSP_GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) +#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) +#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else -inline static void* wsp_ggml_aligned_malloc(size_t size) { - void* aligned_memory = NULL; -#ifdef WSP_GGML_USE_METAL - int result = posix_memalign(&aligned_memory, getpagesize(), size); +inline static void * ggml_aligned_malloc(size_t size) { + if (size == 0) { + GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); + return NULL; + } + void * aligned_memory = NULL; +#ifdef GGML_USE_CPU_HBM + int result = hbw_posix_memalign(&aligned_memory, 16, size); +#elif GGML_USE_METAL + int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); #else - int result = posix_memalign(&aligned_memory, WSP_GGML_MEM_ALIGN, size); + int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); #endif if (result != 0) { // Handle allocation failure @@ -210,47 +221,54 @@ inline static void* wsp_ggml_aligned_malloc(size_t size) { error_desc = "insufficient memory"; break; } - WSP_GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", - __func__, error_desc, size/(1024.0*1024.0)); + GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); return NULL; } return aligned_memory; } -#define WSP_GGML_ALIGNED_MALLOC(size) wsp_ggml_aligned_malloc(size) -#define WSP_GGML_ALIGNED_FREE(ptr) free(ptr) +#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) +#ifdef GGML_USE_CPU_HBM +#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) +#else +#define GGML_ALIGNED_FREE(ptr) free(ptr) +#endif #endif -#define UNUSED WSP_GGML_UNUSED +#define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) // // tensor access macros // -#define WSP_GGML_TENSOR_UNARY_OP_LOCALS \ - WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ - WSP_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ - 
WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ - WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); - -#define WSP_GGML_TENSOR_BINARY_OP_LOCALS \ - WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ - WSP_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ - WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \ - WSP_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \ - WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ - WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); - -#if defined(WSP_GGML_USE_ACCELERATE) +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + +#if defined(GGML_USE_ACCELERATE) #include -#if defined(WSP_GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions +#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions #include "ggml-opencl.h" #endif -#elif defined(WSP_GGML_USE_OPENBLAS) +#elif defined(GGML_USE_OPENBLAS) +#if defined(GGML_BLAS_USE_MKL) +#include +#else #include -#elif defined(WSP_GGML_USE_CUBLAS) +#endif +#elif defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" -#elif defined(WSP_GGML_USE_CLBLAST) +#elif defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" #endif @@ -260,7 +278,7 @@ inline static void* wsp_ggml_aligned_malloc(size_t size) { #define MAX(a, b) ((a) > (b) ? (a) : (b)) // floating point type used to accumulate sums -typedef double wsp_ggml_float; +typedef double ggml_float; // 16-bit float // on Arm, we use __fp16 @@ -273,11 +291,11 @@ typedef double wsp_ggml_float; // #include -#define WSP_GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) -#define WSP_GGML_COMPUTE_FP32_TO_FP16(x) (x) +#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) +#define GGML_COMPUTE_FP32_TO_FP16(x) (x) -#define WSP_GGML_FP16_TO_FP32(x) ((float) (x)) -#define WSP_GGML_FP32_TO_FP16(x) (x) +#define GGML_FP16_TO_FP32(x) ((float) (x)) +#define GGML_FP32_TO_FP16(x) (x) #else @@ -293,31 +311,37 @@ typedef double wsp_ggml_float; #include #else #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) +#if !defined(__riscv) #include #endif #endif #endif #endif +#endif + +#ifdef __riscv_v_intrinsic +#include +#endif #ifdef __F16C__ #ifdef _MSC_VER -#define WSP_GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define WSP_GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) +#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) +#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) #else -#define WSP_GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define WSP_GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) #endif #elif defined(__POWER9_VECTOR__) -#define WSP_GGML_COMPUTE_FP16_TO_FP32(x) wsp_ggml_compute_fp16_to_fp32(x) -#define WSP_GGML_COMPUTE_FP32_TO_FP16(x) wsp_ggml_compute_fp32_to_fp16(x) +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) 
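
On x86 targets with F16C, the GGML_COMPUTE_FP16_TO_FP32 / GGML_COMPUTE_FP32_TO_FP16 macros earlier in this hunk reduce to a pair of intrinsics. A small standalone round-trip demo of those exact intrinsics (build with -mf16c; the sample value is arbitrary but exactly representable in half precision):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const float f = 3.140625f;  // fits in fp16 without rounding

    // float -> half: pack, convert with round-to-nearest (imm 0), extract low 16 bits
    const uint16_t h = (uint16_t) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(f), 0), 0);

    // half -> float: widen the 16-bit pattern and convert back
    const float back = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(h)));

    printf("f=%f  h=0x%04x  back=%f\n", f, h, back);
    return 0;
}
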
/* the inline asm below is about 12% faster than the lookup method */ -#define WSP_GGML_FP16_TO_FP32(x) WSP_GGML_COMPUTE_FP16_TO_FP32(x) -#define WSP_GGML_FP32_TO_FP16(x) WSP_GGML_COMPUTE_FP32_TO_FP16(x) +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -static inline float wsp_ggml_compute_fp16_to_fp32(wsp_ggml_fp16_t h) { +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { register float f; register double d; __asm__( @@ -330,9 +354,9 @@ static inline float wsp_ggml_compute_fp16_to_fp32(wsp_ggml_fp16_t h) { return f; } -static inline wsp_ggml_fp16_t wsp_ggml_compute_fp32_to_fp16(float f) { +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { register double d; - register wsp_ggml_fp16_t r; + register ggml_fp16_t r; __asm__( /* xscvdphp can work on double or single precision */ "xscvdphp %0,%2\n" "mffprd %1,%0\n" : @@ -365,7 +389,7 @@ static inline uint32_t fp32_to_bits(float f) { return fp32.as_bits; } -static inline float wsp_ggml_compute_fp16_to_fp32(wsp_ggml_fp16_t h) { +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { const uint32_t w = (uint32_t) h << 16; const uint32_t sign = w & UINT32_C(0x80000000); const uint32_t two_w = w + w; @@ -388,7 +412,7 @@ static inline float wsp_ggml_compute_fp16_to_fp32(wsp_ggml_fp16_t h) { return fp32_from_bits(result); } -static inline wsp_ggml_fp16_t wsp_ggml_compute_fp32_to_fp16(float f) { +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) const float scale_to_inf = 0x1.0p+112f; const float scale_to_zero = 0x1.0p-110f; @@ -414,8 +438,8 @@ static inline wsp_ggml_fp16_t wsp_ggml_compute_fp32_to_fp16(float f) { return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); } -#define WSP_GGML_COMPUTE_FP16_TO_FP32(x) wsp_ggml_compute_fp16_to_fp32(x) -#define WSP_GGML_COMPUTE_FP32_TO_FP16(x) wsp_ggml_compute_fp32_to_fp16(x) +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #endif // __F16C__ @@ -426,16 +450,16 @@ static inline wsp_ggml_fp16_t wsp_ggml_compute_fp32_to_fp16(float f) { // // precomputed gelu table for f16 (128 KB) -static wsp_ggml_fp16_t table_gelu_f16[1 << 16]; +static ggml_fp16_t table_gelu_f16[1 << 16]; // precomputed quick gelu table for f16 (128 KB) -static wsp_ggml_fp16_t table_gelu_quick_f16[1 << 16]; +static ggml_fp16_t table_gelu_quick_f16[1 << 16]; // precomputed silu table for f16 (128 KB) -static wsp_ggml_fp16_t table_silu_f16[1 << 16]; +static ggml_fp16_t table_silu_f16[1 << 16]; // precomputed exp table for f16 (128 KB) -static wsp_ggml_fp16_t table_exp_f16[1 << 16]; +static ggml_fp16_t table_exp_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) static float table_f32_f16[1 << 16]; @@ -455,40 +479,40 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into wsp_ggml_lookup_fp16_to_fp32, -// so we define WSP_GGML_FP16_TO_FP32 and WSP_GGML_FP32_TO_FP16 elsewhere for NEON. +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. // This is also true for POWER9. 
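
When no native conversion is available, ggml falls back to the 64K-entry table_f32_f16 lookup referred to in the comment above (the table is filled once at init time from GGML_COMPUTE_FP16_TO_FP32, and conversion then costs a single array read). The standalone sketch below illustrates the idea; the decoder used to fill the table here is a simplified stand-in that ignores NaN payloads, not the converter from this diff.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static float table_f32_f16[1 << 16]; // one float per possible fp16 bit pattern (256 KB)

// simplified IEEE-754 half-precision decoder: 1 sign, 5 exponent, 10 mantissa bits
static float fp16_bits_to_float(uint16_t h) {
    const uint32_t sign = (h >> 15) & 1;
    const uint32_t exp  = (h >> 10) & 0x1f;
    const uint32_t man  =  h        & 0x3ff;
    float v;
    if (exp == 0)       v = ldexpf((float) man, -24);                // subnormal
    else if (exp == 31) v = man ? NAN : INFINITY;                    // inf / nan
    else                v = ldexpf((float)(man | 0x400), (int) exp - 25); // normal
    return sign ? -v : v;
}

int main(void) {
    for (uint32_t i = 0; i < (1u << 16); ++i) {
        table_f32_f16[i] = fp16_bits_to_float((uint16_t) i);
    }
    // 0x4248 is the half-precision bit pattern of 3.140625
    printf("0x4248 -> %f\n", table_f32_f16[0x4248]);
    return 0;
}
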
-#if !defined(WSP_GGML_FP16_TO_FP32) || !defined(WSP_GGML_FP32_TO_FP16) +#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) -inline static float wsp_ggml_lookup_fp16_to_fp32(wsp_ggml_fp16_t f) { +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { uint16_t s; memcpy(&s, &f, sizeof(uint16_t)); return table_f32_f16[s]; } -#define WSP_GGML_FP16_TO_FP32(x) wsp_ggml_lookup_fp16_to_fp32(x) -#define WSP_GGML_FP32_TO_FP16(x) WSP_GGML_COMPUTE_FP32_TO_FP16(x) +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API -float wsp_ggml_fp16_to_fp32(wsp_ggml_fp16_t x) { - return (float) WSP_GGML_FP16_TO_FP32(x); +float ggml_fp16_to_fp32(ggml_fp16_t x) { + return (float) GGML_FP16_TO_FP32(x); } -wsp_ggml_fp16_t wsp_ggml_fp32_to_fp16(float x) { - return WSP_GGML_FP32_TO_FP16(x); +ggml_fp16_t ggml_fp32_to_fp16(float x) { + return GGML_FP32_TO_FP16(x); } -void wsp_ggml_fp16_to_fp32_row(const wsp_ggml_fp16_t * x, float * y, size_t n) { - for (size_t i = 0; i < n; i++) { - y[i] = WSP_GGML_FP16_TO_FP32(x[i]); +void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) { + for (int i = 0; i < n; i++) { + y[i] = GGML_FP16_TO_FP32(x[i]); } } -void wsp_ggml_fp32_to_fp16_row(const float * x, wsp_ggml_fp16_t * y, size_t n) { - size_t i = 0; +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { + int i = 0; #if defined(__F16C__) for (; i + 7 < n; i += 8) { __m256 x_vec = _mm256_loadu_ps(x + i); @@ -502,7 +526,7 @@ void wsp_ggml_fp32_to_fp16_row(const float * x, wsp_ggml_fp16_t * y, size_t n) { } #endif for (; i < n; i++) { - y[i] = WSP_GGML_FP32_TO_FP16(x[i]); + y[i] = GGML_FP32_TO_FP16(x[i]); } } @@ -512,7 +536,7 @@ void wsp_ggml_fp32_to_fp16_row(const float * x, wsp_ggml_fp16_t * y, size_t n) { #if defined(_MSC_VER) || defined(__MINGW32__) static int64_t timer_freq, timer_start; -void wsp_ggml_time_init(void) { +void ggml_time_init(void) { LARGE_INTEGER t; QueryPerformanceFrequency(&t); timer_freq = t.QuadPart; @@ -523,49 +547,49 @@ void wsp_ggml_time_init(void) { QueryPerformanceCounter(&t); timer_start = t.QuadPart; } -int64_t wsp_ggml_time_ms(void) { +int64_t ggml_time_ms(void) { LARGE_INTEGER t; QueryPerformanceCounter(&t); return ((t.QuadPart-timer_start) * 1000) / timer_freq; } -int64_t wsp_ggml_time_us(void) { +int64_t ggml_time_us(void) { LARGE_INTEGER t; QueryPerformanceCounter(&t); return ((t.QuadPart-timer_start) * 1000000) / timer_freq; } #else -void wsp_ggml_time_init(void) {} -int64_t wsp_ggml_time_ms(void) { +void ggml_time_init(void) {} +int64_t ggml_time_ms(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; } -int64_t wsp_ggml_time_us(void) { +int64_t ggml_time_us(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; } #endif -int64_t wsp_ggml_cycles(void) { +int64_t ggml_cycles(void) { return clock(); } -int64_t wsp_ggml_cycles_per_ms(void) { +int64_t ggml_cycles_per_ms(void) { return CLOCKS_PER_SEC/1000; } -#ifdef WSP_GGML_PERF -#define wsp_ggml_perf_time_ms() wsp_ggml_time_ms() -#define wsp_ggml_perf_time_us() wsp_ggml_time_us() -#define wsp_ggml_perf_cycles() wsp_ggml_cycles() -#define wsp_ggml_perf_cycles_per_ms() wsp_ggml_cycles_per_ms() +#ifdef GGML_PERF +#define ggml_perf_time_ms() ggml_time_ms() +#define ggml_perf_time_us() ggml_time_us() 
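
The non-Windows timers above are thin wrappers over clock_gettime(CLOCK_MONOTONIC), and the GGML_PERF macros then compile either to those timers or to zero. A standalone POSIX-only sketch of the same microsecond helper, timing a dummy loop:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

// monotonic wall-clock time in microseconds, same arithmetic as ggml_time_us()
static int64_t time_us(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t) ts.tv_sec*1000000 + (int64_t) ts.tv_nsec/1000;
}

int main(void) {
    const int64_t t0 = time_us();

    volatile double acc = 0.0;                      // volatile: keep the work alive
    for (int i = 0; i < 1000000; ++i) acc += i*0.5; // dummy workload

    const int64_t t1 = time_us();
    printf("loop took %lld us (acc=%f)\n", (long long)(t1 - t0), (double) acc);
    return 0;
}
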
+#define ggml_perf_cycles() ggml_cycles() +#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() #else -#define wsp_ggml_perf_time_ms() 0 -#define wsp_ggml_perf_time_us() 0 -#define wsp_ggml_perf_cycles() 0 -#define wsp_ggml_perf_cycles_per_ms() 0 +#define ggml_perf_time_ms() 0 +#define ggml_perf_time_us() 0 +#define ggml_perf_cycles() 0 +#define ggml_perf_cycles_per_ms() 0 #endif @@ -811,46 +835,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #if !defined(__aarch64__) -inline static uint16_t vaddvq_u8(uint8x16_t v) { - return - (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) + - (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) + - (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) + - (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) + - (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) + - (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) + - (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) + - (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); -} - -inline static int16_t vaddvq_s8(int8x16_t v) { - return - (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) + - (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) + - (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) + - (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) + - (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) + - (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) + - (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) + - (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15); -} - -inline static int32_t vaddvq_s16(int16x8_t v) { - return - (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + - (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + - (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + - (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); -} - -inline static uint32_t vaddvq_u16(uint16x8_t v) { - return - (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) + - (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) + - (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) + - (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7); -} - inline static int32_t vaddvq_s32(int32x4_t v) { return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); } @@ -859,12 +843,6 @@ inline static float vaddvq_f32(float32x4_t v) { return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); } -inline static float vminvq_f32(float32x4_t v) { - return - MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - inline static float vmaxvq_f32(float32x4_t v) { return MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), @@ -887,42 +865,42 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { #define QK4_0 32 typedef struct { - wsp_ggml_fp16_t d; // delta + ggml_fp16_t d; // delta uint8_t qs[QK4_0 / 2]; // nibbles / quants } block_q4_0; -static_assert(sizeof(block_q4_0) == sizeof(wsp_ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); +static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); #define QK4_1 32 typedef struct { - wsp_ggml_fp16_t d; // delta - wsp_ggml_fp16_t m; // min + ggml_fp16_t d; // delta + ggml_fp16_t m; // min uint8_t qs[QK4_1 / 2]; // 
nibbles / quants } block_q4_1; -static_assert(sizeof(block_q4_1) == 2 * sizeof(wsp_ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); +static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); #define QK5_0 32 typedef struct { - wsp_ggml_fp16_t d; // delta + ggml_fp16_t d; // delta uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_0 / 2]; // nibbles / quants } block_q5_0; -static_assert(sizeof(block_q5_0) == sizeof(wsp_ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); +static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); #define QK5_1 32 typedef struct { - wsp_ggml_fp16_t d; // delta - wsp_ggml_fp16_t m; // min + ggml_fp16_t d; // delta + ggml_fp16_t m; // min uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; -static_assert(sizeof(block_q5_1) == 2 * sizeof(wsp_ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); #define QK8_0 32 typedef struct { - wsp_ggml_fp16_t d; // delta + ggml_fp16_t d; // delta int8_t qs[QK8_0]; // quants } block_q8_0; -static_assert(sizeof(block_q8_0) == sizeof(wsp_ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); +static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); #define QK8_1 32 typedef struct { @@ -955,7 +933,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r const float d = max / -8; const float id = d ? 1.0f/d : 0.0f; - y[i].d = WSP_GGML_FP32_TO_FP16(d); + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < qk/2; ++j) { const float x0 = x[i*qk + 0 + j]*id; @@ -995,8 +973,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = WSP_GGML_FP32_TO_FP16(d); - y[i].m = WSP_GGML_FP32_TO_FP16(min); + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); for (int j = 0; j < qk/2; ++j) { const float x0 = (x[i*qk + 0 + j] - min)*id; @@ -1037,7 +1015,7 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r const float d = max / -16; const float id = d ? 1.0f/d : 0.0f; - y[i].d = WSP_GGML_FP32_TO_FP16(d); + y[i].d = GGML_FP32_TO_FP16(d); uint32_t qh = 0; @@ -1084,8 +1062,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r const float d = (max - min) / ((1 << 5) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = WSP_GGML_FP32_TO_FP16(d); - y[i].m = WSP_GGML_FP32_TO_FP16(min); + y[i].d = GGML_FP32_TO_FP16(d); + y[i].m = GGML_FP32_TO_FP16(min); uint32_t qh = 0; @@ -1127,7 +1105,7 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = WSP_GGML_FP32_TO_FP16(d); + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < QK8_0; ++j) { const float x0 = x[i*QK8_0 + j]*id; @@ -1162,7 +1140,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int const float d = amax / ((1 << 7) - 1); const float id = d ? 
1.0f/d : 0.0f; - y[i].d = WSP_GGML_FP32_TO_FP16(d); + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const float32x4_t v = vmulq_n_f32(srcv[j], id); @@ -1195,7 +1173,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = WSP_GGML_FP32_TO_FP16(d); + y[i].d = GGML_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); @@ -1230,7 +1208,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int // Quantize these floats const float d = maxScalar / 127.f; - y[i].d = WSP_GGML_FP32_TO_FP16(d); + y[i].d = GGML_FP32_TO_FP16(d); const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); @@ -1523,7 +1501,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = WSP_GGML_FP16_TO_FP32(x[i].d); + const float d = GGML_FP16_TO_FP32(x[i].d); for (int j = 0; j < qk/2; ++j) { const int x0 = (x[i].qs[j] & 0x0F) - 8; @@ -1543,8 +1521,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = WSP_GGML_FP16_TO_FP32(x[i].d); - const float m = WSP_GGML_FP16_TO_FP32(x[i].m); + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); for (int j = 0; j < qk/2; ++j) { const int x0 = (x[i].qs[j] & 0x0F); @@ -1564,7 +1542,7 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = WSP_GGML_FP16_TO_FP32(x[i].d); + const float d = GGML_FP16_TO_FP32(x[i].d); uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); @@ -1590,8 +1568,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict const int nb = k / qk; for (int i = 0; i < nb; i++) { - const float d = WSP_GGML_FP16_TO_FP32(x[i].d); - const float m = WSP_GGML_FP16_TO_FP32(x[i].m); + const float d = GGML_FP16_TO_FP32(x[i].d); + const float m = GGML_FP16_TO_FP32(x[i].m); uint32_t qh; memcpy(&qh, x[i].qh, sizeof(qh)); @@ -1619,7 +1597,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in const block_q8_0 * restrict x = vx; for (int i = 0; i < nb; i++) { - const float d = WSP_GGML_FP16_TO_FP32(x[i].d); + const float d = GGML_FP16_TO_FP32(x[i].d); for (int j = 0; j < qk; ++j) { y[i*qk + j] = x[i].qs[j]*d; @@ -1627,109 +1605,186 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in } } -static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void wsp_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void wsp_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void wsp_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * 
restrict y); +static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static const quantize_fns_t quantize_fns[WSP_GGML_TYPE_COUNT] = { - [WSP_GGML_TYPE_Q4_0] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = wsp_ggml_vec_dot_q4_0_q8_0, - .vec_dot_type = WSP_GGML_TYPE_Q8_0, +static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, + }, + [GGML_TYPE_I16] = { + .type_name = "i16", + .blck_size = 1, + .type_size = sizeof(int16_t), + .is_quantized = false, + }, + [GGML_TYPE_I32] = { + .type_name = "i32", + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, + }, + [GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, + .vec_dot_type = GGML_TYPE_F32, }, - [WSP_GGML_TYPE_Q4_1] = { - .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = wsp_ggml_vec_dot_q4_1_q8_1, - .vec_dot_type = WSP_GGML_TYPE_Q8_1, + [GGML_TYPE_F16] = { + .type_name = "f16", + .blck_size = 1, + .type_size = sizeof(ggml_fp16_t), + .is_quantized = false, + .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, + .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, + .vec_dot_type = GGML_TYPE_F16, }, - [WSP_GGML_TYPE_Q5_0] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_0, - .quantize_row_q = quantize_row_q5_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = wsp_ggml_vec_dot_q5_0_q8_0, - .vec_dot_type = WSP_GGML_TYPE_Q8_0, + [GGML_TYPE_Q4_0] = { + .type_name = "q4_0", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_0, + .from_float = quantize_row_q4_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, + .vec_dot = ggml_vec_dot_q4_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, }, - [WSP_GGML_TYPE_Q5_1] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_1, - .quantize_row_q = quantize_row_q5_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = wsp_ggml_vec_dot_q5_1_q8_1, - .vec_dot_type = WSP_GGML_TYPE_Q8_1, + [GGML_TYPE_Q4_1] = { + .type_name = "q4_1", + .blck_size = QK4_1, + .type_size = sizeof(block_q4_1), + .is_quantized = true, + .to_float = 
(ggml_to_float_t) dequantize_row_q4_1, + .from_float = quantize_row_q4_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, + .vec_dot = ggml_vec_dot_q4_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, }, - [WSP_GGML_TYPE_Q8_0] = { - .dequantize_row_q = dequantize_row_q8_0, - .quantize_row_q = quantize_row_q8_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = wsp_ggml_vec_dot_q8_0_q8_0, - .vec_dot_type = WSP_GGML_TYPE_Q8_0, + [GGML_TYPE_Q5_0] = { + .type_name = "q5_0", + .blck_size = QK5_0, + .type_size = sizeof(block_q5_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_0, + .from_float = quantize_row_q5_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, + .vec_dot = ggml_vec_dot_q5_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, }, - [WSP_GGML_TYPE_Q8_1] = { - .dequantize_row_q = NULL, // TODO - .quantize_row_q = quantize_row_q8_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference, - .quantize_row_q_dot = quantize_row_q8_1, - .vec_dot_q = NULL, // TODO - .vec_dot_type = WSP_GGML_TYPE_Q8_1, + [GGML_TYPE_Q5_1] = { + .type_name = "q5_1", + .blck_size = QK5_1, + .type_size = sizeof(block_q5_1), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_1, + .from_float = quantize_row_q5_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, + .vec_dot = ggml_vec_dot_q5_1_q8_1, + .vec_dot_type = GGML_TYPE_Q8_1, }, -#ifdef WSP_GGML_USE_K_QUANTS - [WSP_GGML_TYPE_Q2_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q2_K, - .quantize_row_q = quantize_row_q2_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = wsp_ggml_vec_dot_q2_K_q8_K, - .vec_dot_type = WSP_GGML_TYPE_Q8_K, + [GGML_TYPE_Q8_0] = { + .type_name = "q8_0", + .blck_size = QK8_0, + .type_size = sizeof(block_q8_0), + .is_quantized = true, + .to_float = dequantize_row_q8_0, + .from_float = quantize_row_q8_0, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, + .vec_dot = ggml_vec_dot_q8_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, }, - [WSP_GGML_TYPE_Q3_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q3_K, - .quantize_row_q = quantize_row_q3_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = wsp_ggml_vec_dot_q3_K_q8_K, - .vec_dot_type = WSP_GGML_TYPE_Q8_K, + [GGML_TYPE_Q8_1] = { + .type_name = "q8_1", + .blck_size = QK8_1, + .type_size = sizeof(block_q8_1), + .is_quantized = true, + .from_float = quantize_row_q8_1, + .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, + .vec_dot_type = GGML_TYPE_Q8_1, }, - [WSP_GGML_TYPE_Q4_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_K, - .quantize_row_q = quantize_row_q4_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = wsp_ggml_vec_dot_q4_K_q8_K, - .vec_dot_type = WSP_GGML_TYPE_Q8_K, +#ifdef GGML_USE_K_QUANTS + [GGML_TYPE_Q2_K] = { + .type_name = "q2_K", + .blck_size = QK_K, + .type_size = sizeof(block_q2_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_K, + .from_float = quantize_row_q2_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, + .vec_dot = ggml_vec_dot_q2_K_q8_K, + 
.vec_dot_type = GGML_TYPE_Q8_K, }, - [WSP_GGML_TYPE_Q5_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q5_K, - .quantize_row_q = quantize_row_q5_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = wsp_ggml_vec_dot_q5_K_q8_K, - .vec_dot_type = WSP_GGML_TYPE_Q8_K, + [GGML_TYPE_Q3_K] = { + .type_name = "q3_K", + .blck_size = QK_K, + .type_size = sizeof(block_q3_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q3_K, + .from_float = quantize_row_q3_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, + .vec_dot = ggml_vec_dot_q3_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, }, - [WSP_GGML_TYPE_Q6_K] = { - .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q6_K, - .quantize_row_q = quantize_row_q6_K, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q6_K_reference, - .quantize_row_q_dot = quantize_row_q8_K, - .vec_dot_q = wsp_ggml_vec_dot_q6_K_q8_K, - .vec_dot_type = WSP_GGML_TYPE_Q8_K, + [GGML_TYPE_Q4_K] = { + .type_name = "q4_K", + .blck_size = QK_K, + .type_size = sizeof(block_q4_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q4_K, + .from_float = quantize_row_q4_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, + .vec_dot = ggml_vec_dot_q4_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, }, + [GGML_TYPE_Q5_K] = { + .type_name = "q5_K", + .blck_size = QK_K, + .type_size = sizeof(block_q5_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q5_K, + .from_float = quantize_row_q5_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, + .vec_dot = ggml_vec_dot_q5_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q6_K] = { + .type_name = "q6_K", + .blck_size = QK_K, + .type_size = sizeof(block_q6_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float = quantize_row_q6_K, + .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, + .vec_dot = ggml_vec_dot_q6_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, + [GGML_TYPE_Q8_K] = { + .type_name = "q8_K", + .blck_size = QK_K, + .type_size = sizeof(block_q8_K), + .is_quantized = true, + .from_float = quantize_row_q8_K, + } #endif }; // For internal test use -quantize_fns_t wsp_ggml_internal_get_quantize_fn(size_t i) { - WSP_GGML_ASSERT(i < WSP_GGML_TYPE_COUNT); - return quantize_fns[i]; +ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { + GGML_ASSERT(type < GGML_TYPE_COUNT); + return type_traits[type]; } @@ -1741,34 +1796,34 @@ quantize_fns_t wsp_ggml_internal_get_quantize_fn(size_t i) { // we then implement the fundamental computation operations below using only these macros // adding support for new architectures requires to define the corresponding SIMD macros // -// WSP_GGML_F32_STEP / WSP_GGML_F16_STEP +// GGML_F32_STEP / GGML_F16_STEP // number of elements to process in a single step // -// WSP_GGML_F32_EPR / WSP_GGML_F16_EPR +// GGML_F32_EPR / GGML_F16_EPR // number of elements to fit in a single register // #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) -#define WSP_GGML_SIMD +#define GGML_SIMD // F32 NEON -#define WSP_GGML_F32_STEP 16 -#define WSP_GGML_F32_EPR 4 - -#define WSP_GGML_F32x4 float32x4_t -#define WSP_GGML_F32x4_ZERO vdupq_n_f32(0.0f) -#define WSP_GGML_F32x4_SET1(x) vdupq_n_f32(x) -#define WSP_GGML_F32x4_LOAD vld1q_f32 -#define WSP_GGML_F32x4_STORE vst1q_f32 -#define 
WSP_GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) -#define WSP_GGML_F32x4_ADD vaddq_f32 -#define WSP_GGML_F32x4_MUL vmulq_f32 -#define WSP_GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#define WSP_GGML_F32x4_REDUCE(res, x) \ +#define GGML_F32_STEP 16 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 float32x4_t +#define GGML_F32x4_ZERO vdupq_n_f32(0.0f) +#define GGML_F32x4_SET1(x) vdupq_n_f32(x) +#define GGML_F32x4_LOAD vld1q_f32 +#define GGML_F32x4_STORE vst1q_f32 +#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) +#define GGML_F32x4_ADD vaddq_f32 +#define GGML_F32x4_MUL vmulq_f32 +#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) +#define GGML_F32x4_REDUCE(res, x) \ { \ - int offset = WSP_GGML_F32_ARR >> 1; \ + int offset = GGML_F32_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = vaddq_f32(x[i], x[offset+i]); \ } \ @@ -1780,36 +1835,36 @@ quantize_fns_t wsp_ggml_internal_get_quantize_fn(size_t i) { for (int i = 0; i < offset; ++i) { \ x[i] = vaddq_f32(x[i], x[offset+i]); \ } \ - res = WSP_GGML_F32x4_REDUCE_ONE(x[0]); \ + res = GGML_F32x4_REDUCE_ONE(x[0]); \ } -#define WSP_GGML_F32_VEC WSP_GGML_F32x4 -#define WSP_GGML_F32_VEC_ZERO WSP_GGML_F32x4_ZERO -#define WSP_GGML_F32_VEC_SET1 WSP_GGML_F32x4_SET1 -#define WSP_GGML_F32_VEC_LOAD WSP_GGML_F32x4_LOAD -#define WSP_GGML_F32_VEC_STORE WSP_GGML_F32x4_STORE -#define WSP_GGML_F32_VEC_FMA WSP_GGML_F32x4_FMA -#define WSP_GGML_F32_VEC_ADD WSP_GGML_F32x4_ADD -#define WSP_GGML_F32_VEC_MUL WSP_GGML_F32x4_MUL -#define WSP_GGML_F32_VEC_REDUCE WSP_GGML_F32x4_REDUCE +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE // F16 NEON #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #define WSP_GGML_F16_STEP 32 - #define WSP_GGML_F16_EPR 8 - - #define WSP_GGML_F16x8 float16x8_t - #define WSP_GGML_F16x8_ZERO vdupq_n_f16(0.0f) - #define WSP_GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define WSP_GGML_F16x8_LOAD vld1q_f16 - #define WSP_GGML_F16x8_STORE vst1q_f16 - #define WSP_GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) - #define WSP_GGML_F16x8_ADD vaddq_f16 - #define WSP_GGML_F16x8_MUL vmulq_f16 - #define WSP_GGML_F16x8_REDUCE(res, x) \ + #define GGML_F16_STEP 32 + #define GGML_F16_EPR 8 + + #define GGML_F16x8 float16x8_t + #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) + #define GGML_F16x8_SET1(x) vdupq_n_f16(x) + #define GGML_F16x8_LOAD vld1q_f16 + #define GGML_F16x8_STORE vst1q_f16 + #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) + #define GGML_F16x8_ADD vaddq_f16 + #define GGML_F16x8_MUL vmulq_f16 + #define GGML_F16x8_REDUCE(res, x) \ { \ - int offset = WSP_GGML_F16_ARR >> 1; \ + int offset = GGML_F16_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = vaddq_f16(x[i], x[offset+i]); \ } \ @@ -1823,70 +1878,70 @@ quantize_fns_t wsp_ggml_internal_get_quantize_fn(size_t i) { } \ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ - res = (wsp_ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ - } - - #define WSP_GGML_F16_VEC WSP_GGML_F16x8 - #define WSP_GGML_F16_VEC_ZERO WSP_GGML_F16x8_ZERO - #define WSP_GGML_F16_VEC_SET1 WSP_GGML_F16x8_SET1 - #define WSP_GGML_F16_VEC_LOAD(p, i) WSP_GGML_F16x8_LOAD(p) - #define WSP_GGML_F16_VEC_STORE(p, r, i) WSP_GGML_F16x8_STORE(p, r[i]) - #define 
WSP_GGML_F16_VEC_FMA WSP_GGML_F16x8_FMA - #define WSP_GGML_F16_VEC_ADD WSP_GGML_F16x8_ADD - #define WSP_GGML_F16_VEC_MUL WSP_GGML_F16x8_MUL - #define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F16x8_REDUCE + res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + } + + #define GGML_F16_VEC GGML_F16x8 + #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO + #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 + #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i]) + #define GGML_F16_VEC_FMA GGML_F16x8_FMA + #define GGML_F16_VEC_ADD GGML_F16x8_ADD + #define GGML_F16_VEC_MUL GGML_F16x8_MUL + #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE #else // if FP16 vector arithmetic is not supported, we use FP32 instead // and take advantage of the vcvt_ functions to convert to/from FP16 - #define WSP_GGML_F16_STEP 16 - #define WSP_GGML_F16_EPR 4 - - #define WSP_GGML_F32Cx4 float32x4_t - #define WSP_GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) - #define WSP_GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define WSP_GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) - #define WSP_GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) - #define WSP_GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) - #define WSP_GGML_F32Cx4_ADD vaddq_f32 - #define WSP_GGML_F32Cx4_MUL vmulq_f32 - #define WSP_GGML_F32Cx4_REDUCE WSP_GGML_F32x4_REDUCE - - #define WSP_GGML_F16_VEC WSP_GGML_F32Cx4 - #define WSP_GGML_F16_VEC_ZERO WSP_GGML_F32Cx4_ZERO - #define WSP_GGML_F16_VEC_SET1 WSP_GGML_F32Cx4_SET1 - #define WSP_GGML_F16_VEC_LOAD(p, i) WSP_GGML_F32Cx4_LOAD(p) - #define WSP_GGML_F16_VEC_STORE(p, r, i) WSP_GGML_F32Cx4_STORE(p, r[i]) - #define WSP_GGML_F16_VEC_FMA WSP_GGML_F32Cx4_FMA - #define WSP_GGML_F16_VEC_ADD WSP_GGML_F32Cx4_ADD - #define WSP_GGML_F16_VEC_MUL WSP_GGML_F32Cx4_MUL - #define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F32Cx4_REDUCE + #define GGML_F16_STEP 16 + #define GGML_F16_EPR 4 + + #define GGML_F32Cx4 float32x4_t + #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) + #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) + #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) + #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) + #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) + #define GGML_F32Cx4_ADD vaddq_f32 + #define GGML_F32Cx4_MUL vmulq_f32 + #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + + #define GGML_F16_VEC GGML_F32Cx4 + #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO + #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 + #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) + #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA + #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD + #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL + #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE #endif #elif defined(__AVX__) -#define WSP_GGML_SIMD +#define GGML_SIMD // F32 AVX -#define WSP_GGML_F32_STEP 32 -#define WSP_GGML_F32_EPR 8 +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 8 -#define WSP_GGML_F32x8 __m256 -#define WSP_GGML_F32x8_ZERO _mm256_setzero_ps() -#define WSP_GGML_F32x8_SET1(x) _mm256_set1_ps(x) -#define WSP_GGML_F32x8_LOAD _mm256_loadu_ps -#define WSP_GGML_F32x8_STORE _mm256_storeu_ps +#define GGML_F32x8 __m256 +#define GGML_F32x8_ZERO _mm256_setzero_ps() +#define GGML_F32x8_SET1(x) _mm256_set1_ps(x) +#define GGML_F32x8_LOAD _mm256_loadu_ps +#define GGML_F32x8_STORE _mm256_storeu_ps #if defined(__FMA__) - #define WSP_GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) + #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) #else - #define WSP_GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, 
c), a) + #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) #endif -#define WSP_GGML_F32x8_ADD _mm256_add_ps -#define WSP_GGML_F32x8_MUL _mm256_mul_ps -#define WSP_GGML_F32x8_REDUCE(res, x) \ +#define GGML_F32x8_ADD _mm256_add_ps +#define GGML_F32x8_MUL _mm256_mul_ps +#define GGML_F32x8_REDUCE(res, x) \ { \ - int offset = WSP_GGML_F32_ARR >> 1; \ + int offset = GGML_F32_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = _mm256_add_ps(x[i], x[offset+i]); \ } \ @@ -1905,88 +1960,88 @@ quantize_fns_t wsp_ggml_internal_get_quantize_fn(size_t i) { } // TODO: is this optimal ? -#define WSP_GGML_F32_VEC WSP_GGML_F32x8 -#define WSP_GGML_F32_VEC_ZERO WSP_GGML_F32x8_ZERO -#define WSP_GGML_F32_VEC_SET1 WSP_GGML_F32x8_SET1 -#define WSP_GGML_F32_VEC_LOAD WSP_GGML_F32x8_LOAD -#define WSP_GGML_F32_VEC_STORE WSP_GGML_F32x8_STORE -#define WSP_GGML_F32_VEC_FMA WSP_GGML_F32x8_FMA -#define WSP_GGML_F32_VEC_ADD WSP_GGML_F32x8_ADD -#define WSP_GGML_F32_VEC_MUL WSP_GGML_F32x8_MUL -#define WSP_GGML_F32_VEC_REDUCE WSP_GGML_F32x8_REDUCE +#define GGML_F32_VEC GGML_F32x8 +#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x8_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD +#define GGML_F32_VEC_STORE GGML_F32x8_STORE +#define GGML_F32_VEC_FMA GGML_F32x8_FMA +#define GGML_F32_VEC_ADD GGML_F32x8_ADD +#define GGML_F32_VEC_MUL GGML_F32x8_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE // F16 AVX -#define WSP_GGML_F16_STEP 32 -#define WSP_GGML_F16_EPR 8 +#define GGML_F16_STEP 32 +#define GGML_F16_EPR 8 // F16 arithmetic is not supported by AVX, so we use F32 instead -#define WSP_GGML_F32Cx8 __m256 -#define WSP_GGML_F32Cx8_ZERO _mm256_setzero_ps() -#define WSP_GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) +#define GGML_F32Cx8 __m256 +#define GGML_F32Cx8_ZERO _mm256_setzero_ps() +#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) #if defined(__F16C__) // the _mm256_cvt intrinsics require F16C -#define WSP_GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) -#define WSP_GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) +#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) +#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) #else -static inline __m256 __avx_f32cx8_load(wsp_ggml_fp16_t *x) { +static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { float tmp[8]; for (int i = 0; i < 8; i++) { - tmp[i] = WSP_GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); } -static inline void __avx_f32cx8_store(wsp_ggml_fp16_t *x, __m256 y) { +static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { float arr[8]; _mm256_storeu_ps(arr, y); for (int i = 0; i < 8; i++) - x[i] = WSP_GGML_FP32_TO_FP16(arr[i]); + x[i] = GGML_FP32_TO_FP16(arr[i]); } -#define WSP_GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) -#define WSP_GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) +#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) +#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) #endif -#define WSP_GGML_F32Cx8_FMA WSP_GGML_F32x8_FMA -#define WSP_GGML_F32Cx8_ADD _mm256_add_ps -#define WSP_GGML_F32Cx8_MUL _mm256_mul_ps -#define WSP_GGML_F32Cx8_REDUCE WSP_GGML_F32x8_REDUCE - -#define WSP_GGML_F16_VEC WSP_GGML_F32Cx8 -#define WSP_GGML_F16_VEC_ZERO WSP_GGML_F32Cx8_ZERO -#define WSP_GGML_F16_VEC_SET1 WSP_GGML_F32Cx8_SET1 -#define WSP_GGML_F16_VEC_LOAD(p, i) WSP_GGML_F32Cx8_LOAD(p) -#define WSP_GGML_F16_VEC_STORE(p, r, i) WSP_GGML_F32Cx8_STORE(p, r[i]) -#define 
WSP_GGML_F16_VEC_FMA WSP_GGML_F32Cx8_FMA -#define WSP_GGML_F16_VEC_ADD WSP_GGML_F32Cx8_ADD -#define WSP_GGML_F16_VEC_MUL WSP_GGML_F32Cx8_MUL -#define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F32Cx8_REDUCE +#define GGML_F32Cx8_FMA GGML_F32x8_FMA +#define GGML_F32Cx8_ADD _mm256_add_ps +#define GGML_F32Cx8_MUL _mm256_mul_ps +#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE + +#define GGML_F16_VEC GGML_F32Cx8 +#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE #elif defined(__POWER9_VECTOR__) -#define WSP_GGML_SIMD +#define GGML_SIMD // F32 POWER9 -#define WSP_GGML_F32_STEP 32 -#define WSP_GGML_F32_EPR 4 - -#define WSP_GGML_F32x4 vector float -#define WSP_GGML_F32x4_ZERO 0.0f -#define WSP_GGML_F32x4_SET1 vec_splats -#define WSP_GGML_F32x4_LOAD(p) vec_xl(0, p) -#define WSP_GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) -#define WSP_GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) -#define WSP_GGML_F32x4_ADD vec_add -#define WSP_GGML_F32x4_MUL vec_mul -#define WSP_GGML_F32x4_REDUCE(res, x) \ +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 vector float +#define GGML_F32x4_ZERO 0.0f +#define GGML_F32x4_SET1 vec_splats +#define GGML_F32x4_LOAD(p) vec_xl(0, p) +#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) +#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) +#define GGML_F32x4_ADD vec_add +#define GGML_F32x4_MUL vec_mul +#define GGML_F32x4_REDUCE(res, x) \ { \ - int offset = WSP_GGML_F32_ARR >> 1; \ + int offset = GGML_F32_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = vec_add(x[i], x[offset+i]); \ } \ @@ -2004,55 +2059,55 @@ static inline void __avx_f32cx8_store(wsp_ggml_fp16_t *x, __m256 y) { vec_extract(x[0], 3); \ } -#define WSP_GGML_F32_VEC WSP_GGML_F32x4 -#define WSP_GGML_F32_VEC_ZERO WSP_GGML_F32x4_ZERO -#define WSP_GGML_F32_VEC_SET1 WSP_GGML_F32x4_SET1 -#define WSP_GGML_F32_VEC_LOAD WSP_GGML_F32x4_LOAD -#define WSP_GGML_F32_VEC_STORE WSP_GGML_F32x4_STORE -#define WSP_GGML_F32_VEC_FMA WSP_GGML_F32x4_FMA -#define WSP_GGML_F32_VEC_ADD WSP_GGML_F32x4_ADD -#define WSP_GGML_F32_VEC_MUL WSP_GGML_F32x4_MUL -#define WSP_GGML_F32_VEC_REDUCE WSP_GGML_F32x4_REDUCE +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE // F16 POWER9 -#define WSP_GGML_F16_STEP WSP_GGML_F32_STEP -#define WSP_GGML_F16_EPR WSP_GGML_F32_EPR -#define WSP_GGML_F16_VEC WSP_GGML_F32x4 -#define WSP_GGML_F16_VEC_ZERO WSP_GGML_F32x4_ZERO -#define WSP_GGML_F16_VEC_SET1 WSP_GGML_F32x4_SET1 -#define WSP_GGML_F16_VEC_FMA WSP_GGML_F32x4_FMA -#define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F32x4_REDUCE +#define GGML_F16_STEP GGML_F32_STEP +#define GGML_F16_EPR GGML_F32_EPR +#define GGML_F16_VEC GGML_F32x4 +#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F16_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F16_VEC_FMA GGML_F32x4_FMA +#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE // Use vec_xl, not vec_ld, in case the load address is not aligned. -#define WSP_GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? 
\ - vec_extract_fp32_from_shorth(vec_xl(0, p - WSP_GGML_F16_EPR)) : \ +#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ + vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \ vec_extract_fp32_from_shortl(vec_xl(0, p)) -#define WSP_GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] -#define WSP_GGML_F16_VEC_STORE(p, r, i) \ +#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] +#define GGML_F16_VEC_STORE(p, r, i) \ if (i & 0x1) \ - vec_xst(vec_pack_to_short_fp32(r[i - WSP_GGML_ENDIAN_BYTE(1)], \ - r[i - WSP_GGML_ENDIAN_BYTE(0)]), \ - 0, p - WSP_GGML_F16_EPR) + vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \ + r[i - GGML_ENDIAN_BYTE(0)]), \ + 0, p - GGML_F16_EPR) #elif defined(__wasm_simd128__) -#define WSP_GGML_SIMD +#define GGML_SIMD // F32 WASM -#define WSP_GGML_F32_STEP 16 -#define WSP_GGML_F32_EPR 4 - -#define WSP_GGML_F32x4 v128_t -#define WSP_GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) -#define WSP_GGML_F32x4_SET1(x) wasm_f32x4_splat(x) -#define WSP_GGML_F32x4_LOAD wasm_v128_load -#define WSP_GGML_F32x4_STORE wasm_v128_store -#define WSP_GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) -#define WSP_GGML_F32x4_ADD wasm_f32x4_add -#define WSP_GGML_F32x4_MUL wasm_f32x4_mul -#define WSP_GGML_F32x4_REDUCE(res, x) \ +#define GGML_F32_STEP 16 +#define GGML_F32_EPR 4 + +#define GGML_F32x4 v128_t +#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_F32x4_LOAD wasm_v128_load +#define GGML_F32x4_STORE wasm_v128_store +#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) +#define GGML_F32x4_ADD wasm_f32x4_add +#define GGML_F32x4_MUL wasm_f32x4_mul +#define GGML_F32x4_REDUCE(res, x) \ { \ - int offset = WSP_GGML_F32_ARR >> 1; \ + int offset = GGML_F32_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ } \ @@ -2070,54 +2125,54 @@ static inline void __avx_f32cx8_store(wsp_ggml_fp16_t *x, __m256 y) { wasm_f32x4_extract_lane(x[0], 3); \ } -#define WSP_GGML_F32_VEC WSP_GGML_F32x4 -#define WSP_GGML_F32_VEC_ZERO WSP_GGML_F32x4_ZERO -#define WSP_GGML_F32_VEC_SET1 WSP_GGML_F32x4_SET1 -#define WSP_GGML_F32_VEC_LOAD WSP_GGML_F32x4_LOAD -#define WSP_GGML_F32_VEC_STORE WSP_GGML_F32x4_STORE -#define WSP_GGML_F32_VEC_FMA WSP_GGML_F32x4_FMA -#define WSP_GGML_F32_VEC_ADD WSP_GGML_F32x4_ADD -#define WSP_GGML_F32_VEC_MUL WSP_GGML_F32x4_MUL -#define WSP_GGML_F32_VEC_REDUCE WSP_GGML_F32x4_REDUCE +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE // F16 WASM -#define WSP_GGML_F16_STEP 16 -#define WSP_GGML_F16_EPR 4 +#define GGML_F16_STEP 16 +#define GGML_F16_EPR 4 -inline static v128_t __wasm_f16x4_load(const wsp_ggml_fp16_t * p) { +inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { float tmp[4]; - tmp[0] = WSP_GGML_FP16_TO_FP32(p[0]); - tmp[1] = WSP_GGML_FP16_TO_FP32(p[1]); - tmp[2] = WSP_GGML_FP16_TO_FP32(p[2]); - tmp[3] = WSP_GGML_FP16_TO_FP32(p[3]); + tmp[0] = GGML_FP16_TO_FP32(p[0]); + tmp[1] = GGML_FP16_TO_FP32(p[1]); + tmp[2] = GGML_FP16_TO_FP32(p[2]); + tmp[3] = GGML_FP16_TO_FP32(p[3]); return wasm_v128_load(tmp); } -inline static void __wasm_f16x4_store(wsp_ggml_fp16_t * p, v128_t x) { +inline static void 
__wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { float tmp[4]; wasm_v128_store(tmp, x); - p[0] = WSP_GGML_FP32_TO_FP16(tmp[0]); - p[1] = WSP_GGML_FP32_TO_FP16(tmp[1]); - p[2] = WSP_GGML_FP32_TO_FP16(tmp[2]); - p[3] = WSP_GGML_FP32_TO_FP16(tmp[3]); -} - -#define WSP_GGML_F16x4 v128_t -#define WSP_GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) -#define WSP_GGML_F16x4_SET1(x) wasm_f32x4_splat(x) -#define WSP_GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) -#define WSP_GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) -#define WSP_GGML_F16x4_FMA WSP_GGML_F32x4_FMA -#define WSP_GGML_F16x4_ADD wasm_f32x4_add -#define WSP_GGML_F16x4_MUL wasm_f32x4_mul -#define WSP_GGML_F16x4_REDUCE(res, x) \ + p[0] = GGML_FP32_TO_FP16(tmp[0]); + p[1] = GGML_FP32_TO_FP16(tmp[1]); + p[2] = GGML_FP32_TO_FP16(tmp[2]); + p[3] = GGML_FP32_TO_FP16(tmp[3]); +} + +#define GGML_F16x4 v128_t +#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) +#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x) +#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) +#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) +#define GGML_F16x4_FMA GGML_F32x4_FMA +#define GGML_F16x4_ADD wasm_f32x4_add +#define GGML_F16x4_MUL wasm_f32x4_mul +#define GGML_F16x4_REDUCE(res, x) \ { \ - int offset = WSP_GGML_F16_ARR >> 1; \ + int offset = GGML_F16_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ } \ @@ -2135,41 +2190,41 @@ inline static void __wasm_f16x4_store(wsp_ggml_fp16_t * p, v128_t x) { wasm_f32x4_extract_lane(x[0], 3); \ } -#define WSP_GGML_F16_VEC WSP_GGML_F16x4 -#define WSP_GGML_F16_VEC_ZERO WSP_GGML_F16x4_ZERO -#define WSP_GGML_F16_VEC_SET1 WSP_GGML_F16x4_SET1 -#define WSP_GGML_F16_VEC_LOAD(p, i) WSP_GGML_F16x4_LOAD(p) -#define WSP_GGML_F16_VEC_STORE(p, r, i) WSP_GGML_F16x4_STORE(p, r[i]) -#define WSP_GGML_F16_VEC_FMA WSP_GGML_F16x4_FMA -#define WSP_GGML_F16_VEC_ADD WSP_GGML_F16x4_ADD -#define WSP_GGML_F16_VEC_MUL WSP_GGML_F16x4_MUL -#define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F16x4_REDUCE +#define GGML_F16_VEC GGML_F16x4 +#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO +#define GGML_F16_VEC_SET1 GGML_F16x4_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F16x4_FMA +#define GGML_F16_VEC_ADD GGML_F16x4_ADD +#define GGML_F16_VEC_MUL GGML_F16x4_MUL +#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE #elif defined(__SSE3__) -#define WSP_GGML_SIMD +#define GGML_SIMD // F32 SSE -#define WSP_GGML_F32_STEP 32 -#define WSP_GGML_F32_EPR 4 +#define GGML_F32_STEP 32 +#define GGML_F32_EPR 4 -#define WSP_GGML_F32x4 __m128 -#define WSP_GGML_F32x4_ZERO _mm_setzero_ps() -#define WSP_GGML_F32x4_SET1(x) _mm_set1_ps(x) -#define WSP_GGML_F32x4_LOAD _mm_loadu_ps -#define WSP_GGML_F32x4_STORE _mm_storeu_ps +#define GGML_F32x4 __m128 +#define GGML_F32x4_ZERO _mm_setzero_ps() +#define GGML_F32x4_SET1(x) _mm_set1_ps(x) +#define GGML_F32x4_LOAD _mm_loadu_ps +#define GGML_F32x4_STORE _mm_storeu_ps #if defined(__FMA__) // TODO: Does this work? 
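// Sanity check for the TODO above, assuming the documented Intel semantics
// _mm_fmadd_ps(a, b, c) == a*b + c: both branches of GGML_F32x4_FMA(a, b, c)
// evaluate to b*c + a.
//   with FMA:    _mm_fmadd_ps(b, c, a)           = b*c + a  (single rounding)
//   without FMA: _mm_add_ps(_mm_mul_ps(b, c), a) = b*c + a  (rounded twice)
// The two can differ slightly in the last bits, but the macro itself is correct.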
- #define WSP_GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) + #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) #else - #define WSP_GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) + #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) #endif -#define WSP_GGML_F32x4_ADD _mm_add_ps -#define WSP_GGML_F32x4_MUL _mm_mul_ps -#define WSP_GGML_F32x4_REDUCE(res, x) \ +#define GGML_F32x4_ADD _mm_add_ps +#define GGML_F32x4_MUL _mm_mul_ps +#define GGML_F32x4_REDUCE(res, x) \ { \ - int offset = WSP_GGML_F32_ARR >> 1; \ + int offset = GGML_F32_ARR >> 1; \ for (int i = 0; i < offset; ++i) { \ x[i] = _mm_add_ps(x[i], x[offset+i]); \ } \ @@ -2186,116 +2241,116 @@ inline static void __wasm_f16x4_store(wsp_ggml_fp16_t * p, v128_t x) { } // TODO: is this optimal ? -#define WSP_GGML_F32_VEC WSP_GGML_F32x4 -#define WSP_GGML_F32_VEC_ZERO WSP_GGML_F32x4_ZERO -#define WSP_GGML_F32_VEC_SET1 WSP_GGML_F32x4_SET1 -#define WSP_GGML_F32_VEC_LOAD WSP_GGML_F32x4_LOAD -#define WSP_GGML_F32_VEC_STORE WSP_GGML_F32x4_STORE -#define WSP_GGML_F32_VEC_FMA WSP_GGML_F32x4_FMA -#define WSP_GGML_F32_VEC_ADD WSP_GGML_F32x4_ADD -#define WSP_GGML_F32_VEC_MUL WSP_GGML_F32x4_MUL -#define WSP_GGML_F32_VEC_REDUCE WSP_GGML_F32x4_REDUCE +#define GGML_F32_VEC GGML_F32x4 +#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO +#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 +#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD +#define GGML_F32_VEC_STORE GGML_F32x4_STORE +#define GGML_F32_VEC_FMA GGML_F32x4_FMA +#define GGML_F32_VEC_ADD GGML_F32x4_ADD +#define GGML_F32_VEC_MUL GGML_F32x4_MUL +#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE // F16 SSE -#define WSP_GGML_F16_STEP 32 -#define WSP_GGML_F16_EPR 4 +#define GGML_F16_STEP 32 +#define GGML_F16_EPR 4 -static inline __m128 __sse_f16x4_load(wsp_ggml_fp16_t *x) { +static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) { float tmp[4]; - tmp[0] = WSP_GGML_FP16_TO_FP32(x[0]); - tmp[1] = WSP_GGML_FP16_TO_FP32(x[1]); - tmp[2] = WSP_GGML_FP16_TO_FP32(x[2]); - tmp[3] = WSP_GGML_FP16_TO_FP32(x[3]); + tmp[0] = GGML_FP16_TO_FP32(x[0]); + tmp[1] = GGML_FP16_TO_FP32(x[1]); + tmp[2] = GGML_FP16_TO_FP32(x[2]); + tmp[3] = GGML_FP16_TO_FP32(x[3]); return _mm_loadu_ps(tmp); } -static inline void __sse_f16x4_store(wsp_ggml_fp16_t *x, __m128 y) { +static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) { float arr[4]; _mm_storeu_ps(arr, y); - x[0] = WSP_GGML_FP32_TO_FP16(arr[0]); - x[1] = WSP_GGML_FP32_TO_FP16(arr[1]); - x[2] = WSP_GGML_FP32_TO_FP16(arr[2]); - x[3] = WSP_GGML_FP32_TO_FP16(arr[3]); -} - -#define WSP_GGML_F32Cx4 __m128 -#define WSP_GGML_F32Cx4_ZERO _mm_setzero_ps() -#define WSP_GGML_F32Cx4_SET1(x) _mm_set1_ps(x) -#define WSP_GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) -#define WSP_GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) -#define WSP_GGML_F32Cx4_FMA WSP_GGML_F32x4_FMA -#define WSP_GGML_F32Cx4_ADD _mm_add_ps -#define WSP_GGML_F32Cx4_MUL _mm_mul_ps -#define WSP_GGML_F32Cx4_REDUCE WSP_GGML_F32x4_REDUCE - -#define WSP_GGML_F16_VEC WSP_GGML_F32Cx4 -#define WSP_GGML_F16_VEC_ZERO WSP_GGML_F32Cx4_ZERO -#define WSP_GGML_F16_VEC_SET1 WSP_GGML_F32Cx4_SET1 -#define WSP_GGML_F16_VEC_LOAD(p, i) WSP_GGML_F32Cx4_LOAD(p) -#define WSP_GGML_F16_VEC_STORE(p, r, i) WSP_GGML_F32Cx4_STORE(p, r[i]) -#define WSP_GGML_F16_VEC_FMA WSP_GGML_F32Cx4_FMA -#define WSP_GGML_F16_VEC_ADD WSP_GGML_F32Cx4_ADD -#define WSP_GGML_F16_VEC_MUL WSP_GGML_F32Cx4_MUL -#define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F32Cx4_REDUCE + x[0] = GGML_FP32_TO_FP16(arr[0]); + x[1] = GGML_FP32_TO_FP16(arr[1]); + x[2] = GGML_FP32_TO_FP16(arr[2]); + x[3] = 
GGML_FP32_TO_FP16(arr[3]); +} + +#define GGML_F32Cx4 __m128 +#define GGML_F32Cx4_ZERO _mm_setzero_ps() +#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x) +#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) +#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) +#define GGML_F32Cx4_FMA GGML_F32x4_FMA +#define GGML_F32Cx4_ADD _mm_add_ps +#define GGML_F32Cx4_MUL _mm_mul_ps +#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE + +#define GGML_F16_VEC GGML_F32Cx4 +#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO +#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 +#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) +#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) +#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA +#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD +#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL +#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE #endif -// WSP_GGML_F32_ARR / WSP_GGML_F16_ARR +// GGML_F32_ARR / GGML_F16_ARR // number of registers to use per step -#ifdef WSP_GGML_SIMD -#define WSP_GGML_F32_ARR (WSP_GGML_F32_STEP/WSP_GGML_F32_EPR) -#define WSP_GGML_F16_ARR (WSP_GGML_F16_STEP/WSP_GGML_F16_EPR) +#ifdef GGML_SIMD +#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) +#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #endif // // fundamental operations // -inline static void wsp_ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void wsp_ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void wsp_ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void wsp_ggml_vec_set_f16(const int n, wsp_ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void wsp_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } -inline static void wsp_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } -inline static void wsp_ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } -inline static void wsp_ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } -inline static void wsp_ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } -inline static void wsp_ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void wsp_ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } -inline static void wsp_ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } -inline static void wsp_ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } -inline static void wsp_ggml_vec_div_f32 (const int n, float * z, const 
float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } +inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } +inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } +inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } +inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } +inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } +inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } +inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } +inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -inline static void wsp_ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { -#ifdef WSP_GGML_SIMD +static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +#ifdef GGML_SIMD float sumf = 0.0f; - const int np = (n & ~(WSP_GGML_F32_STEP - 1)); + const int np = (n & ~(GGML_F32_STEP - 1)); - WSP_GGML_F32_VEC sum[WSP_GGML_F32_ARR] = { WSP_GGML_F32_VEC_ZERO }; + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; - WSP_GGML_F32_VEC ax[WSP_GGML_F32_ARR]; - WSP_GGML_F32_VEC ay[WSP_GGML_F32_ARR]; + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; - for (int i = 0; i < np; i += WSP_GGML_F32_STEP) { - for (int j = 0; j < WSP_GGML_F32_ARR; j++) { - ax[j] = WSP_GGML_F32_VEC_LOAD(x + i + j*WSP_GGML_F32_EPR); - ay[j] = WSP_GGML_F32_VEC_LOAD(y + i + j*WSP_GGML_F32_EPR); + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - sum[j] = WSP_GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); + sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); } } // reduce sum0..sum3 to sum0 - WSP_GGML_F32_VEC_REDUCE(sumf, sum); + GGML_F32_VEC_REDUCE(sumf, sum); // leftovers for (int i = np; i < n; ++i) { @@ -2303,57 +2358,56 @@ inline static void wsp_ggml_vec_dot_f32(const int n, float * restrict s, const f } #else // scalar - wsp_ggml_float sumf = 0.0; + ggml_float sumf = 0.0; for (int i = 0; i < n; ++i) { - sumf += (wsp_ggml_float)(x[i]*y[i]); + sumf += (ggml_float)(x[i]*y[i]); } #endif *s = sumf; } -inline static void wsp_ggml_vec_dot_f16(const int n, float * restrict s, wsp_ggml_fp16_t * restrict x, wsp_ggml_fp16_t * restrict y) { - wsp_ggml_float sumf = 0.0; +static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { + ggml_float sumf = 0.0; -#if defined(WSP_GGML_SIMD) - const int np = (n & ~(WSP_GGML_F16_STEP - 1)); +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); - WSP_GGML_F16_VEC 
sum[WSP_GGML_F16_ARR] = { WSP_GGML_F16_VEC_ZERO }; + GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; - WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR]; - WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR]; + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; - for (int i = 0; i < np; i += WSP_GGML_F16_STEP) { - for (int j = 0; j < WSP_GGML_F16_ARR; j++) { - ax[j] = WSP_GGML_F16_VEC_LOAD(x + i + j*WSP_GGML_F16_EPR, j); - ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j); + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - sum[j] = WSP_GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); + sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); } } // reduce sum0..sum3 to sum0 - WSP_GGML_F16_VEC_REDUCE(sumf, sum); + GGML_F16_VEC_REDUCE(sumf, sum); // leftovers for (int i = np; i < n; ++i) { - sumf += (wsp_ggml_float)(WSP_GGML_FP16_TO_FP32(x[i])*WSP_GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); } #else for (int i = 0; i < n; ++i) { - sumf += (wsp_ggml_float)(WSP_GGML_FP16_TO_FP32(x[i])*WSP_GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); } #endif *s = sumf; } -static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q4_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -2362,6 +2416,7 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q4_0 * restrict x0 = &x[i + 0]; const block_q4_0 * restrict x1 = &x[i + 1]; @@ -2397,8 +2452,8 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), WSP_GGML_FP16_TO_FP32(x0->d)*WSP_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), WSP_GGML_FP16_TO_FP32(x1->d)*WSP_GGML_FP16_TO_FP32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l)); @@ -2415,8 +2470,8 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), WSP_GGML_FP16_TO_FP32(x0->d)*WSP_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), WSP_GGML_FP16_TO_FP32(x1->d)*WSP_GGML_FP16_TO_FP32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), 
GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #endif } @@ -2428,7 +2483,7 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo // Main loop for (int i = 0; i < nb; ++i) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps( WSP_GGML_FP16_TO_FP32(x[i].d) * WSP_GGML_FP16_TO_FP32(y[i].d) ); + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); __m256i bx = bytes_from_nibbles_32(x[i].qs); @@ -2452,7 +2507,7 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo // Main loop for (int i = 0; i < nb; ++i) { // Compute combined scale for the block - const __m256 d = _mm256_set1_ps( WSP_GGML_FP16_TO_FP32(x[i].d) * WSP_GGML_FP16_TO_FP32(y[i].d) ); + const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); const __m128i lowMask = _mm_set1_epi8(0xF); const __m128i off = _mm_set1_epi8(8); @@ -2494,7 +2549,7 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( WSP_GGML_FP16_TO_FP32(x[0].d) * WSP_GGML_FP16_TO_FP32(y[0].d) ); + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) ); const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); @@ -2512,7 +2567,7 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( WSP_GGML_FP16_TO_FP32(x[1].d) * WSP_GGML_FP16_TO_FP32(y[1].d) ); + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) ); const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); @@ -2540,12 +2595,13 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo } // Main loop + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 2; i < nb; i+=2) { _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( WSP_GGML_FP16_TO_FP32(x[i].d) * WSP_GGML_FP16_TO_FP32(y[i].d) ); + const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); @@ -2563,7 +2619,7 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( WSP_GGML_FP16_TO_FP32(x[i + 1].d) * WSP_GGML_FP16_TO_FP32(y[i + 1].d) ); + const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) ); const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); @@ -2597,6 +2653,41 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo } *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = 
__riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -2611,19 +2702,18 @@ static void wsp_ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const vo sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } - sumf += sumi*WSP_GGML_FP16_TO_FP32(x[i].d)*WSP_GGML_FP16_TO_FP32(y[i].d); + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); } *s = sumf; #endif } -static void wsp_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q4_1 * restrict x = vx; const block_q8_1 * restrict y = vy; @@ -2635,13 +2725,14 @@ static void wsp_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const vo float summs = 0; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q4_1 * restrict x0 = &x[i + 0]; const block_q4_1 * restrict x1 = &x[i + 1]; const block_q8_1 * restrict y0 = &y[i + 0]; const block_q8_1 * restrict y1 = &y[i + 1]; - summs += WSP_GGML_FP16_TO_FP32(x0->m) * y0->s + WSP_GGML_FP16_TO_FP32(x1->m) * y1->s; + summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -2665,8 +2756,8 @@ static void wsp_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const vo const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), WSP_GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), WSP_GGML_FP16_TO_FP32(x1->d)*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l)); @@ -2683,8 +2774,8 @@ static void wsp_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const vo const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), WSP_GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), WSP_GGML_FP16_TO_FP32(x1->d)*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), 
GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); #endif } @@ -2697,10 +2788,10 @@ static void wsp_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const vo // Main loop for (int i = 0; i < nb; ++i) { - const float d0 = WSP_GGML_FP16_TO_FP32(x[i].d); + const float d0 = GGML_FP16_TO_FP32(x[i].d); const float d1 = y[i].d; - summs += WSP_GGML_FP16_TO_FP32(x[i].m) * y[i].s; + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; const __m256 d0v = _mm256_set1_ps( d0 ); const __m256 d1v = _mm256_set1_ps( d1 ); @@ -2723,6 +2814,38 @@ static void wsp_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const vo } *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -2737,19 +2860,18 @@ static void wsp_ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const vo sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } - sumf += (WSP_GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + WSP_GGML_FP16_TO_FP32(x[i].m)*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; } *s = sumf; #endif } -static void wsp_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); assert(qk == QK5_0); const block_q5_0 * restrict x = vx; @@ -2765,6 +2887,7 @@ static void wsp_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const vo uint64_t tmp0[4]; uint64_t tmp1[4]; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q5_0 * restrict x0 = &x[i]; const block_q5_0 * restrict x1 = &x[i + 1]; @@ -2816,10 +2939,10 @@ static void wsp_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const vo #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), WSP_GGML_FP16_TO_FP32(x0->d)*WSP_GGML_FP16_TO_FP32(y0->d)); + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), WSP_GGML_FP16_TO_FP32(x1->d)*WSP_GGML_FP16_TO_FP32(y1->d)); + vdotq_s32(vdupq_n_s32(0), v0_1hf, 
v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); @@ -2836,8 +2959,8 @@ static void wsp_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const vo const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), WSP_GGML_FP16_TO_FP32(x0->d)*WSP_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), WSP_GGML_FP16_TO_FP32(x1->d)*WSP_GGML_FP16_TO_FP32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #endif } @@ -2898,7 +3021,7 @@ static void wsp_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const vo wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(WSP_GGML_FP16_TO_FP32(x0->d) * WSP_GGML_FP16_TO_FP32(y0->d)))); + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); } *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -2910,7 +3033,7 @@ static void wsp_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const vo // Main loop for (int i = 0; i < nb; i++) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(WSP_GGML_FP16_TO_FP32(x[i].d) * WSP_GGML_FP16_TO_FP32(y[i].d)); + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -2934,7 +3057,7 @@ static void wsp_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const vo // Main loop for (int i = 0; i < nb; i++) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(WSP_GGML_FP16_TO_FP32(x[i].d) * WSP_GGML_FP16_TO_FP32(y[i].d)); + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -2957,6 +3080,76 @@ static void wsp_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const vo } *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for masking and shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl); + vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl); + vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl); + + // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl); + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl); + 
vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -2977,19 +3170,18 @@ static void wsp_ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const vo sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - sumf += (WSP_GGML_FP16_TO_FP32(x[i].d)*WSP_GGML_FP16_TO_FP32(y[i].d)) * sumi; + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; } *s = sumf; #endif } -static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); assert(qk == QK5_1); const block_q5_1 * restrict x = vx; @@ -3008,6 +3200,7 @@ static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const vo uint64_t tmp0[4]; uint64_t tmp1[4]; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q5_1 * restrict x0 = &x[i]; const block_q5_1 * restrict x1 = &x[i + 1]; @@ -3016,8 +3209,8 @@ static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const vo const uint8x16_t m4b = vdupq_n_u8(0x0F); - summs0 += WSP_GGML_FP16_TO_FP32(x0->m) * y0->s; - summs1 += WSP_GGML_FP16_TO_FP32(x1->m) * y1->s; + summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; + summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; // extract the 5th bit via lookup table ((b) << 4) memcpy(&qh0, x0->qh, sizeof(qh0)); @@ -3062,10 +3255,10 @@ static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const vo #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), WSP_GGML_FP16_TO_FP32(x0->d)*y0->d); + vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), WSP_GGML_FP16_TO_FP32(x1->d)*y1->d); + vdotq_s32(vdupq_n_s32(0), 
v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l)); const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l)); @@ -3082,8 +3275,8 @@ static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const vo const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), WSP_GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), WSP_GGML_FP16_TO_FP32(x1->d)*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d); #endif } @@ -3101,7 +3294,7 @@ static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const vo const block_q5_1 * restrict x0 = &x[i]; const block_q8_1 * restrict y0 = &y[i]; - summs += WSP_GGML_FP16_TO_FP32(x0->m) * y0->s; + summs += GGML_FP16_TO_FP32(x0->m) * y0->s; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3148,7 +3341,7 @@ static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const vo wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(WSP_GGML_FP16_TO_FP32(x0->d) * y0->d))); + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d))); } *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -3161,9 +3354,9 @@ static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const vo // Main loop for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(WSP_GGML_FP16_TO_FP32(x[i].d)); + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - summs += WSP_GGML_FP16_TO_FP32(x[i].m) * y[i].s; + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; __m256i bx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -3188,9 +3381,9 @@ static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const vo // Main loop for (int i = 0; i < nb; i++) { - const __m256 dx = _mm256_set1_ps(WSP_GGML_FP16_TO_FP32(x[i].d)); + const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - summs += WSP_GGML_FP16_TO_FP32(x[i].m) * y[i].s; + summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; __m256i bx = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -3213,6 +3406,72 @@ static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const vo } *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl); + + // load qh + vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl); + + // ((qh >> (j + 0)) << 4) & 0x10; + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl); + + // ((qh >> (j + 12)) ) & 0x10; + vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl); + vuint32m4_t xha_1 = 
__riscv_vand_vx_u32m4(xhr_1, 0x10, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3233,19 +3492,18 @@ static void wsp_ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const vo sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - sumf += (WSP_GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + WSP_GGML_FP16_TO_FP32(x[i].m)*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; } *s = sumf; #endif } -static void wsp_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -3254,6 +3512,7 @@ static void wsp_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const vo float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q8_0 * restrict x0 = &x[i + 0]; const block_q8_0 * restrict x1 = &x[i + 1]; @@ -3274,11 +3533,11 @@ static void wsp_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const vo #if defined(__ARM_FEATURE_DOTPROD) sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), WSP_GGML_FP16_TO_FP32(x0->d)*WSP_GGML_FP16_TO_FP32(y0->d)); + vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), WSP_GGML_FP16_TO_FP32(x1->d)*WSP_GGML_FP16_TO_FP32(y1->d)); + vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #else const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0)); @@ -3296,8 +3555,8 @@ static void wsp_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const vo const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1)); const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3)); - sumv0 = 
vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), WSP_GGML_FP16_TO_FP32(x0->d)*WSP_GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), WSP_GGML_FP16_TO_FP32(x1->d)*WSP_GGML_FP16_TO_FP32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); #endif } @@ -3309,7 +3568,7 @@ static void wsp_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const vo // Main loop for (int i = 0; i < nb; ++i) { // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(WSP_GGML_FP16_TO_FP32(x[i].d) * WSP_GGML_FP16_TO_FP32(y[i].d)); + const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d)); __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs); __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); @@ -3324,6 +3583,26 @@ static void wsp_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const vo } *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); + + for (int i = 0; i < nb; i++) { + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3335,84 +3614,84 @@ static void wsp_ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const vo sumi += x[i].qs[j]*y[i].qs[j]; } - sumf += sumi*(WSP_GGML_FP16_TO_FP32(x[i].d)*WSP_GGML_FP16_TO_FP32(y[i].d)); + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); } *s = sumf; #endif } -// compute WSP_GGML_VEC_DOT_UNROLL dot products at once +// compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes -inline static void wsp_ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, wsp_ggml_fp16_t * restrict y) { - wsp_ggml_float sumf[WSP_GGML_VEC_DOT_UNROLL] = { 0.0 }; +inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { + ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - wsp_ggml_fp16_t * restrict x[WSP_GGML_VEC_DOT_UNROLL]; + ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; - for (int i = 0; i < WSP_GGML_VEC_DOT_UNROLL; ++i) { - x[i] = (wsp_ggml_fp16_t *) ((char *) xv + i*xs); + for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { + x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); } -#if defined(WSP_GGML_SIMD) - const int np = (n & ~(WSP_GGML_F16_STEP - 1)); +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); - WSP_GGML_F16_VEC sum[WSP_GGML_VEC_DOT_UNROLL][WSP_GGML_F16_ARR] = { { WSP_GGML_F16_VEC_ZERO } }; + GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; - WSP_GGML_F16_VEC ax[WSP_GGML_F16_ARR]; - WSP_GGML_F16_VEC ay[WSP_GGML_F16_ARR]; + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; - for (int i = 0; i < np; i += WSP_GGML_F16_STEP) { - for (int j = 0; j < WSP_GGML_F16_ARR; j++) { - ay[j] = WSP_GGML_F16_VEC_LOAD(y + i + j*WSP_GGML_F16_EPR, j); + for (int i = 0; i < np; i += GGML_F16_STEP) { + 
for (int j = 0; j < GGML_F16_ARR; j++) { + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) { - ax[j] = WSP_GGML_F16_VEC_LOAD(x[k] + i + j*WSP_GGML_F16_EPR, j); + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); - sum[k][j] = WSP_GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); + sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); } } } // reduce sum0..sum3 to sum0 - for (int k = 0; k < WSP_GGML_VEC_DOT_UNROLL; ++k) { - WSP_GGML_F16_VEC_REDUCE(sumf[k], sum[k]); + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + GGML_F16_VEC_REDUCE(sumf[k], sum[k]); } // leftovers for (int i = np; i < n; ++i) { - for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (wsp_ggml_float)(WSP_GGML_FP16_TO_FP32(x[j][i])*WSP_GGML_FP16_TO_FP32(y[i])); + for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); } } #else for (int i = 0; i < n; ++i) { - for (int j = 0; j < WSP_GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (wsp_ggml_float)(WSP_GGML_FP16_TO_FP32(x[j][i])*WSP_GGML_FP16_TO_FP32(y[i])); + for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); } } #endif - for (int i = 0; i < WSP_GGML_VEC_DOT_UNROLL; ++i) { + for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { s[i] = sumf[i]; } } -inline static void wsp_ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { -#if defined(WSP_GGML_SIMD) - const int np = (n & ~(WSP_GGML_F32_STEP - 1)); +inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); - WSP_GGML_F32_VEC vx = WSP_GGML_F32_VEC_SET1(v); + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); - WSP_GGML_F32_VEC ax[WSP_GGML_F32_ARR]; - WSP_GGML_F32_VEC ay[WSP_GGML_F32_ARR]; + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; - for (int i = 0; i < np; i += WSP_GGML_F32_STEP) { - for (int j = 0; j < WSP_GGML_F32_ARR; j++) { - ax[j] = WSP_GGML_F32_VEC_LOAD(x + i + j*WSP_GGML_F32_EPR); - ay[j] = WSP_GGML_F32_VEC_LOAD(y + i + j*WSP_GGML_F32_EPR); - ay[j] = WSP_GGML_F32_VEC_FMA(ay[j], ax[j], vx); + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); - WSP_GGML_F32_VEC_STORE(y + i + j*WSP_GGML_F32_EPR, ay[j]); + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); } } @@ -3428,21 +3707,23 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * restrict y, const f #endif } -//inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } -inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float v) { -#if defined(WSP_GGML_SIMD) - const int np = (n & ~(WSP_GGML_F32_STEP - 1)); +//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } +inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { +#if defined(GGML_USE_ACCELERATE) + vDSP_vsmul(y, 1, &v, y, 1, n); +#elif defined(GGML_SIMD) + const int np = (n & ~(GGML_F32_STEP - 1)); - WSP_GGML_F32_VEC vx = WSP_GGML_F32_VEC_SET1(v); + GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); - WSP_GGML_F32_VEC 
ay[WSP_GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; - for (int i = 0; i < np; i += WSP_GGML_F32_STEP) { - for (int j = 0; j < WSP_GGML_F32_ARR; j++) { - ay[j] = WSP_GGML_F32_VEC_LOAD(y + i + j*WSP_GGML_F32_EPR); - ay[j] = WSP_GGML_F32_VEC_MUL(ay[j], vx); + for (int i = 0; i < np; i += GGML_F32_STEP) { + for (int j = 0; j < GGML_F32_ARR; j++) { + ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); + ay[j] = GGML_F32_VEC_MUL(ay[j], vx); - WSP_GGML_F32_VEC_STORE(y + i + j*WSP_GGML_F32_EPR, ay[j]); + GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); } } @@ -3458,134 +3739,134 @@ inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float #endif } -inline static void wsp_ggml_vec_norm_f32 (const int n, float * s, const float * x) { wsp_ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } -inline static void wsp_ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } -inline static void wsp_ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } -inline static void wsp_ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } -inline static void wsp_ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } -inline static void wsp_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } -inline static void wsp_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } -inline static void wsp_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } -inline static void wsp_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } -inline static void wsp_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } +inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } +inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } +inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } +inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } +inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } +inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } +inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } +inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } +inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } +inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
x[i] : 0.f; } -static const float GELU_COEF_A = 0.044715f; -static const float GELU_QUICK_COEF = -1.702f; -static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; +static const float GELU_COEF_A = 0.044715f; +static const float GELU_QUICK_COEF = -1.702f; +static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; -inline static float wsp_ggml_gelu_f32(float x) { +inline static float ggml_gelu_f32(float x) { return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); } -inline static void wsp_ggml_vec_gelu_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) { +inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { const uint16_t * i16 = (const uint16_t *) x; for (int i = 0; i < n; ++i) { y[i] = table_gelu_f16[i16[i]]; } } -#ifdef WSP_GGML_GELU_FP16 -inline static void wsp_ggml_vec_gelu_f32(const int n, float * y, const float * x) { +#ifdef GGML_GELU_FP16 +inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { uint16_t t; for (int i = 0; i < n; ++i) { - wsp_ggml_fp16_t fp16 = WSP_GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = WSP_GGML_FP16_TO_FP32(table_gelu_f16[t]); + y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]); } } #else -inline static void wsp_ggml_vec_gelu_f32(const int n, float * y, const float * x) { +inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) { - y[i] = wsp_ggml_gelu_f32(x[i]); + y[i] = ggml_gelu_f32(x[i]); } } #endif -inline static float wsp_ggml_gelu_quick_f32(float x) { +inline static float ggml_gelu_quick_f32(float x) { return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); } -//inline static void wsp_ggml_vec_gelu_quick_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) { +//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { // const uint16_t * i16 = (const uint16_t *) x; // for (int i = 0; i < n; ++i) { // y[i] = table_gelu_quick_f16[i16[i]]; // } //} -#ifdef WSP_GGML_GELU_QUICK_FP16 -inline static void wsp_ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { +#ifdef GGML_GELU_QUICK_FP16 +inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { uint16_t t; for (int i = 0; i < n; ++i) { - wsp_ggml_fp16_t fp16 = WSP_GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = WSP_GGML_FP16_TO_FP32(table_gelu_quick_f16[t]); + y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]); } } #else -inline static void wsp_ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { +inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) { - y[i] = wsp_ggml_gelu_quick_f32(x[i]); + y[i] = ggml_gelu_quick_f32(x[i]); } } #endif // Sigmoid Linear Unit (SiLU) function -inline static float wsp_ggml_silu_f32(float x) { +inline static float ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); } -//inline static void wsp_ggml_vec_silu_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x) { +//inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { // const uint16_t * i16 = (const uint16_t *) x; // for (int i = 0; i < n; ++i) { // y[i] = table_silu_f16[i16[i]]; // } //} -#ifdef WSP_GGML_SILU_FP16 -inline static void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x) { +#ifdef 
GGML_SILU_FP16 +inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { uint16_t t; for (int i = 0; i < n; ++i) { - wsp_ggml_fp16_t fp16 = WSP_GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = WSP_GGML_FP16_TO_FP32(table_silu_f16[t]); + y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]); } } #else -inline static void wsp_ggml_vec_silu_f32(const int n, float * y, const float * x) { +inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) { - y[i] = wsp_ggml_silu_f32(x[i]); + y[i] = ggml_silu_f32(x[i]); } } #endif -inline static float wsp_ggml_silu_backward_f32(float x, float dy) { +inline static float ggml_silu_backward_f32(float x, float dy) { const float s = 1.0f/(1.0f + expf(-x)); return dy*s*(1.0f + x*(1.0f - s)); } -#ifdef WSP_GGML_SILU_FP16 -inline static void wsp_ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { +#ifdef GGML_SILU_FP16 +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { for (int i = 0; i < n; ++i) { // we did not use x[i] to compute forward silu but its f16 equivalent // take derivative at f16 of x[i]: - wsp_ggml_fp16_t fp16 = WSP_GGML_FP32_TO_FP16(x[i]); - float usedx = WSP_GGML_FP16_TO_FP32(fp16); - dx[i] = wsp_ggml_silu_backward_f32(usedx, dy[i]); + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + float usedx = GGML_FP16_TO_FP32(fp16); + dx[i] = ggml_silu_backward_f32(usedx, dy[i]); } } #else -inline static void wsp_ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { +inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { for (int i = 0; i < n; ++i) { - dx[i] = wsp_ggml_silu_backward_f32(x[i], dy[i]); + dx[i] = ggml_silu_backward_f32(x[i], dy[i]); } } #endif -inline static void wsp_ggml_vec_sum_f32(const int n, float * s, const float * x) { -#ifndef WSP_GGML_USE_ACCELERATE - wsp_ggml_float sum = 0.0; +inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { +#ifndef GGML_USE_ACCELERATE + ggml_float sum = 0.0; for (int i = 0; i < n; ++i) { - sum += (wsp_ggml_float)x[i]; + sum += (ggml_float)x[i]; } *s = sum; #else @@ -3593,16 +3874,24 @@ inline static void wsp_ggml_vec_sum_f32(const int n, float * s, const float * x) #endif } -inline static void wsp_ggml_vec_sum_ggf(const int n, wsp_ggml_float * s, const float * x) { - wsp_ggml_float sum = 0.0; +inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) { + ggml_float sum = 0.0; for (int i = 0; i < n; ++i) { - sum += (wsp_ggml_float)x[i]; + sum += (ggml_float)x[i]; } *s = sum; } -inline static void wsp_ggml_vec_max_f32(const int n, float * s, const float * x) { -#ifndef WSP_GGML_USE_ACCELERATE +inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { + float sum = 0.0f; + for (int i = 0; i < n; ++i) { + sum += GGML_FP16_TO_FP32(x[i]); + } + *s = sum; +} + +inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { +#ifndef GGML_USE_ACCELERATE float max = -INFINITY; for (int i = 0; i < n; ++i) { max = MAX(max, x[i]); @@ -3613,12 +3902,12 @@ inline static void wsp_ggml_vec_max_f32(const int n, float * s, const float * x) #endif } -inline static void wsp_ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { - wsp_ggml_vec_norm_f32(n, s, x); +inline static void ggml_vec_norm_inv_f32(const int n, 
float * s, const float * x) { + ggml_vec_norm_f32(n, s, x); *s = 1.f/(*s); } -inline static void wsp_ggml_vec_argmax_f32(const int n, int * s, const float * x) { +inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { float max = -INFINITY; int idx = 0; for (int i = 0; i < n; ++i) { @@ -3632,96 +3921,7 @@ inline static void wsp_ggml_vec_argmax_f32(const int n, int * s, const float * x // data types // -static const int WSP_GGML_BLCK_SIZE[WSP_GGML_TYPE_COUNT] = { - [WSP_GGML_TYPE_F32] = 1, - [WSP_GGML_TYPE_F16] = 1, - [WSP_GGML_TYPE_Q4_0] = QK4_0, - [WSP_GGML_TYPE_Q4_1] = QK4_1, - [WSP_GGML_TYPE_Q5_0] = QK5_0, - [WSP_GGML_TYPE_Q5_1] = QK5_1, - [WSP_GGML_TYPE_Q8_0] = QK8_0, - [WSP_GGML_TYPE_Q8_1] = QK8_1, -#ifdef WSP_GGML_USE_K_QUANTS - [WSP_GGML_TYPE_Q2_K] = QK_K, - [WSP_GGML_TYPE_Q3_K] = QK_K, - [WSP_GGML_TYPE_Q4_K] = QK_K, - [WSP_GGML_TYPE_Q5_K] = QK_K, - [WSP_GGML_TYPE_Q6_K] = QK_K, - [WSP_GGML_TYPE_Q8_K] = QK_K, -#endif - [WSP_GGML_TYPE_I8] = 1, - [WSP_GGML_TYPE_I16] = 1, - [WSP_GGML_TYPE_I32] = 1, -}; -static_assert(WSP_GGML_TYPE_COUNT == 19, "WSP_GGML_BLCK_SIZE is outdated"); - -static const size_t WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_COUNT] = { - [WSP_GGML_TYPE_F32] = sizeof(float), - [WSP_GGML_TYPE_F16] = sizeof(wsp_ggml_fp16_t), - [WSP_GGML_TYPE_Q4_0] = sizeof(block_q4_0), - [WSP_GGML_TYPE_Q4_1] = sizeof(block_q4_1), - [WSP_GGML_TYPE_Q5_0] = sizeof(block_q5_0), - [WSP_GGML_TYPE_Q5_1] = sizeof(block_q5_1), - [WSP_GGML_TYPE_Q8_0] = sizeof(block_q8_0), - [WSP_GGML_TYPE_Q8_1] = sizeof(block_q8_1), -#ifdef WSP_GGML_USE_K_QUANTS - [WSP_GGML_TYPE_Q2_K] = sizeof(block_q2_K), - [WSP_GGML_TYPE_Q3_K] = sizeof(block_q3_K), - [WSP_GGML_TYPE_Q4_K] = sizeof(block_q4_K), - [WSP_GGML_TYPE_Q5_K] = sizeof(block_q5_K), - [WSP_GGML_TYPE_Q6_K] = sizeof(block_q6_K), - [WSP_GGML_TYPE_Q8_K] = sizeof(block_q8_K), -#endif - [WSP_GGML_TYPE_I8] = sizeof(int8_t), - [WSP_GGML_TYPE_I16] = sizeof(int16_t), - [WSP_GGML_TYPE_I32] = sizeof(int32_t), -}; -static_assert(WSP_GGML_TYPE_COUNT == 19, "WSP_GGML_TYPE_SIZE is outdated"); - - -static const char * WSP_GGML_TYPE_NAME[WSP_GGML_TYPE_COUNT] = { - [WSP_GGML_TYPE_F32] = "f32", - [WSP_GGML_TYPE_F16] = "f16", - [WSP_GGML_TYPE_Q4_0] = "q4_0", - [WSP_GGML_TYPE_Q4_1] = "q4_1", - [WSP_GGML_TYPE_Q5_0] = "q5_0", - [WSP_GGML_TYPE_Q5_1] = "q5_1", - [WSP_GGML_TYPE_Q8_0] = "q8_0", - [WSP_GGML_TYPE_Q8_1] = "q8_1", - [WSP_GGML_TYPE_Q2_K] = "q2_K", - [WSP_GGML_TYPE_Q3_K] = "q3_K", - [WSP_GGML_TYPE_Q4_K] = "q4_K", - [WSP_GGML_TYPE_Q5_K] = "q5_K", - [WSP_GGML_TYPE_Q6_K] = "q6_K", - [WSP_GGML_TYPE_Q8_K] = "q8_K", - [WSP_GGML_TYPE_I8] = "i8", - [WSP_GGML_TYPE_I16] = "i16", - [WSP_GGML_TYPE_I32] = "i32", -}; -static_assert(WSP_GGML_TYPE_COUNT == 19, "WSP_GGML_TYPE_NAME is outdated"); - -static bool WSP_GGML_IS_QUANTIZED[WSP_GGML_TYPE_COUNT] = { - [WSP_GGML_TYPE_F32] = false, - [WSP_GGML_TYPE_F16] = false, - [WSP_GGML_TYPE_Q4_0] = true, - [WSP_GGML_TYPE_Q4_1] = true, - [WSP_GGML_TYPE_Q5_0] = true, - [WSP_GGML_TYPE_Q5_1] = true, - [WSP_GGML_TYPE_Q8_0] = true, - [WSP_GGML_TYPE_Q8_1] = true, - [WSP_GGML_TYPE_Q2_K] = true, - [WSP_GGML_TYPE_Q3_K] = true, - [WSP_GGML_TYPE_Q4_K] = true, - [WSP_GGML_TYPE_Q5_K] = true, - [WSP_GGML_TYPE_Q6_K] = true, - [WSP_GGML_TYPE_Q8_K] = true, - [WSP_GGML_TYPE_I8] = false, - [WSP_GGML_TYPE_I16] = false, - [WSP_GGML_TYPE_I32] = false, -}; -static_assert(WSP_GGML_TYPE_COUNT == 19, "WSP_GGML_IS_QUANTIZED is outdated"); - -static const char * WSP_GGML_OP_NAME[WSP_GGML_OP_COUNT] = { +static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "NONE", 
"DUP", @@ -3740,20 +3940,12 @@ static const char * WSP_GGML_OP_NAME[WSP_GGML_OP_COUNT] = { "ARGMAX", "REPEAT", "REPEAT_BACK", - "ABS", - "SGN", - "NEG", - "STEP", - "TANH", - "ELU", - "RELU", - "GELU", - "GELU_QUICK", - "SILU", + "CONCAT", "SILU_BACK", "NORM", "RMS_NORM", "RMS_NORM_BACK", + "GROUP_NORM", "MUL_MAT", "OUT_PROD", @@ -3779,16 +3971,28 @@ static const char * WSP_GGML_OP_NAME[WSP_GGML_OP_COUNT] = { "CLAMP", "CONV_1D", "CONV_2D", + "CONV_TRANSPOSE_2D", + "POOL_1D", + "POOL_2D", + "UPSCALE", "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", "WIN_PART", "WIN_UNPART", + "GET_REL_POS", + "ADD_REL_POS", + + "UNARY", "MAP_UNARY", "MAP_BINARY", + "MAP_CUSTOM1_F32", + "MAP_CUSTOM2_F32", + "MAP_CUSTOM3_F32", + "MAP_CUSTOM1", "MAP_CUSTOM2", "MAP_CUSTOM3", @@ -3797,9 +4001,9 @@ static const char * WSP_GGML_OP_NAME[WSP_GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(WSP_GGML_OP_COUNT == 66, "WSP_GGML_OP_COUNT != 66"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); -static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = { +static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", "x", @@ -3818,20 +4022,12 @@ static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = { "argmax(x)", "repeat(x)", "repeat_back(x)", - "abs(x)", - "sgn(x)", - "-x", - "step(x)", - "tanh(x)", - "elu(x)", - "relu(x)", - "gelu(x)", - "gelu_quick(x)", - "silu(x)", + "concat(x, y)", "silu_back(x)", "norm(x)", "rms_norm(x)", "rms_norm_back(x)", + "group_norm(x)", "X*Y", "X*Y", @@ -3857,16 +4053,28 @@ static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = { "clamp(x)", "conv_1d(x)", "conv_2d(x)", + "conv_transpose_2d(x)", + "pool_1d(x)", + "pool_2d(x)", + "upscale(x)", "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", "win_part(x)", "win_unpart(x)", + "get_rel_pos(x)", + "add_rel_pos(x)", + + "unary(x)", "f(x)", "f(x,y)", + "custom_f32(x)", + "custom_f32(x,y)", + "custom_f32(x,y,z)", + "custom(x)", "custom(x,y)", "custom(x,y,z)", @@ -3875,10 +4083,12 @@ static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(WSP_GGML_OP_COUNT == 66, "WSP_GGML_OP_COUNT != 66"); +static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68"); -static_assert(sizeof(struct wsp_ggml_object)%WSP_GGML_MEM_ALIGN == 0, "wsp_ggml_object size must be a multiple of WSP_GGML_MEM_ALIGN"); -static_assert(sizeof(struct wsp_ggml_tensor)%WSP_GGML_MEM_ALIGN == 0, "wsp_ggml_tensor size must be a multiple of WSP_GGML_MEM_ALIGN"); +static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); + +static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); +static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); // WARN: // Mis-confguration can lead to problem that's hard to reason about: @@ -3886,31 +4096,33 @@ static_assert(sizeof(struct wsp_ggml_tensor)%WSP_GGML_MEM_ALIGN == 0, "wsp_ggml_ // * At worst it talks slightly difference but hard to perceive. // // An op has to enable INIT or FINALIZE when any of it's branch needs that pass. -// Take care about compile options (e.g., WSP_GGML_USE_xxx). -static bool WSP_GGML_OP_HAS_INIT [WSP_GGML_OP_COUNT] = { 0 }; -static bool WSP_GGML_OP_HAS_FINALIZE[WSP_GGML_OP_COUNT] = { 0 }; +// Take care about compile options (e.g., GGML_USE_xxx). 
+static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 }; +static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; -static void wsp_ggml_setup_op_has_task_pass(void) { +static void ggml_setup_op_has_task_pass(void) { { // INIT - bool * p = WSP_GGML_OP_HAS_INIT; - - p[WSP_GGML_OP_ACC ] = true; - p[WSP_GGML_OP_MUL_MAT ] = true; - p[WSP_GGML_OP_OUT_PROD ] = true; - p[WSP_GGML_OP_SET ] = true; - p[WSP_GGML_OP_GET_ROWS_BACK ] = true; - p[WSP_GGML_OP_DIAG_MASK_INF ] = true; - p[WSP_GGML_OP_DIAG_MASK_ZERO ] = true; - p[WSP_GGML_OP_CONV_1D ] = true; - p[WSP_GGML_OP_CONV_2D ] = true; - p[WSP_GGML_OP_FLASH_ATTN_BACK ] = true; - p[WSP_GGML_OP_CROSS_ENTROPY_LOSS ] = true; + bool * p = GGML_OP_HAS_INIT; + + p[GGML_OP_ACC ] = true; + p[GGML_OP_MUL_MAT ] = true; + p[GGML_OP_OUT_PROD ] = true; + p[GGML_OP_SET ] = true; + p[GGML_OP_GET_ROWS_BACK ] = true; + p[GGML_OP_DIAG_MASK_INF ] = true; + p[GGML_OP_DIAG_MASK_ZERO ] = true; + p[GGML_OP_CONV_1D ] = true; + p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_CONV_TRANSPOSE_2D ] = true; + p[GGML_OP_FLASH_ATTN_BACK ] = true; + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + p[GGML_OP_ADD_REL_POS ] = true; } { // FINALIZE - bool * p = WSP_GGML_OP_HAS_FINALIZE; + bool * p = GGML_OP_HAS_FINALIZE; - p[WSP_GGML_OP_CROSS_ENTROPY_LOSS ] = true; + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; } } @@ -3918,7 +4130,7 @@ static void wsp_ggml_setup_op_has_task_pass(void) { // ggml context // -struct wsp_ggml_context { +struct ggml_context { size_t mem_size; void * mem_buffer; bool mem_buffer_owned; @@ -3927,33 +4139,33 @@ struct wsp_ggml_context { int n_objects; - struct wsp_ggml_object * objects_begin; - struct wsp_ggml_object * objects_end; + struct ggml_object * objects_begin; + struct ggml_object * objects_end; - struct wsp_ggml_scratch scratch; - struct wsp_ggml_scratch scratch_save; + struct ggml_scratch scratch; + struct ggml_scratch scratch_save; }; -struct wsp_ggml_context_container { +struct ggml_context_container { bool used; - struct wsp_ggml_context context; + struct ggml_context context; }; // // NUMA support // -#define WSP_GGML_NUMA_MAX_NODES 8 -#define WSP_GGML_NUMA_MAX_CPUS 512 +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 -struct wsp_ggml_numa_node { - uint32_t cpus[WSP_GGML_NUMA_MAX_CPUS]; // hardware threads on this node +struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node uint32_t n_cpus; }; -struct wsp_ggml_numa_nodes { - struct wsp_ggml_numa_node nodes[WSP_GGML_NUMA_MAX_NODES]; +struct ggml_numa_nodes { + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; uint32_t n_nodes; uint32_t total_cpus; // hardware threads on system }; @@ -3962,17 +4174,17 @@ struct wsp_ggml_numa_nodes { // ggml state // -struct wsp_ggml_state { - struct wsp_ggml_context_container contexts[WSP_GGML_MAX_CONTEXTS]; - struct wsp_ggml_numa_nodes numa; +struct ggml_state { + struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; + struct ggml_numa_nodes numa; }; // global state -static struct wsp_ggml_state g_state; +static struct ggml_state g_state; static atomic_int g_state_barrier = 0; // barrier via spin lock -inline static void wsp_ggml_critical_section_start(void) { +inline static void ggml_critical_section_start(void) { int processing = atomic_fetch_add(&g_state_barrier, 1); while (processing > 0) { @@ -3985,13 +4197,13 @@ inline static void wsp_ggml_critical_section_start(void) { // TODO: make this somehow automatically executed // some sort of "sentry" mechanism -inline static void wsp_ggml_critical_section_end(void) { +inline static 
void ggml_critical_section_end(void) { atomic_fetch_sub(&g_state_barrier, 1); } -void wsp_ggml_numa_init(void) { +void ggml_numa_init(void) { if (g_state.numa.n_nodes > 0) { - fprintf(stderr, "wsp_ggml_numa_init: NUMA already initialized\n"); + fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); return; } @@ -4002,22 +4214,22 @@ void wsp_ggml_numa_init(void) { int rv; // enumerate nodes - while (g_state.numa.n_nodes < WSP_GGML_NUMA_MAX_NODES) { + while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); - WSP_GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); if (stat(path, &st) != 0) { break; } ++g_state.numa.n_nodes; } // enumerate CPUs - while (g_state.numa.total_cpus < WSP_GGML_NUMA_MAX_CPUS) { + while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) { rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus); - WSP_GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); if (stat(path, &st) != 0) { break; } ++g_state.numa.total_cpus; } - WSP_GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); + GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) { g_state.numa.n_nodes = 0; @@ -4025,26 +4237,26 @@ void wsp_ggml_numa_init(void) { } for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { - struct wsp_ggml_numa_node * node = &g_state.numa.nodes[n]; - WSP_GGML_PRINT_DEBUG("CPUs on node %u:", n); + struct ggml_numa_node * node = &g_state.numa.nodes[n]; + GGML_PRINT_DEBUG("CPUs on node %u:", n); node->n_cpus = 0; for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); - WSP_GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); if (stat(path, &st) == 0) { node->cpus[node->n_cpus++] = c; - WSP_GGML_PRINT_DEBUG(" %u", c); + GGML_PRINT_DEBUG(" %u", c); } } - WSP_GGML_PRINT_DEBUG("\n"); + GGML_PRINT_DEBUG("\n"); } - if (wsp_ggml_is_numa()) { + if (ggml_is_numa()) { FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); if (fptr != NULL) { char buf[42]; if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { - WSP_GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); + GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); } fclose(fptr); } @@ -4054,114 +4266,131 @@ void wsp_ggml_numa_init(void) { #endif } -bool wsp_ggml_is_numa(void) { +bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; } //////////////////////////////////////////////////////////////////////////////// -void wsp_ggml_print_object(const struct wsp_ggml_object * obj) { - WSP_GGML_PRINT(" - wsp_ggml_object: offset = %zu, size = %zu, next = %p\n", - obj->offs, obj->size, (const void *) obj->next); +void ggml_print_object(const struct ggml_object * obj) { + GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n", + obj->type, obj->offs, obj->size, (const void *) obj->next); } -void wsp_ggml_print_objects(const struct wsp_ggml_context * ctx) { - struct wsp_ggml_object * obj = ctx->objects_begin; +void ggml_print_objects(const struct ggml_context * 
ctx) { + struct ggml_object * obj = ctx->objects_begin; - WSP_GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); + GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); while (obj != NULL) { - wsp_ggml_print_object(obj); + ggml_print_object(obj); obj = obj->next; } - WSP_GGML_PRINT("%s: --- end ---\n", __func__); + GGML_PRINT("%s: --- end ---\n", __func__); } -int64_t wsp_ggml_nelements(const struct wsp_ggml_tensor * tensor) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +int64_t ggml_nelements(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -int64_t wsp_ggml_nrows(const struct wsp_ggml_tensor * tensor) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +int64_t ggml_nrows(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -size_t wsp_ggml_nbytes(const struct wsp_ggml_tensor * tensor) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +size_t ggml_nbytes(const struct ggml_tensor * tensor) { + size_t nbytes; + size_t blck_size = ggml_blck_size(tensor->type); + if (blck_size == 1) { + nbytes = ggml_type_size(tensor->type); + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + else { + nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + + return nbytes; +} + +size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { + return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); +} - // this should handle cases where the tensor is not contiguous in memory - // probaby just: - // - // return tensor->ne[3]*tensor->nb[3] - // - // is enough, but just in case, adding the second part +size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return MAX(tensor->ne[3]*tensor->nb[3], (wsp_ggml_nelements(tensor)*WSP_GGML_TYPE_SIZE[tensor->type])/WSP_GGML_BLCK_SIZE[tensor->type]); + return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); } -size_t wsp_ggml_nbytes_split(const struct wsp_ggml_tensor * tensor, int nrows_split) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +int ggml_blck_size(enum ggml_type type) { + return type_traits[type].blck_size; +} - return (nrows_split*tensor->ne[0]*WSP_GGML_TYPE_SIZE[tensor->type])/WSP_GGML_BLCK_SIZE[tensor->type]; +size_t ggml_type_size(enum ggml_type type) { + return type_traits[type].type_size; } -int wsp_ggml_blck_size(enum wsp_ggml_type type) { - return WSP_GGML_BLCK_SIZE[type]; +float ggml_type_sizef(enum ggml_type type) { + return ((float)(type_traits[type].type_size))/type_traits[type].blck_size; } -size_t wsp_ggml_type_size(enum wsp_ggml_type type) { - return WSP_GGML_TYPE_SIZE[type]; +const char * ggml_type_name(enum ggml_type type) { + return type_traits[type].type_name; } -float wsp_ggml_type_sizef(enum wsp_ggml_type type) { - return ((float)(WSP_GGML_TYPE_SIZE[type]))/WSP_GGML_BLCK_SIZE[type]; +bool ggml_is_quantized(enum ggml_type type) { + return type_traits[type].is_quantized; } 
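// Worked example for the rewritten ggml_nbytes() above, which now derives the
// byte count from the strides (and therefore also covers non-contiguous
// tensors) instead of the old MAX(ne[3]*nb[3], ...) formula. This is a
// self-contained sketch: the DEMO_* constants stand in for QK4_0 and
// sizeof(block_q4_0), assumed here to be 32 elements and 18 bytes per block
// (2-byte fp16 scale plus 32 packed 4-bit quants); they are not part of the
// patch itself.
#include <stdint.h>
#include <stdio.h>

#define DEMO_QK4_0        32   // elements per block (assumed)
#define DEMO_Q4_0_TYPE_SZ 18   // bytes per block (assumed)

int main(void) {
    // a contiguous 2-D q4_0 tensor: ne = {4096, 32, 1, 1}
    int64_t ne[4] = { 4096, 32, 1, 1 };
    size_t  nb[4];

    nb[0] = DEMO_Q4_0_TYPE_SZ;                        // stride of one block
    nb[1] = ne[0]/DEMO_QK4_0 * DEMO_Q4_0_TYPE_SZ;     // bytes per row
    nb[2] = nb[1]*ne[1];
    nb[3] = nb[2]*ne[2];

    // same arithmetic as the blck_size != 1 branch of ggml_nbytes():
    size_t nbytes = ne[0]*nb[0]/DEMO_QK4_0;           // bytes of the first row
    for (int i = 1; i < 4; ++i) {
        nbytes += (ne[i] - 1)*nb[i];
    }

    printf("row bytes = %zu, total bytes = %zu\n", nb[1], nbytes); // 2304, 73728
    return 0;
}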
-const char * wsp_ggml_type_name(enum wsp_ggml_type type) { - return WSP_GGML_TYPE_NAME[type]; +const char * ggml_op_name(enum ggml_op op) { + return GGML_OP_NAME[op]; } -const char * wsp_ggml_op_name(enum wsp_ggml_op op) { - return WSP_GGML_OP_NAME[op]; +const char * ggml_op_symbol(enum ggml_op op) { + return GGML_OP_SYMBOL[op]; } -size_t wsp_ggml_element_size(const struct wsp_ggml_tensor * tensor) { - return WSP_GGML_TYPE_SIZE[tensor->type]; +size_t ggml_element_size(const struct ggml_tensor * tensor) { + return ggml_type_size(tensor->type); } -static inline bool wsp_ggml_is_scalar(const struct wsp_ggml_tensor * tensor) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool wsp_ggml_is_vector(const struct wsp_ggml_tensor * tensor) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_is_vector(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool wsp_ggml_is_matrix(const struct wsp_ggml_tensor * tensor) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool wsp_ggml_can_mul_mat(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return - (t0->ne[0] == t1->ne[0]) && - (t0->ne[2] == t1->ne[2]) && - (t0->ne[3] == t1->ne[3]); + return (t0->ne[0] == t1->ne[0]) && + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + (t1->ne[3]%t0->ne[3] == 0); } -static inline bool wsp_ggml_can_out_prod(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return (t0->ne[1] == t1->ne[1]) && @@ -4169,70 +4398,75 @@ static inline bool wsp_ggml_can_out_prod(const struct wsp_ggml_tensor * t0, cons (t0->ne[3] == t1->ne[3]); } -bool wsp_ggml_is_quantized(enum wsp_ggml_type type) { - return WSP_GGML_IS_QUANTIZED[type]; -} - -enum wsp_ggml_type wsp_ggml_ftype_to_wsp_ggml_type(enum wsp_ggml_ftype ftype) { - enum wsp_ggml_type wtype = WSP_GGML_TYPE_COUNT; +enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { + enum ggml_type wtype = GGML_TYPE_COUNT; switch (ftype) { - case WSP_GGML_FTYPE_ALL_F32: wtype = WSP_GGML_TYPE_F32; break; - case WSP_GGML_FTYPE_MOSTLY_F16: wtype = WSP_GGML_TYPE_F16; break; - case WSP_GGML_FTYPE_MOSTLY_Q4_0: wtype = WSP_GGML_TYPE_Q4_0; break; - case WSP_GGML_FTYPE_MOSTLY_Q4_1: wtype = 
WSP_GGML_TYPE_Q4_1; break; - case WSP_GGML_FTYPE_MOSTLY_Q5_0: wtype = WSP_GGML_TYPE_Q5_0; break; - case WSP_GGML_FTYPE_MOSTLY_Q5_1: wtype = WSP_GGML_TYPE_Q5_1; break; - case WSP_GGML_FTYPE_MOSTLY_Q8_0: wtype = WSP_GGML_TYPE_Q8_0; break; - case WSP_GGML_FTYPE_MOSTLY_Q2_K: wtype = WSP_GGML_TYPE_Q2_K; break; - case WSP_GGML_FTYPE_MOSTLY_Q3_K: wtype = WSP_GGML_TYPE_Q3_K; break; - case WSP_GGML_FTYPE_MOSTLY_Q4_K: wtype = WSP_GGML_TYPE_Q4_K; break; - case WSP_GGML_FTYPE_MOSTLY_Q5_K: wtype = WSP_GGML_TYPE_Q5_K; break; - case WSP_GGML_FTYPE_MOSTLY_Q6_K: wtype = WSP_GGML_TYPE_Q6_K; break; - case WSP_GGML_FTYPE_UNKNOWN: wtype = WSP_GGML_TYPE_COUNT; break; - case WSP_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = WSP_GGML_TYPE_COUNT; break; - } - - WSP_GGML_ASSERT(wtype != WSP_GGML_TYPE_COUNT); + case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; + case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; + case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break; + case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break; + case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break; + case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break; + case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break; + case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; + } + + GGML_ASSERT(wtype != GGML_TYPE_COUNT); return wtype; } -size_t wsp_ggml_tensor_overhead(void) { - return WSP_GGML_OBJECT_SIZE + WSP_GGML_TENSOR_SIZE + 16; +size_t ggml_tensor_overhead(void) { + return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE; } -bool wsp_ggml_is_transposed(const struct wsp_ggml_tensor * tensor) { +bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } -bool wsp_ggml_is_contiguous(const struct wsp_ggml_tensor * tensor) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +bool ggml_is_contiguous(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return - tensor->nb[0] == WSP_GGML_TYPE_SIZE[tensor->type] && - tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/WSP_GGML_BLCK_SIZE[tensor->type] && + tensor->nb[0] == ggml_type_size(tensor->type) && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -bool wsp_ggml_is_permuted(const struct wsp_ggml_tensor * tensor) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +bool ggml_is_permuted(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; } -static inline bool 
wsp_ggml_is_padded_1d(const struct wsp_ggml_tensor * tensor) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return - tensor->nb[0] == WSP_GGML_TYPE_SIZE[tensor->type] && + tensor->nb[0] == ggml_type_size(tensor->type) && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static inline bool wsp_ggml_are_same_shape(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return (t0->ne[0] == t1->ne[0] ) && @@ -4242,8 +4476,8 @@ static inline bool wsp_ggml_are_same_shape(const struct wsp_ggml_tensor * t0, co } // check if t1 can be represented as a repeatition of t0 -static inline bool wsp_ggml_can_repeat(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return (t1->ne[0]%t0->ne[0] == 0) && @@ -4252,67 +4486,67 @@ static inline bool wsp_ggml_can_repeat(const struct wsp_ggml_tensor * t0, const (t1->ne[3]%t0->ne[3] == 0); } -static inline bool wsp_ggml_can_repeat_rows(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1) { - static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function"); +static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return (t0->ne[0] == t1->ne[0]) && wsp_ggml_can_repeat(t0, t1); + return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1); } -static inline int wsp_ggml_up32(int n) { +static inline int ggml_up32(int n) { return (n + 31) & ~31; } -//static inline int wsp_ggml_up64(int n) { +//static inline int ggml_up64(int n) { // return (n + 63) & ~63; //} -static inline int wsp_ggml_up(int n, int m) { +static inline int ggml_up(int n, int m) { // assert m is a power of 2 - WSP_GGML_ASSERT((m & (m - 1)) == 0); + GGML_ASSERT((m & (m - 1)) == 0); return (n + m - 1) & ~(m - 1); } -// assert that pointer is aligned to WSP_GGML_MEM_ALIGN -#define wsp_ggml_assert_aligned(ptr) \ - WSP_GGML_ASSERT(((uintptr_t) (ptr))%WSP_GGML_MEM_ALIGN == 0) +// assert that pointer is aligned to GGML_MEM_ALIGN +#define ggml_assert_aligned(ptr) \ + GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) //////////////////////////////////////////////////////////////////////////////// -struct wsp_ggml_context * wsp_ggml_init(struct wsp_ggml_init_params params) { +struct ggml_context * ggml_init(struct ggml_init_params params) { // make this function thread safe - wsp_ggml_critical_section_start(); + ggml_critical_section_start(); static bool is_first_call = true; if (is_first_call) { // initialize time system (required on Windows) - wsp_ggml_time_init(); + ggml_time_init(); // initialize GELU, Quick GELU, SILU and EXP F32 tables { - const uint64_t t_start = wsp_ggml_time_us(); UNUSED(t_start); 
+ const uint64_t t_start = ggml_time_us(); UNUSED(t_start); - wsp_ggml_fp16_t ii; + ggml_fp16_t ii; for (int i = 0; i < (1 << 16); ++i) { uint16_t ui = i; memcpy(&ii, &ui, sizeof(ii)); - const float f = table_f32_f16[i] = WSP_GGML_COMPUTE_FP16_TO_FP32(ii); - table_gelu_f16[i] = WSP_GGML_FP32_TO_FP16(wsp_ggml_gelu_f32(f)); - table_gelu_quick_f16[i] = WSP_GGML_FP32_TO_FP16(wsp_ggml_gelu_quick_f32(f)); - table_silu_f16[i] = WSP_GGML_FP32_TO_FP16(wsp_ggml_silu_f32(f)); - table_exp_f16[i] = WSP_GGML_FP32_TO_FP16(expf(f)); + const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); + table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); + table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); + table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); + table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); } - const uint64_t t_end = wsp_ggml_time_us(); UNUSED(t_end); + const uint64_t t_end = ggml_time_us(); UNUSED(t_end); - WSP_GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } // initialize g_state { - const uint64_t t_start = wsp_ggml_time_us(); UNUSED(t_start); + const uint64_t t_start = ggml_time_us(); UNUSED(t_start); - g_state = (struct wsp_ggml_state) { + g_state = (struct ggml_state) { /*.contexts =*/ { { 0 } }, /*.numa =*/ { .n_nodes = 0, @@ -4320,52 +4554,57 @@ struct wsp_ggml_context * wsp_ggml_init(struct wsp_ggml_init_params params) { }, }; - for (int i = 0; i < WSP_GGML_MAX_CONTEXTS; ++i) { + for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) { g_state.contexts[i].used = false; } - const uint64_t t_end = wsp_ggml_time_us(); UNUSED(t_end); + const uint64_t t_end = ggml_time_us(); UNUSED(t_end); - WSP_GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } -#if defined(WSP_GGML_USE_CUBLAS) - wsp_ggml_init_cublas(); -#elif defined(WSP_GGML_USE_CLBLAST) - wsp_ggml_cl_init(); +#if defined(GGML_USE_CUBLAS) + ggml_init_cublas(); +#elif defined(GGML_USE_CLBLAST) + ggml_cl_init(); #endif - wsp_ggml_setup_op_has_task_pass(); + ggml_setup_op_has_task_pass(); is_first_call = false; } // find non-used context in g_state - struct wsp_ggml_context * ctx = NULL; + struct ggml_context * ctx = NULL; - for (int i = 0; i < WSP_GGML_MAX_CONTEXTS; i++) { + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { if (!g_state.contexts[i].used) { g_state.contexts[i].used = true; ctx = &g_state.contexts[i].context; - WSP_GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); + GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); break; } } if (ctx == NULL) { - WSP_GGML_PRINT_DEBUG("%s: no unused context found\n", __func__); + GGML_PRINT_DEBUG("%s: no unused context found\n", __func__); - wsp_ggml_critical_section_end(); + ggml_critical_section_end(); return NULL; } - const size_t mem_size = (params.mem_size + WSP_GGML_MEM_ALIGN - 1) & ~(WSP_GGML_MEM_ALIGN - 1); + // allow to call ggml_init with 0 size + if (params.mem_size == 0) { + params.mem_size = GGML_MEM_ALIGN; + } + + const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN); - *ctx = (struct wsp_ggml_context) { + *ctx = (struct ggml_context) { /*.mem_size =*/ mem_size, - /*.mem_buffer =*/ params.mem_buffer ? 
params.mem_buffer : WSP_GGML_ALIGNED_MALLOC(mem_size), + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, /*.no_alloc =*/ params.no_alloc, /*.no_alloc_save =*/ params.no_alloc, @@ -4376,32 +4615,32 @@ struct wsp_ggml_context * wsp_ggml_init(struct wsp_ggml_init_params params) { /*.scratch_save =*/ { 0, 0, NULL, }, }; - WSP_GGML_ASSERT(ctx->mem_buffer != NULL); + GGML_ASSERT(ctx->mem_buffer != NULL); - wsp_ggml_assert_aligned(ctx->mem_buffer); + ggml_assert_aligned(ctx->mem_buffer); - WSP_GGML_PRINT_DEBUG("%s: context initialized\n", __func__); + GGML_PRINT_DEBUG("%s: context initialized\n", __func__); - wsp_ggml_critical_section_end(); + ggml_critical_section_end(); return ctx; } -void wsp_ggml_free(struct wsp_ggml_context * ctx) { +void ggml_free(struct ggml_context * ctx) { // make this function thread safe - wsp_ggml_critical_section_start(); + ggml_critical_section_start(); bool found = false; - for (int i = 0; i < WSP_GGML_MAX_CONTEXTS; i++) { + for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { if (&g_state.contexts[i].context == ctx) { g_state.contexts[i].used = false; - WSP_GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", - __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); + GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n", + __func__, i, ggml_used_mem(ctx)); if (ctx->mem_buffer_owned) { - WSP_GGML_ALIGNED_FREE(ctx->mem_buffer); + GGML_ALIGNED_FREE(ctx->mem_buffer); } found = true; @@ -4410,17 +4649,17 @@ void wsp_ggml_free(struct wsp_ggml_context * ctx) { } if (!found) { - WSP_GGML_PRINT_DEBUG("%s: context not found\n", __func__); + GGML_PRINT_DEBUG("%s: context not found\n", __func__); } - wsp_ggml_critical_section_end(); + ggml_critical_section_end(); } -size_t wsp_ggml_used_mem(const struct wsp_ggml_context * ctx) { +size_t ggml_used_mem(const struct ggml_context * ctx) { return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size; } -size_t wsp_ggml_set_scratch(struct wsp_ggml_context * ctx, struct wsp_ggml_scratch scratch) { +size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { const size_t result = ctx->scratch.data ? 
ctx->scratch.offs : 0; ctx->scratch = scratch; @@ -4428,30 +4667,36 @@ size_t wsp_ggml_set_scratch(struct wsp_ggml_context * ctx, struct wsp_ggml_scrat return result; } -void wsp_ggml_set_no_alloc(struct wsp_ggml_context * ctx, bool no_alloc) { +bool ggml_get_no_alloc(struct ggml_context * ctx) { + return ctx->no_alloc; +} + +void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { ctx->no_alloc = no_alloc; } -void * wsp_ggml_get_mem_buffer(const struct wsp_ggml_context * ctx) { +void * ggml_get_mem_buffer(const struct ggml_context * ctx) { return ctx->mem_buffer; } -size_t wsp_ggml_get_mem_size(const struct wsp_ggml_context * ctx) { +size_t ggml_get_mem_size(const struct ggml_context * ctx) { return ctx->mem_size; } -size_t wsp_ggml_get_max_tensor_size(const struct wsp_ggml_context * ctx) { +size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { size_t max_size = 0; - struct wsp_ggml_object * obj = ctx->objects_begin; + struct ggml_object * obj = ctx->objects_begin; while (obj != NULL) { - struct wsp_ggml_tensor * tensor = (struct wsp_ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); + if (obj->type == GGML_OBJECT_TENSOR) { + struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); - const size_t size = wsp_ggml_nbytes(tensor); + const size_t size = ggml_nbytes(tensor); - if (max_size < size) { - max_size = size; + if (max_size < size) { + max_size = size; + } } obj = obj->next; @@ -4465,7 +4710,7 @@ size_t wsp_ggml_get_max_tensor_size(const struct wsp_ggml_context * ctx) { // this is an error prone process, but it is necessary to support inplace // operators when using scratch buffers // TODO: implement a better way -void wsp_ggml_scratch_save(struct wsp_ggml_context * ctx) { +static void ggml_scratch_save(struct ggml_context * ctx) { // this is needed to allow opt tensors to store their data // TODO: again, need to find a better way ctx->no_alloc_save = ctx->no_alloc; @@ -4475,7 +4720,7 @@ void wsp_ggml_scratch_save(struct wsp_ggml_context * ctx) { ctx->scratch.data = NULL; } -void wsp_ggml_scratch_load(struct wsp_ggml_context * ctx) { +static void ggml_scratch_load(struct ggml_context * ctx) { ctx->no_alloc = ctx->no_alloc_save; ctx->scratch = ctx->scratch_save; @@ -4483,75 +4728,35 @@ void wsp_ggml_scratch_load(struct wsp_ggml_context * ctx) { //////////////////////////////////////////////////////////////////////////////// -struct wsp_ggml_tensor * wsp_ggml_new_tensor_impl( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, - int n_dims, - const int64_t* ne, - void* data) { +static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) { // always insert objects at the end of the context's memory pool - struct wsp_ggml_object * obj_cur = ctx->objects_end; + struct ggml_object * obj_cur = ctx->objects_end; const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs; const size_t cur_size = obj_cur == NULL ? 
0 : obj_cur->size; const size_t cur_end = cur_offs + cur_size; - size_t size_needed = 0; - - if (data == NULL && !ctx->no_alloc) { - size_needed += WSP_GGML_TYPE_SIZE[type]*(ne[0]/WSP_GGML_BLCK_SIZE[type]); - for (int i = 1; i < n_dims; i++) { - size_needed *= ne[i]; - } - // align to WSP_GGML_MEM_ALIGN - size_needed = ((size_needed + WSP_GGML_MEM_ALIGN - 1)/WSP_GGML_MEM_ALIGN)*WSP_GGML_MEM_ALIGN; - } + // align to GGML_MEM_ALIGN + size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN); char * const mem_buffer = ctx->mem_buffer; - struct wsp_ggml_object * const obj_new = (struct wsp_ggml_object *)(mem_buffer + cur_end); - - if (ctx->scratch.data == NULL || data != NULL) { - size_needed += WSP_GGML_TENSOR_SIZE; - - if (cur_end + size_needed + WSP_GGML_OBJECT_SIZE > ctx->mem_size) { - WSP_GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + size_needed + WSP_GGML_OBJECT_SIZE, ctx->mem_size); - assert(false); - return NULL; - } - - *obj_new = (struct wsp_ggml_object) { - .offs = cur_end + WSP_GGML_OBJECT_SIZE, - .size = size_needed, - .next = NULL, - }; - } else { - if (ctx->scratch.offs + size_needed > ctx->scratch.size) { - WSP_GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", - __func__, ctx->scratch.offs + size_needed, ctx->scratch.size); - assert(false); - return NULL; - } - - if (cur_end + WSP_GGML_TENSOR_SIZE + WSP_GGML_OBJECT_SIZE > ctx->mem_size) { - WSP_GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + WSP_GGML_TENSOR_SIZE + WSP_GGML_OBJECT_SIZE, ctx->mem_size); - assert(false); - return NULL; - } + struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); - data = (char * const) ctx->scratch.data + ctx->scratch.offs; - - *obj_new = (struct wsp_ggml_object) { - .offs = cur_end + WSP_GGML_OBJECT_SIZE, - .size = WSP_GGML_TENSOR_SIZE, - .next = NULL, - }; + if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { + GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", + __func__, cur_end + size_needed, ctx->mem_size); + assert(false); + return NULL; + } - //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed); + *obj_new = (struct ggml_object) { + .offs = cur_end + GGML_OBJECT_SIZE, + .size = size_needed, + .next = NULL, + .type = type, + }; - ctx->scratch.offs += size_needed; - } + ggml_assert_aligned(mem_buffer + obj_new->offs); if (obj_cur != NULL) { obj_cur->next = obj_new; @@ -4564,42 +4769,96 @@ struct wsp_ggml_tensor * wsp_ggml_new_tensor_impl( //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size); - struct wsp_ggml_tensor * const result = (struct wsp_ggml_tensor *)(mem_buffer + obj_new->offs); + return obj_new; +} + +static struct ggml_tensor * ggml_new_tensor_impl( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t * ne, + struct ggml_tensor * view_src, + size_t view_offs) { + + assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); + + // find the base tensor and absolute offset + if (view_src != NULL && view_src->view_src != NULL) { + view_offs += view_src->view_offs; + view_src = view_src->view_src; + } + + size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type)); + for (int i = 1; i < n_dims; i++) { + data_size *= ne[i]; + } + + GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src)); + + void * data = view_src != 
NULL ? view_src->data : NULL; + if (data != NULL) { + data = (char *) data + view_offs; + } + + size_t obj_alloc_size = 0; + + if (view_src == NULL && !ctx->no_alloc) { + if (ctx->scratch.data != NULL) { + // allocate tensor data in the scratch buffer + if (ctx->scratch.offs + data_size > ctx->scratch.size) { + GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + __func__, ctx->scratch.offs + data_size, ctx->scratch.size); + assert(false); + return NULL; + } + + data = (char * const) ctx->scratch.data + ctx->scratch.offs; + + ctx->scratch.offs += data_size; + } else { + // allocate tensor data in the context's memory pool + obj_alloc_size = data_size; + } + } + + struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); - wsp_ggml_assert_aligned(result); + // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here - *result = (struct wsp_ggml_tensor) { + struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs); + + *result = (struct ggml_tensor) { /*.type =*/ type, - /*.backend =*/ WSP_GGML_BACKEND_CPU, + /*.backend =*/ GGML_BACKEND_CPU, /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, - /*.op =*/ WSP_GGML_OP_NONE, + /*.op =*/ GGML_OP_NONE, + /*.op_params =*/ { 0 }, /*.is_param =*/ false, /*.grad =*/ NULL, - /*.src0 =*/ NULL, - /*.src1 =*/ NULL, - /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, + /*.src =*/ { NULL }, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, - /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data, + /*.view_src =*/ view_src, + /*.view_offs =*/ view_offs, + /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.pad =*/ { 0 }, + /*.padding =*/ { 0 }, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads - //wsp_ggml_assert_aligned(result->data); + //ggml_assert_aligned(result->data); for (int i = 0; i < n_dims; i++) { result->ne[i] = ne[i]; } - result->nb[0] = WSP_GGML_TYPE_SIZE[type]; - result->nb[1] = result->nb[0]*(result->ne[0]/WSP_GGML_BLCK_SIZE[type]); - for (int i = 2; i < WSP_GGML_MAX_DIMS; i++) { + result->nb[0] = ggml_type_size(type); + result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type)); + for (int i = 2; i < GGML_MAX_DIMS; i++) { result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; } @@ -4608,348 +4867,369 @@ struct wsp_ggml_tensor * wsp_ggml_new_tensor_impl( return result; } -struct wsp_ggml_tensor * wsp_ggml_new_tensor( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, - int n_dims, - const int64_t * ne) { - return wsp_ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL); +struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t * ne) { + return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0); } -struct wsp_ggml_tensor * wsp_ggml_new_tensor_1d( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, +struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, int64_t ne0) { - return wsp_ggml_new_tensor(ctx, type, 1, &ne0); + return ggml_new_tensor(ctx, type, 1, &ne0); } -struct wsp_ggml_tensor * wsp_ggml_new_tensor_2d( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, +struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, int64_t ne0, int64_t ne1) { const int64_t 
ne[2] = { ne0, ne1 }; - return wsp_ggml_new_tensor(ctx, type, 2, ne); + return ggml_new_tensor(ctx, type, 2, ne); } -struct wsp_ggml_tensor * wsp_ggml_new_tensor_3d( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, +struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) { const int64_t ne[3] = { ne0, ne1, ne2 }; - return wsp_ggml_new_tensor(ctx, type, 3, ne); + return ggml_new_tensor(ctx, type, 3, ne); } -struct wsp_ggml_tensor * wsp_ggml_new_tensor_4d( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, +struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - return wsp_ggml_new_tensor(ctx, type, 4, ne); + return ggml_new_tensor(ctx, type, 4, ne); } -struct wsp_ggml_tensor * wsp_ggml_new_i32(struct wsp_ggml_context * ctx, int32_t value) { - wsp_ggml_scratch_save(ctx); +struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { + ggml_scratch_save(ctx); - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 1); + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); - wsp_ggml_scratch_load(ctx); + ggml_scratch_load(ctx); - wsp_ggml_set_i32(result, value); + ggml_set_i32(result, value); return result; } -struct wsp_ggml_tensor * wsp_ggml_new_f32(struct wsp_ggml_context * ctx, float value) { - wsp_ggml_scratch_save(ctx); +struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { + ggml_scratch_save(ctx); - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 1); + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - wsp_ggml_scratch_load(ctx); + ggml_scratch_load(ctx); - wsp_ggml_set_f32(result, value); + ggml_set_f32(result, value); return result; } -struct wsp_ggml_tensor * wsp_ggml_dup_tensor(struct wsp_ggml_context * ctx, const struct wsp_ggml_tensor * src) { - return wsp_ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL); +struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { + return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne); +} + +static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings + assert(params_size <= GGML_MAX_OP_PARAMS); + memcpy(tensor->op_params, params, params_size); +} + +static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + return ((const int32_t *)(tensor->op_params))[i]; +} + +static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + ((int32_t *)(tensor->op_params))[i] = value; } -struct wsp_ggml_tensor * wsp_ggml_set_zero(struct wsp_ggml_tensor * tensor) { - memset(tensor->data, 0, wsp_ggml_nbytes(tensor)); +struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { + memset(tensor->data, 0, ggml_nbytes(tensor)); return tensor; } -struct wsp_ggml_tensor * wsp_ggml_set_i32 (struct wsp_ggml_tensor * tensor, int32_t value) { - const int n = wsp_ggml_nrows(tensor); +struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { + const int n = ggml_nrows(tensor); const int nc = tensor->ne[0]; const size_t n1 = tensor->nb[1]; char * 
const data = tensor->data; switch (tensor->type) { - case WSP_GGML_TYPE_I8: + case GGML_TYPE_I8: { assert(tensor->nb[0] == sizeof(int8_t)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); } } break; - case WSP_GGML_TYPE_I16: + case GGML_TYPE_I16: { assert(tensor->nb[0] == sizeof(int16_t)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); } } break; - case WSP_GGML_TYPE_I32: + case GGML_TYPE_I32: { assert(tensor->nb[0] == sizeof(int32_t)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); } } break; - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - assert(tensor->nb[0] == sizeof(wsp_ggml_fp16_t)); + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_set_f16(nc, (wsp_ggml_fp16_t *)(data + i*n1), value); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { assert(tensor->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + ggml_vec_set_f32(nc, (float *)(data + i*n1), value); } } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } return tensor; } -struct wsp_ggml_tensor * wsp_ggml_set_f32(struct wsp_ggml_tensor * tensor, float value) { - const int n = wsp_ggml_nrows(tensor); +struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { + const int n = ggml_nrows(tensor); const int nc = tensor->ne[0]; const size_t n1 = tensor->nb[1]; char * const data = tensor->data; switch (tensor->type) { - case WSP_GGML_TYPE_I8: + case GGML_TYPE_I8: { assert(tensor->nb[0] == sizeof(int8_t)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); + ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); } } break; - case WSP_GGML_TYPE_I16: + case GGML_TYPE_I16: { assert(tensor->nb[0] == sizeof(int16_t)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); + ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); } } break; - case WSP_GGML_TYPE_I32: + case GGML_TYPE_I32: { assert(tensor->nb[0] == sizeof(int32_t)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); + ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); } } break; - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - assert(tensor->nb[0] == sizeof(wsp_ggml_fp16_t)); + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_set_f16(nc, (wsp_ggml_fp16_t *)(data + i*n1), value); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { assert(tensor->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + ggml_vec_set_f32(nc, (float *)(data + i*n1), value); } } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } return tensor; } -int32_t wsp_ggml_get_i32_1d(const struct wsp_ggml_tensor * tensor, int i) { +int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { switch (tensor->type) { - case WSP_GGML_TYPE_I8: + case GGML_TYPE_I8: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + GGML_ASSERT(tensor->nb[0] == 
sizeof(int8_t)); return ((int8_t *)(tensor->data))[i]; } break; - case WSP_GGML_TYPE_I16: + case GGML_TYPE_I16: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); return ((int16_t *)(tensor->data))[i]; } break; - case WSP_GGML_TYPE_I32: + case GGML_TYPE_I32: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); return ((int32_t *)(tensor->data))[i]; } break; - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(wsp_ggml_fp16_t)); - return WSP_GGML_FP16_TO_FP32(((wsp_ggml_fp16_t *)(tensor->data))[i]); + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(float)); + GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } return 0.0f; } -void wsp_ggml_set_i32_1d(const struct wsp_ggml_tensor * tensor, int i, int32_t value) { +void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { switch (tensor->type) { - case WSP_GGML_TYPE_I8: + case GGML_TYPE_I8: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); ((int8_t *)(tensor->data))[i] = value; } break; - case WSP_GGML_TYPE_I16: + case GGML_TYPE_I16: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); ((int16_t *)(tensor->data))[i] = value; } break; - case WSP_GGML_TYPE_I32: + case GGML_TYPE_I32: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); ((int32_t *)(tensor->data))[i] = value; } break; - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(wsp_ggml_fp16_t)); - ((wsp_ggml_fp16_t *)(tensor->data))[i] = WSP_GGML_FP32_TO_FP16(value); + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(float)); + GGML_ASSERT(tensor->nb[0] == sizeof(float)); ((float *)(tensor->data))[i] = value; } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -float wsp_ggml_get_f32_1d(const struct wsp_ggml_tensor * tensor, int i) { +float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { switch (tensor->type) { - case WSP_GGML_TYPE_I8: + case GGML_TYPE_I8: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); return ((int8_t *)(tensor->data))[i]; } break; - case WSP_GGML_TYPE_I16: + case GGML_TYPE_I16: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); return ((int16_t *)(tensor->data))[i]; } break; - case WSP_GGML_TYPE_I32: + case GGML_TYPE_I32: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); return ((int32_t *)(tensor->data))[i]; } break; - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(wsp_ggml_fp16_t)); - return WSP_GGML_FP16_TO_FP32(((wsp_ggml_fp16_t *)(tensor->data))[i]); + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - 
WSP_GGML_ASSERT(tensor->nb[0] == sizeof(float)); + GGML_ASSERT(tensor->nb[0] == sizeof(float)); return ((float *)(tensor->data))[i]; } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } return 0.0f; } -void wsp_ggml_set_f32_1d(const struct wsp_ggml_tensor * tensor, int i, float value) { +void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { switch (tensor->type) { - case WSP_GGML_TYPE_I8: + case GGML_TYPE_I8: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); ((int8_t *)(tensor->data))[i] = value; } break; - case WSP_GGML_TYPE_I16: + case GGML_TYPE_I16: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); ((int16_t *)(tensor->data))[i] = value; } break; - case WSP_GGML_TYPE_I32: + case GGML_TYPE_I32: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); + GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); ((int32_t *)(tensor->data))[i] = value; } break; - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(wsp_ggml_fp16_t)); - ((wsp_ggml_fp16_t *)(tensor->data))[i] = WSP_GGML_FP32_TO_FP16(value); + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - WSP_GGML_ASSERT(tensor->nb[0] == sizeof(float)); + GGML_ASSERT(tensor->nb[0] == sizeof(float)); ((float *)(tensor->data))[i] = value; } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -void * wsp_ggml_get_data(const struct wsp_ggml_tensor * tensor) { +void * ggml_get_data(const struct ggml_tensor * tensor) { return tensor->data; } -float * wsp_ggml_get_data_f32(const struct wsp_ggml_tensor * tensor) { - assert(tensor->type == WSP_GGML_TYPE_F32); +float * ggml_get_data_f32(const struct ggml_tensor * tensor) { + assert(tensor->type == GGML_TYPE_F32); return (float *)(tensor->data); } -const char * wsp_ggml_get_name(const struct wsp_ggml_tensor * tensor) { +enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) { + GGML_ASSERT(tensor->op == GGML_OP_UNARY); + return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0); +} + +const char * ggml_get_name(const struct ggml_tensor * tensor) { return tensor->name; } -struct wsp_ggml_tensor * wsp_ggml_set_name(struct wsp_ggml_tensor * tensor, const char * name) { +struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) { strncpy(tensor->name, name, sizeof(tensor->name)); tensor->name[sizeof(tensor->name) - 1] = '\0'; return tensor; } -struct wsp_ggml_tensor * wsp_ggml_format_name(struct wsp_ggml_tensor * tensor, const char * fmt, ...) { +struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) 
{ va_list args; va_start(args, fmt); vsnprintf(tensor->name, sizeof(tensor->name), fmt, args); @@ -4957,29 +5237,30 @@ struct wsp_ggml_tensor * wsp_ggml_format_name(struct wsp_ggml_tensor * tensor, c return tensor; } -struct wsp_ggml_tensor * wsp_ggml_view_tensor( - struct wsp_ggml_context * ctx, - const struct wsp_ggml_tensor * src) { - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); - wsp_ggml_format_name(result, "%s (view)", src->name); +struct ggml_tensor * ggml_view_tensor( + struct ggml_context * ctx, + struct ggml_tensor * src) { + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0); + ggml_format_name(result, "%s (view)", src->name); - result->nb[0] = src->nb[0]; - result->nb[1] = src->nb[1]; - result->nb[2] = src->nb[2]; - result->nb[3] = src->nb[3]; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = src->nb[i]; + } return result; } -struct wsp_ggml_tensor * wsp_ggml_get_tensor(struct wsp_ggml_context * ctx, const char * name) { - struct wsp_ggml_object * obj = ctx->objects_begin; +struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { + struct ggml_object * obj = ctx->objects_begin; char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - struct wsp_ggml_tensor * cur = (struct wsp_ggml_tensor *)(mem_buffer + obj->offs); - if (strcmp(cur->name, name) == 0) { - return cur; + if (obj->type == GGML_OBJECT_TENSOR) { + struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); + if (strcmp(cur->name, name) == 0) { + return cur; + } } obj = obj->next; @@ -4990,11 +5271,11 @@ struct wsp_ggml_tensor * wsp_ggml_get_tensor(struct wsp_ggml_context * ctx, cons //////////////////////////////////////////////////////////////////////////////// -// wsp_ggml_dup +// ggml_dup -struct wsp_ggml_tensor * wsp_ggml_dup_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_dup_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, bool inplace) { bool is_node = false; @@ -5002,76 +5283,79 @@ struct wsp_ggml_tensor * wsp_ggml_dup_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_DUP; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_DUP; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_dup( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_dup_impl(ctx, a, false); +struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_dup_impl(ctx, a, false); } -struct wsp_ggml_tensor * wsp_ggml_dup_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_dup_impl(ctx, a, true); +struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_dup_impl(ctx, a, true); } -// wsp_ggml_add +// ggml_add -struct wsp_ggml_tensor * wsp_ggml_add_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +static struct ggml_tensor * ggml_add_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, bool inplace) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(a, b)); + // TODO: support less-strict constraint + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; - if (a->grad || b->grad) { + if (!inplace && (a->grad || b->grad)) { + // TODO: support backward pass for broadcasting + GGML_ASSERT(ggml_are_same_shape(a, b)); is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_ADD; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_ADD; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_add( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_add_impl(ctx, a, b, false); +struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add_impl(ctx, a, b, false); } -struct wsp_ggml_tensor * wsp_ggml_add_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_add_impl(ctx, a, b, true); +struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add_impl(ctx, a, b, true); } -// wsp_ggml_add1 +// ggml_add1 -struct wsp_ggml_tensor * wsp_ggml_add1_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +static struct ggml_tensor * ggml_add1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, bool inplace) { - WSP_GGML_ASSERT(wsp_ggml_is_scalar(b)); - WSP_GGML_ASSERT(wsp_ggml_is_padded_1d(a)); + GGML_ASSERT(ggml_is_scalar(b)); + GGML_ASSERT(ggml_is_padded_1d(a)); bool is_node = false; @@ -5079,45 +5363,45 @@ struct wsp_ggml_tensor * wsp_ggml_add1_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_ADD1; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_ADD1; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_add1( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_add1_impl(ctx, a, b, false); +struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add1_impl(ctx, a, b, false); } -struct wsp_ggml_tensor * wsp_ggml_add1_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_add1_impl(ctx, a, b, true); +struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_add1_impl(ctx, a, b, true); } -// wsp_ggml_acc +// ggml_acc -struct wsp_ggml_tensor * wsp_ggml_acc_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +static struct ggml_tensor * ggml_acc_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, bool inplace) { - WSP_GGML_ASSERT(wsp_ggml_nelements(b) <= wsp_ggml_nelements(a)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(a)); - WSP_GGML_ASSERT(a->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT(b->type == WSP_GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a)); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); bool is_node = false; @@ -5125,59 +5409,49 @@ struct wsp_ggml_tensor * wsp_ggml_acc_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - wsp_ggml_scratch_save(ctx); + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_set_op_params(result, params, sizeof(params)); - struct wsp_ggml_tensor * c = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 5); - - ((int32_t *) c->data)[0] = nb1; - ((int32_t *) c->data)[1] = nb2; - ((int32_t *) c->data)[2] = nb3; - ((int32_t *) c->data)[3] = offset; - ((int32_t *) c->data)[4] = inplace ? 1 : 0; - - wsp_ggml_scratch_load(ctx); - - result->op = WSP_GGML_OP_ACC; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->op = GGML_OP_ACC; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_acc( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset) { - return wsp_ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false); } -struct wsp_ggml_tensor * wsp_ggml_acc_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset) { - return wsp_ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); + return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true); } -// wsp_ggml_sub +// ggml_sub -struct wsp_ggml_tensor * wsp_ggml_sub_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +static struct ggml_tensor * ggml_sub_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, bool inplace) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_are_same_shape(a, b)); bool is_node = false; @@ -5185,85 +5459,85 @@ struct wsp_ggml_tensor * wsp_ggml_sub_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_SUB; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_SUB; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_sub( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_sub_impl(ctx, a, b, false); +struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_sub_impl(ctx, a, b, false); } -struct wsp_ggml_tensor * wsp_ggml_sub_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_sub_impl(ctx, a, b, true); +struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_sub_impl(ctx, a, b, true); } -// wsp_ggml_mul +// ggml_mul -struct wsp_ggml_tensor * wsp_ggml_mul_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +static struct ggml_tensor * ggml_mul_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, bool inplace) { // TODO: support less-strict constraint - // WSP_GGML_ASSERT(wsp_ggml_can_repeat(b, a)); - WSP_GGML_ASSERT(wsp_ggml_can_repeat_rows(b, a)); + // GGML_ASSERT(ggml_can_repeat(b, a)); + GGML_ASSERT(ggml_can_repeat_rows(b, a)); bool is_node = false; if (!inplace && (a->grad || b->grad)) { // TODO: support backward pass for broadcasting - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_are_same_shape(a, b)); is_node = true; } if (inplace) { - WSP_GGML_ASSERT(is_node == false); + GGML_ASSERT(!is_node); } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_MUL; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_MUL; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_mul( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_mul_impl(ctx, a, b, false); +struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_mul_impl(ctx, a, b, false); } -struct wsp_ggml_tensor * wsp_ggml_mul_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_mul_impl(ctx, a, b, true); +struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_mul_impl(ctx, a, b, true); } -// wsp_ggml_div +// ggml_div -struct wsp_ggml_tensor * wsp_ggml_div_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +static struct ggml_tensor * ggml_div_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, bool inplace) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_are_same_shape(a, b)); bool is_node = false; @@ -5272,38 +5546,38 @@ struct wsp_ggml_tensor * wsp_ggml_div_impl( } if (inplace) { - WSP_GGML_ASSERT(is_node == false); + GGML_ASSERT(!is_node); } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_DIV; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_DIV; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_div( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_div_impl(ctx, a, b, false); +struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_div_impl(ctx, a, b, false); } -struct wsp_ggml_tensor * wsp_ggml_div_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_div_impl(ctx, a, b, true); +struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_div_impl(ctx, a, b, true); } -// wsp_ggml_sqr +// ggml_sqr -struct wsp_ggml_tensor * wsp_ggml_sqr_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_sqr_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, bool inplace) { bool is_node = false; @@ -5311,33 +5585,32 @@ struct wsp_ggml_tensor * wsp_ggml_sqr_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_SQR; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_SQR; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_sqr( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_sqr_impl(ctx, a, false); +struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqr_impl(ctx, a, false); } -struct wsp_ggml_tensor * wsp_ggml_sqr_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_sqr_impl(ctx, a, true); +struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqr_impl(ctx, a, true); } -// wsp_ggml_sqrt +// ggml_sqrt -struct wsp_ggml_tensor * wsp_ggml_sqrt_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_sqrt_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, bool inplace) { bool is_node = false; @@ -5345,34 +5618,33 @@ struct wsp_ggml_tensor * wsp_ggml_sqrt_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_SQRT; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_SQRT; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_sqrt( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_sqrt_impl(ctx, a, false); +struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqrt_impl(ctx, a, false); } -struct wsp_ggml_tensor * wsp_ggml_sqrt_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_sqrt_impl(ctx, a, true); +struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_sqrt_impl(ctx, a, true); } -// wsp_ggml_log +// ggml_log -struct wsp_ggml_tensor * wsp_ggml_log_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_log_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, bool inplace) { bool is_node = false; @@ -5380,55 +5652,53 @@ struct wsp_ggml_tensor * wsp_ggml_log_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_LOG; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_LOG; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_log( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_log_impl(ctx, a, false); +struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_log_impl(ctx, a, false); } -struct wsp_ggml_tensor * wsp_ggml_log_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_log_impl(ctx, a, true); +struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_log_impl(ctx, a, true); } -// wsp_ggml_sum +// ggml_sum -struct wsp_ggml_tensor * wsp_ggml_sum( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { +struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a) { bool is_node = false; if (a->grad) { is_node = true; } - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_1d(ctx, a->type, 1); + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); - result->op = WSP_GGML_OP_SUM; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_SUM; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_sum_rows +// ggml_sum_rows -struct wsp_ggml_tensor * wsp_ggml_sum_rows( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { +struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a) { bool is_node = false; if (a->grad) { @@ -5440,70 +5710,67 @@ struct wsp_ggml_tensor * wsp_ggml_sum_rows( ne[i] = a->ne[i]; } - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, a->type, a->n_dims, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne); - result->op = WSP_GGML_OP_SUM_ROWS; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_SUM_ROWS; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_mean +// ggml_mean -struct wsp_ggml_tensor * wsp_ggml_mean( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { +struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a) { bool is_node = false; if (a->grad) { - WSP_GGML_ASSERT(false); // TODO: implement + GGML_ASSERT(false); // TODO: implement is_node = true; } - int64_t ne[WSP_GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, a->n_dims, ne); + int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne); - result->op = WSP_GGML_OP_MEAN; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_MEAN; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_argmax +// ggml_argmax -struct wsp_ggml_tensor * wsp_ggml_argmax( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - WSP_GGML_ASSERT(wsp_ggml_is_matrix(a)); +struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(ggml_is_matrix(a)); bool is_node = false; if (a->grad) { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); is_node = true; } - int64_t ne[WSP_GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_I32, a->n_dims, ne); + int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne); - result->op = WSP_GGML_OP_ARGMAX; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_ARGMAX; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_repeat +// ggml_repeat -struct wsp_ggml_tensor * wsp_ggml_repeat( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - WSP_GGML_ASSERT(wsp_ggml_can_repeat(a, b)); +struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(a, b)); bool is_node = false; @@ -5511,27 +5778,23 @@ struct wsp_ggml_tensor * wsp_ggml_repeat( is_node = true; } - if (wsp_ggml_are_same_shape(a, b) && !is_node) { - return a; - } - - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); - result->op = WSP_GGML_OP_REPEAT; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_REPEAT; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -// wsp_ggml_repeat_back +// ggml_repeat_back -struct wsp_ggml_tensor * wsp_ggml_repeat_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - WSP_GGML_ASSERT(wsp_ggml_can_repeat(b, a)); +struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_repeat(b, a)); bool is_node = false; @@ -5539,332 +5802,252 @@ struct wsp_ggml_tensor * wsp_ggml_repeat_back( is_node = true; } - if (wsp_ggml_are_same_shape(a, b) && !is_node) { + if (ggml_are_same_shape(a, b) && !is_node) { return a; } - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); - result->op = WSP_GGML_OP_REPEAT_BACK; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_REPEAT_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -// wsp_ggml_abs +// ggml_concat + +struct ggml_tensor * ggml_concat( + struct ggml_context* ctx, + struct ggml_tensor* a, + struct ggml_tensor* b) { + GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]); -struct wsp_ggml_tensor * wsp_ggml_abs_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - bool inplace) { bool is_node = false; - if (!inplace && (a->grad)) { + if (a->grad || b->grad) { is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]); - result->op = WSP_GGML_OP_ABS; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_CONCAT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_abs( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_abs_impl(ctx, a, false); +// ggml_abs + +struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_ABS); } -struct wsp_ggml_tensor * wsp_ggml_abs_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_abs_impl(ctx, a, true); +struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS); } +// ggml_sgn -// wsp_ggml_sgn +struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_SGN); +} -struct wsp_ggml_tensor * wsp_ggml_sgn_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - bool inplace) { - bool is_node = false; +struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN); +} - if (!inplace && (a->grad)) { - is_node = true; - } - - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); - - result->op = WSP_GGML_OP_SGN; - result->grad = is_node ? 
wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; +// ggml_neg - return result; -} - -struct wsp_ggml_tensor * wsp_ggml_sgn( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_sgn_impl(ctx, a, false); +struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_NEG); } -struct wsp_ggml_tensor * wsp_ggml_sgn_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_sgn_impl(ctx, a, true); +struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG); } -// wsp_ggml_neg - -struct wsp_ggml_tensor * wsp_ggml_neg_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); - - result->op = WSP_GGML_OP_NEG; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; +// ggml_step - return result; -} - -struct wsp_ggml_tensor * wsp_ggml_neg( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_neg_impl(ctx, a, false); +struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_STEP); } -struct wsp_ggml_tensor * wsp_ggml_neg_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_neg_impl(ctx, a, true); +struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP); } -// wsp_ggml_step - -struct wsp_ggml_tensor * wsp_ggml_step_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); - - result->op = WSP_GGML_OP_STEP; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; +// ggml_tanh - return result; -} - -struct wsp_ggml_tensor * wsp_ggml_step( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_step_impl(ctx, a, false); +struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_TANH); } -struct wsp_ggml_tensor * wsp_ggml_step_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_step_impl(ctx, a, true); +struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH); } -// wsp_ggml_tanh - -struct wsp_ggml_tensor * wsp_ggml_tanh_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); - - result->op = WSP_GGML_OP_TANH; - result->grad = is_node ? 
wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; +// ggml_elu - return result; -} - -struct wsp_ggml_tensor * wsp_ggml_tanh( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_tanh_impl(ctx, a, false); +struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_ELU); } -struct wsp_ggml_tensor * wsp_ggml_tanh_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_tanh_impl(ctx, a, true); +struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU); } -// wsp_ggml_elu - -struct wsp_ggml_tensor * wsp_ggml_elu_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); - - result->op = WSP_GGML_OP_ELU; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; +// ggml_relu - return result; -} - -struct wsp_ggml_tensor * wsp_ggml_elu( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_elu_impl(ctx, a, false); +struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_RELU); } -struct wsp_ggml_tensor * wsp_ggml_elu_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_elu_impl(ctx, a, true); +struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU); } -// wsp_ggml_relu +// ggml_gelu -struct wsp_ggml_tensor * wsp_ggml_relu_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - bool inplace) { - bool is_node = false; +struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_GELU); +} - if (!inplace && (a->grad)) { - is_node = true; - } +struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU); +} - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); +// ggml_gelu_quick - result->op = WSP_GGML_OP_RELU; - result->grad = is_node ? 
wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; +struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK); +} - return result; +struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK); } -struct wsp_ggml_tensor * wsp_ggml_relu( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_relu_impl(ctx, a, false); +// ggml_silu + +struct ggml_tensor * ggml_silu( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_SILU); } -struct wsp_ggml_tensor * wsp_ggml_relu_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_relu_impl(ctx, a, true); +struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU); } -// wsp_ggml_gelu +// ggml_silu_back -struct wsp_ggml_tensor * wsp_ggml_gelu_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - bool inplace) { +struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { bool is_node = false; - if (!inplace && (a->grad)) { + if (a->grad || b->grad) { + // TODO: implement backward is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_GELU; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_SILU_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_gelu( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_gelu_impl(ctx, a, false); -} - -struct wsp_ggml_tensor * wsp_ggml_gelu_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_gelu_impl(ctx, a, true); -} - -// wsp_ggml_gelu_quick +// ggml_norm -struct wsp_ggml_tensor * wsp_ggml_gelu_quick_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps, bool inplace) { bool is_node = false; if (!inplace && (a->grad)) { + GGML_ASSERT(false); // TODO: implement backward is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, &eps, sizeof(eps)); - result->op = WSP_GGML_OP_GELU_QUICK; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_NORM; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_gelu_quick( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_gelu_quick_impl(ctx, a, false); +struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_norm_impl(ctx, a, eps, false); } -struct wsp_ggml_tensor * wsp_ggml_gelu_quick_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_gelu_quick_impl(ctx, a, true); +struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_norm_impl(ctx, a, eps, true); } -// wsp_ggml_silu +// ggml_rms_norm -struct wsp_ggml_tensor * wsp_ggml_silu_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_rms_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps, bool inplace) { bool is_node = false; @@ -5872,148 +6055,104 @@ struct wsp_ggml_tensor * wsp_ggml_silu_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_SILU; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + ggml_set_op_params(result, &eps, sizeof(eps)); + + result->op = GGML_OP_RMS_NORM; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_silu( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_silu_impl(ctx, a, false); +struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_rms_norm_impl(ctx, a, eps, false); } -struct wsp_ggml_tensor * wsp_ggml_silu_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_silu_impl(ctx, a, true); +struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_rms_norm_impl(ctx, a, eps, true); } -// wsp_ggml_silu_back +// ggml_rms_norm_back -struct wsp_ggml_tensor * wsp_ggml_silu_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { +struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float eps) { bool is_node = false; - if (a->grad || b->grad) { + if (a->grad) { // TODO: implement backward is_node = true; } - struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, a); - - result->op = WSP_GGML_OP_SILU_BACK; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// wsp_ggml_norm - -struct wsp_ggml_tensor * wsp_ggml_norm_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - WSP_GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + ggml_set_op_params(result, &eps, sizeof(eps)); - result->op = WSP_GGML_OP_NORM; - result->grad = is_node ? 
wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? + result->op = GGML_OP_RMS_NORM_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_norm( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_norm_impl(ctx, a, false); -} +// ggml_group_norm -struct wsp_ggml_tensor * wsp_ggml_norm_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_norm_impl(ctx, a, true); -} +static struct ggml_tensor * ggml_group_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + bool inplace) { -struct wsp_ggml_tensor * wsp_ggml_rms_norm_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - bool inplace) { bool is_node = false; - if (!inplace && (a->grad)) { + GGML_ASSERT(false); // TODO: implement backward is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_RMS_NORM; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? + result->op = GGML_OP_GROUP_NORM; + result->op_params[0] = n_groups; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } -struct wsp_ggml_tensor * wsp_ggml_rms_norm( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_rms_norm_impl(ctx, a, false); -} - -struct wsp_ggml_tensor * wsp_ggml_rms_norm_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_rms_norm_impl(ctx, a, true); +struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups) { + return ggml_group_norm_impl(ctx, a, n_groups, false); } -struct wsp_ggml_tensor * wsp_ggml_rms_norm_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - bool is_node = false; - - if (a->grad) { - // TODO: implement backward - is_node = true; - } - - struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, a); - - result->op = WSP_GGML_OP_RMS_NORM_BACK; - result->grad = is_node ? 
wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; +struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups) { + return ggml_group_norm_impl(ctx, a, n_groups, true); } +// ggml_mul_mat -// wsp_ggml_mul_mat - -struct wsp_ggml_tensor * wsp_ggml_mul_mat( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - WSP_GGML_ASSERT(wsp_ggml_can_mul_mat(a, b)); - WSP_GGML_ASSERT(!wsp_ggml_is_transposed(a)); +struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_mul_mat(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); bool is_node = false; @@ -6021,25 +6160,25 @@ struct wsp_ggml_tensor * wsp_ggml_mul_mat( is_node = true; } - const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); - result->op = WSP_GGML_OP_MUL_MAT; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_MUL_MAT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -// wsp_ggml_out_prod +// ggml_out_prod -struct wsp_ggml_tensor * wsp_ggml_out_prod( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - WSP_GGML_ASSERT(wsp_ggml_can_out_prod(a, b)); - WSP_GGML_ASSERT(!wsp_ggml_is_transposed(a)); +struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_can_out_prod(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); bool is_node = false; @@ -6048,25 +6187,25 @@ struct wsp_ggml_tensor * wsp_ggml_out_prod( } const int64_t ne[4] = { a->ne[0], b->ne[0], a->ne[2], b->ne[3] }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); - result->op = WSP_GGML_OP_OUT_PROD; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_OUT_PROD; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -// wsp_ggml_scale +// ggml_scale -struct wsp_ggml_tensor * wsp_ggml_scale_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +static struct ggml_tensor * ggml_scale_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, bool inplace) { - WSP_GGML_ASSERT(wsp_ggml_is_scalar(b)); - WSP_GGML_ASSERT(wsp_ggml_is_padded_1d(a)); + GGML_ASSERT(ggml_is_scalar(b)); + GGML_ASSERT(ggml_is_padded_1d(a)); bool is_node = false; @@ -6074,42 +6213,42 @@ struct wsp_ggml_tensor * wsp_ggml_scale_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_SCALE; - result->grad = is_node ? 
wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_SCALE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_scale( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_scale_impl(ctx, a, b, false); +struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_scale_impl(ctx, a, b, false); } -struct wsp_ggml_tensor * wsp_ggml_scale_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_scale_impl(ctx, a, b, true); +struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_scale_impl(ctx, a, b, true); } -// wsp_ggml_set +// ggml_set -struct wsp_ggml_tensor * wsp_ggml_set_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +static struct ggml_tensor * ggml_set_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, bool inplace) { - WSP_GGML_ASSERT(wsp_ggml_nelements(a) >= wsp_ggml_nelements(b)); + GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b)); bool is_node = false; @@ -6118,94 +6257,84 @@ struct wsp_ggml_tensor * wsp_ggml_set_impl( } // make a view of the destination - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); - - wsp_ggml_scratch_save(ctx); - - struct wsp_ggml_tensor * c = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 5); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - (( int32_t * ) c->data)[0] = nb1; - (( int32_t * ) c->data)[1] = nb2; - (( int32_t * ) c->data)[2] = nb3; - (( int32_t * ) c->data)[3] = offset; - (( int32_t * ) c->data)[4] = inplace ? 1 : 0; + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_set_op_params(result, params, sizeof(params)); - wsp_ggml_scratch_load(ctx); - - result->op = WSP_GGML_OP_SET; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->op = GGML_OP_SET; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_set( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset) { - return wsp_ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false); + return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false); } -struct wsp_ggml_tensor * wsp_ggml_set_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset) { - return wsp_ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true); + return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true); } -struct wsp_ggml_tensor * wsp_ggml_set_1d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t offset) { - return wsp_ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false); + return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false); } -struct wsp_ggml_tensor * wsp_ggml_set_1d_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t offset) { - return wsp_ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true); + return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true); } -struct wsp_ggml_tensor * wsp_ggml_set_2d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t offset) { - return wsp_ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); + return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); } -struct wsp_ggml_tensor * wsp_ggml_set_2d_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t offset) { - return wsp_ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); + return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); } -// wsp_ggml_cpy +// ggml_cpy -struct wsp_ggml_tensor * wsp_ggml_cpy_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +static struct ggml_tensor * ggml_cpy_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, bool inplace) { - WSP_GGML_ASSERT(wsp_ggml_nelements(a) == wsp_ggml_nelements(b)); + GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); bool is_node = false; @@ -6214,40 +6343,40 @@ struct wsp_ggml_tensor * wsp_ggml_cpy_impl( } // make a view of the destination - struct wsp_ggml_tensor * result = wsp_ggml_view_tensor(ctx, b); + struct ggml_tensor * result = ggml_view_tensor(ctx, b); if (strlen(b->name) > 0) { - wsp_ggml_format_name(result, "%s (copy of %s)", b->name, a->name); + 
ggml_format_name(result, "%s (copy of %s)", b->name, a->name); } else { - wsp_ggml_format_name(result, "%s (copy)", a->name); + ggml_format_name(result, "%s (copy)", a->name); } - result->op = WSP_GGML_OP_CPY; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_CPY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_cpy( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_cpy_impl(ctx, a, b, false); +struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_cpy_impl(ctx, a, b, false); } -struct wsp_ggml_tensor * wsp_ggml_cpy_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_cpy_impl(ctx, a, b, true); +struct ggml_tensor * ggml_cpy_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_cpy_impl(ctx, a, b, true); } -// wsp_ggml_cont +// ggml_cont -struct wsp_ggml_tensor * wsp_ggml_cont_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_cont_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, bool inplace) { bool is_node = false; @@ -6255,38 +6384,37 @@ struct wsp_ggml_tensor * wsp_ggml_cont_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); - wsp_ggml_format_name(result, "%s (cont)", a->name); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_format_name(result, "%s (cont)", a->name); - result->op = WSP_GGML_OP_CONT; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_CONT; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_cont( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_cont_impl(ctx, a, false); +struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, false); } -struct wsp_ggml_tensor * wsp_ggml_cont_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_cont_impl(ctx, a, true); +struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_cont_impl(ctx, a, true); } -// wsp_ggml_reshape +// ggml_reshape -struct wsp_ggml_tensor * wsp_ggml_reshape( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(a)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(b)); - WSP_GGML_ASSERT(wsp_ggml_nelements(a) == wsp_ggml_nelements(b)); +struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_is_contiguous(b)); + GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); bool is_node = false; @@ -6296,26 +6424,25 @@ struct wsp_ggml_tensor * wsp_ggml_reshape( if (b->grad) { // gradient propagation is not supported - //WSP_GGML_ASSERT(false); + //GGML_ASSERT(false); } - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); - wsp_ggml_format_name(result, "%s (reshaped)", a->name); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); - result->op = WSP_GGML_OP_RESHAPE; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_reshape_1d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(a)); - WSP_GGML_ASSERT(wsp_ggml_nelements(a) == ne0); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0); bool is_node = false; @@ -6324,24 +6451,23 @@ struct wsp_ggml_tensor * wsp_ggml_reshape_1d( } const int64_t ne[1] = { ne0 }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data); - wsp_ggml_format_name(result, "%s (reshaped)", a->name); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); - result->op = WSP_GGML_OP_RESHAPE; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_reshape_2d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(a)); - WSP_GGML_ASSERT(wsp_ggml_nelements(a) == ne0*ne1); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0*ne1); bool is_node = false; @@ -6350,25 +6476,24 @@ struct wsp_ggml_tensor * wsp_ggml_reshape_2d( } const int64_t ne[2] = { ne0, ne1 }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data); - wsp_ggml_format_name(result, "%s (reshaped)", a->name); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); - result->op = WSP_GGML_OP_RESHAPE; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_reshape_3d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(a)); - WSP_GGML_ASSERT(wsp_ggml_nelements(a) == ne0*ne1*ne2); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2); bool is_node = false; @@ -6377,27 +6502,25 @@ struct wsp_ggml_tensor * wsp_ggml_reshape_3d( } const int64_t ne[3] = { ne0, ne1, ne2 }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data); - wsp_ggml_format_name(result, "%s (reshaped)", a->name); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); - result->op = WSP_GGML_OP_RESHAPE; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } - -struct wsp_ggml_tensor * wsp_ggml_reshape_4d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(a)); - WSP_GGML_ASSERT(wsp_ggml_nelements(a) == ne0*ne1*ne2*ne3); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3); bool is_node = false; @@ -6406,23 +6529,21 @@ struct wsp_ggml_tensor * wsp_ggml_reshape_4d( } const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data); - wsp_ggml_format_name(result, "%s (reshaped)", a->name); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0); + ggml_format_name(result, "%s (reshaped)", a->name); - result->op = WSP_GGML_OP_RESHAPE; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_RESHAPE; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_view_1d - -struct wsp_ggml_tensor * wsp_ggml_view_1d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - int64_t ne0, +static struct ggml_tensor * ggml_view_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_dims, + const int64_t * ne, size_t offset) { bool is_node = false; @@ -6431,73 +6552,57 @@ struct wsp_ggml_tensor * wsp_ggml_view_1d( is_node = true; } - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); - wsp_ggml_format_name(result, "%s (view)", a->name); - - wsp_ggml_scratch_save(ctx); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset); + ggml_format_name(result, "%s (view)", a->name); - struct wsp_ggml_tensor * offs = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 2); - wsp_ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); + ggml_set_op_params(result, &offset, sizeof(offset)); - wsp_ggml_scratch_load(ctx); - - result->op = WSP_GGML_OP_VIEW; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; + result->op = GGML_OP_VIEW; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_view_2d +// ggml_view_1d -struct wsp_ggml_tensor * wsp_ggml_view_2d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, - int64_t ne1, - size_t nb1, size_t offset) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } + struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset); - const int64_t ne[WSP_GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; + return result; +} - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); - wsp_ggml_format_name(result, "%s (view)", a->name); +// ggml_view_2d - wsp_ggml_scratch_save(ctx); +struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, + size_t offset) { - struct wsp_ggml_tensor * offs = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 2); - wsp_ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); + const int64_t ne[2] = { ne0, ne1 }; - wsp_ggml_scratch_load(ctx); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset); result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; result->nb[3] = result->nb[2]; - result->op = WSP_GGML_OP_VIEW; - result->grad = is_node ? 
wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; - return result; } -// wsp_ggml_view_3d +// ggml_view_3d -struct wsp_ggml_tensor * wsp_ggml_view_3d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2, @@ -6505,43 +6610,22 @@ struct wsp_ggml_tensor * wsp_ggml_view_3d( size_t nb2, size_t offset) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - const int64_t ne[WSP_GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; - - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); - wsp_ggml_format_name(result, "%s (view)", a->name); - - wsp_ggml_scratch_save(ctx); - - struct wsp_ggml_tensor * offs = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 2); - wsp_ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); + const int64_t ne[3] = { ne0, ne1, ne2 }; - wsp_ggml_scratch_load(ctx); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset); result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = result->nb[2]*ne2; - result->op = WSP_GGML_OP_VIEW; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; - return result; } -// wsp_ggml_view_4d +// ggml_view_4d -struct wsp_ggml_tensor * wsp_ggml_view_4d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2, @@ -6551,58 +6635,37 @@ struct wsp_ggml_tensor * wsp_ggml_view_4d( size_t nb3, size_t offset) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - const int64_t ne[WSP_GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; - - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); - wsp_ggml_format_name(result, "%s (view)", a->name); - - wsp_ggml_scratch_save(ctx); - - struct wsp_ggml_tensor * offs = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 2); - wsp_ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - wsp_ggml_scratch_load(ctx); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset); result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = nb3; - result->op = WSP_GGML_OP_VIEW; - result->grad = is_node ? 
wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = offs; - return result; } -// wsp_ggml_permute +// ggml_permute -struct wsp_ggml_tensor * wsp_ggml_permute( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, int axis0, int axis1, int axis2, int axis3) { - WSP_GGML_ASSERT(axis0 >= 0 && axis0 < WSP_GGML_MAX_DIMS); - WSP_GGML_ASSERT(axis1 >= 0 && axis1 < WSP_GGML_MAX_DIMS); - WSP_GGML_ASSERT(axis2 >= 0 && axis2 < WSP_GGML_MAX_DIMS); - WSP_GGML_ASSERT(axis3 >= 0 && axis3 < WSP_GGML_MAX_DIMS); - - WSP_GGML_ASSERT(axis0 != axis1); - WSP_GGML_ASSERT(axis0 != axis2); - WSP_GGML_ASSERT(axis0 != axis3); - WSP_GGML_ASSERT(axis1 != axis2); - WSP_GGML_ASSERT(axis1 != axis3); - WSP_GGML_ASSERT(axis2 != axis3); + GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS); + GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS); + GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS); + GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS); + + GGML_ASSERT(axis0 != axis1); + GGML_ASSERT(axis0 != axis2); + GGML_ASSERT(axis0 != axis3); + GGML_ASSERT(axis1 != axis2); + GGML_ASSERT(axis1 != axis3); + GGML_ASSERT(axis2 != axis3); bool is_node = false; @@ -6610,11 +6673,11 @@ struct wsp_ggml_tensor * wsp_ggml_permute( is_node = true; } - struct wsp_ggml_tensor * result = wsp_ggml_view_tensor(ctx, a); - wsp_ggml_format_name(result, "%s (permuted)", a->name); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (permuted)", a->name); - int ne[WSP_GGML_MAX_DIMS]; - int nb[WSP_GGML_MAX_DIMS]; + int ne[GGML_MAX_DIMS]; + int nb[GGML_MAX_DIMS]; ne[axis0] = a->ne[0]; ne[axis1] = a->ne[1]; @@ -6636,42 +6699,29 @@ struct wsp_ggml_tensor * wsp_ggml_permute( result->nb[2] = nb[2]; result->nb[3] = nb[3]; - result->op = WSP_GGML_OP_PERMUTE; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_PERMUTE; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; - if (is_node) { - wsp_ggml_scratch_save(ctx); - - struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 4); - - ((int32_t *) b->data)[0] = axis0; - ((int32_t *) b->data)[1] = axis1; - ((int32_t *) b->data)[2] = axis2; - ((int32_t *) b->data)[3] = axis3; - - wsp_ggml_scratch_load(ctx); - - result->opt[0] = b; - } + int32_t params[] = { axis0, axis1, axis2, axis3 }; + ggml_set_op_params(result, params, sizeof(params)); return result; } -// wsp_ggml_transpose +// ggml_transpose -struct wsp_ggml_tensor * wsp_ggml_transpose( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { +struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a) { bool is_node = false; if (a->grad) { is_node = true; } - struct wsp_ggml_tensor * result = wsp_ggml_view_tensor(ctx, a); - wsp_ggml_format_name(result, "%s (transposed)", a->name); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (transposed)", a->name); result->ne[0] = a->ne[1]; result->ne[1] = a->ne[0]; @@ -6679,21 +6729,20 @@ struct wsp_ggml_tensor * wsp_ggml_transpose( result->nb[0] = a->nb[1]; result->nb[1] = a->nb[0]; - result->op = WSP_GGML_OP_TRANSPOSE; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_TRANSPOSE; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_get_rows +// ggml_get_rows -struct wsp_ggml_tensor * wsp_ggml_get_rows( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - WSP_GGML_ASSERT(wsp_ggml_is_matrix(a) && wsp_ggml_is_vector(b) && b->type == WSP_GGML_TYPE_I32); +struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); bool is_node = false; @@ -6702,26 +6751,26 @@ struct wsp_ggml_tensor * wsp_ggml_get_rows( } // TODO: implement non F32 return - //struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, a->ne[0], b->ne[0]); + //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]); - result->op = WSP_GGML_OP_GET_ROWS; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_GET_ROWS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -// wsp_ggml_get_rows_back +// ggml_get_rows_back -struct wsp_ggml_tensor * wsp_ggml_get_rows_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * c) { - WSP_GGML_ASSERT(wsp_ggml_is_matrix(a) && wsp_ggml_is_vector(b) && b->type == WSP_GGML_TYPE_I32); - WSP_GGML_ASSERT(wsp_ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); +struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0])); bool is_node = false; @@ -6730,24 +6779,24 @@ struct wsp_ggml_tensor * wsp_ggml_get_rows_back( } // TODO: implement non F32 return - //struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, c->ne[0], c->ne[1]); + //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]); - result->op = WSP_GGML_OP_GET_ROWS_BACK; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->op = GGML_OP_GET_ROWS_BACK; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } -// wsp_ggml_diag +// ggml_diag -struct wsp_ggml_tensor * wsp_ggml_diag( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - WSP_GGML_ASSERT(a->ne[1] == 1); +struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a) { + GGML_ASSERT(a->ne[1] == 1); bool is_node = false; if (a->grad) { @@ -6755,22 +6804,21 @@ struct wsp_ggml_tensor * wsp_ggml_diag( } const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne); - result->op = WSP_GGML_OP_DIAG; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_DIAG; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_diag_mask_inf +// ggml_diag_mask_inf -struct wsp_ggml_tensor * wsp_ggml_diag_mask_inf_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_diag_mask_inf_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, bool inplace) { bool is_node = false; @@ -6779,45 +6827,37 @@ struct wsp_ggml_tensor * wsp_ggml_diag_mask_inf_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); - - wsp_ggml_scratch_save(ctx); - - struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 2); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = inplace ? 1 : 0; + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - wsp_ggml_scratch_load(ctx); + int32_t params[] = { n_past }; + ggml_set_op_params(result, params, sizeof(params)); - result->op = WSP_GGML_OP_DIAG_MASK_INF; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_DIAG_MASK_INF; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_diag_mask_inf( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past) { - return wsp_ggml_diag_mask_inf_impl(ctx, a, n_past, false); + return ggml_diag_mask_inf_impl(ctx, a, n_past, false); } - -struct wsp_ggml_tensor * wsp_ggml_diag_mask_inf_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past) { - return wsp_ggml_diag_mask_inf_impl(ctx, a, n_past, true); + return ggml_diag_mask_inf_impl(ctx, a, n_past, true); } -// wsp_ggml_diag_mask_zero +// ggml_diag_mask_zero -struct wsp_ggml_tensor * wsp_ggml_diag_mask_zero_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_diag_mask_zero_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, bool inplace) { bool is_node = false; @@ -6826,45 +6866,37 @@ struct wsp_ggml_tensor * wsp_ggml_diag_mask_zero_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? 
wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); - - wsp_ggml_scratch_save(ctx); - - struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 2); - wsp_ggml_set_name(b, "n_past, inplace"); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = inplace ? 1 : 0; + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - wsp_ggml_scratch_load(ctx); + int32_t params[] = { n_past }; + ggml_set_op_params(result, params, sizeof(params)); - result->op = WSP_GGML_OP_DIAG_MASK_ZERO; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_DIAG_MASK_ZERO; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_diag_mask_zero( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past) { - return wsp_ggml_diag_mask_zero_impl(ctx, a, n_past, false); + return ggml_diag_mask_zero_impl(ctx, a, n_past, false); } -struct wsp_ggml_tensor * wsp_ggml_diag_mask_zero_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past) { - return wsp_ggml_diag_mask_zero_impl(ctx, a, n_past, true); + return ggml_diag_mask_zero_impl(ctx, a, n_past, true); } -// wsp_ggml_soft_max +// ggml_soft_max -struct wsp_ggml_tensor * wsp_ggml_soft_max_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_soft_max_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, bool inplace) { bool is_node = false; @@ -6872,35 +6904,34 @@ struct wsp_ggml_tensor * wsp_ggml_soft_max_impl( is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_SOFT_MAX; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; + result->op = GGML_OP_SOFT_MAX; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_soft_max( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_soft_max_impl(ctx, a, false); +struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_max_impl(ctx, a, false); } -struct wsp_ggml_tensor * wsp_ggml_soft_max_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a) { - return wsp_ggml_soft_max_impl(ctx, a, true); +struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_max_impl(ctx, a, true); } -// wsp_ggml_soft_max_back +// ggml_soft_max_back -struct wsp_ggml_tensor * wsp_ggml_soft_max_back_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +static struct ggml_tensor * ggml_soft_max_back_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, bool inplace) { bool is_node = false; @@ -6908,98 +6939,136 @@ struct wsp_ggml_tensor * wsp_ggml_soft_max_back_impl( is_node = true; // TODO : implement backward pass } - struct wsp_ggml_tensor * result = inplace ? 
wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_SOFT_MAX_BACK; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_SOFT_MAX_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_soft_max_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_soft_max_back_impl(ctx, a, b, false); +struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, false); } -struct wsp_ggml_tensor * wsp_ggml_soft_max_back_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - return wsp_ggml_soft_max_back_impl(ctx, a, b, true); +struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_soft_max_back_impl(ctx, a, b, true); } -// wsp_ggml_rope +// ggml_rope -struct wsp_ggml_tensor * wsp_ggml_rope_impl( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +static struct ggml_tensor * ggml_rope_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, int n_dims, int mode, int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down, bool inplace) { - WSP_GGML_ASSERT(n_past >= 0); + GGML_ASSERT(n_past >= 0); bool is_node = false; if (a->grad) { is_node = true; } - struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - wsp_ggml_scratch_save(ctx); + int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &xpos_base, sizeof(float)); + memcpy(params + 7, &xpos_down, sizeof(bool)); + ggml_set_op_params(result, params, sizeof(params)); - struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 4); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_dims; - ((int32_t *) b->data)[2] = mode; - ((int32_t *) b->data)[3] = n_ctx; - - wsp_ggml_scratch_load(ctx); - - result->op = WSP_GGML_OP_ROPE; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_ROPE; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_rope( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, int n_dims, int mode, int n_ctx) { - return wsp_ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, false); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false); } -struct wsp_ggml_tensor * wsp_ggml_rope_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, int n_dims, int mode, int n_ctx) { - return wsp_ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, true); + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true); +} + +struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false); +} + +struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale) { + return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true); +} + +struct ggml_tensor * ggml_rope_xpos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + float base, + bool down) { + return ggml_rope_impl(ctx, a, n_past, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true); } -// wsp_ggml_rope_back +// ggml_rope_back -struct wsp_ggml_tensor * wsp_ggml_rope_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, int n_dims, - int mode) { - WSP_GGML_ASSERT(n_past >= 0); - WSP_GGML_ASSERT((mode & 4) == 0 && "wsp_ggml_rope_back() for ChatGLM not implemented yet"); + int mode, + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down) { + GGML_ASSERT(n_past >= 0); + GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); bool is_node = false; @@ -7007,150 +7076,136 @@ struct wsp_ggml_tensor * wsp_ggml_rope_back( is_node = false; // TODO: implement backward } - struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, a); - - wsp_ggml_scratch_save(ctx); - - struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 3); - wsp_ggml_set_name(b, "n_past, n_dims, mode"); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_dims; - ((int32_t *) b->data)[2] = mode; + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - wsp_ggml_scratch_load(ctx); + int32_t params[8] = { n_past, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + memcpy(params + 6, &xpos_base, sizeof(float)); + memcpy(params + 7, &xpos_down, sizeof(bool)); + ggml_set_op_params(result, params, sizeof(params)); - result->op = WSP_GGML_OP_ROPE_BACK; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_ROPE_BACK; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_alibi +// ggml_alibi -struct wsp_ggml_tensor * wsp_ggml_alibi( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, int n_head, float bias_max) { - WSP_GGML_ASSERT(n_past >= 0); + GGML_ASSERT(n_past >= 0); bool is_node = false; if (a->grad) { - WSP_GGML_ASSERT(false); // TODO: implement backward + GGML_ASSERT(false); // TODO: implement backward is_node = true; } // TODO: when implement backward, fix this: - //struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); - struct wsp_ggml_tensor * result = wsp_ggml_view_tensor(ctx, a); + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); - wsp_ggml_scratch_save(ctx); + int32_t op_params[3] = { n_past, n_head }; + memcpy(op_params + 2, &bias_max, sizeof(float)); + ggml_set_op_params(result, op_params, sizeof(op_params)); - struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 3); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_head; - WSP_GGML_ASSERT(sizeof(float) == sizeof(int32_t)); - (((float *) b->data)[2]) = bias_max; - - wsp_ggml_scratch_load(ctx); - - result->op = WSP_GGML_OP_ALIBI; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_ALIBI; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_clamp +// ggml_clamp -struct wsp_ggml_tensor * wsp_ggml_clamp( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, float min, float max) { bool is_node = false; if (a->grad) { - WSP_GGML_ASSERT(false); // TODO: implement backward + GGML_ASSERT(false); // TODO: implement backward is_node = true; } // TODO: when implement backward, fix this: - struct wsp_ggml_tensor * result = wsp_ggml_view_tensor(ctx, a); - - wsp_ggml_scratch_save(ctx); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); - struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 2); + float params[] = { min, max }; + ggml_set_op_params(result, params, sizeof(params)); - ((float *) b->data)[0] = min; - ((float *) b->data)[1] = max; - - wsp_ggml_scratch_load(ctx); - - result->op = WSP_GGML_OP_CLAMP; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_CLAMP; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_conv_1d +// ggml_conv_1d -static int64_t wsp_ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { +static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, int s0, int p0, int d0) { - WSP_GGML_ASSERT(wsp_ggml_is_matrix(b)); - WSP_GGML_ASSERT(a->ne[1] == b->ne[1]); + GGML_ASSERT(ggml_is_matrix(b)); + GGML_ASSERT(a->ne[1] == b->ne[1]); bool is_node = false; if (a->grad || b->grad) { - WSP_GGML_ASSERT(false); // TODO: implement backward + GGML_ASSERT(false); // TODO: implement backward is_node = true; } const int64_t ne[4] = { - wsp_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), a->ne[2], 1, 1, }; - struct wsp_ggml_tensor* result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 2, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - wsp_ggml_scratch_save(ctx); - struct wsp_ggml_tensor* c = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 3); - ((int32_t*)c->data)[0] = s0; - ((int32_t*)c->data)[1] = p0; - ((int32_t*)c->data)[2] = d0; - wsp_ggml_scratch_load(ctx); + int32_t params[] = { s0, p0, d0 }; + ggml_set_op_params(result, params, sizeof(params)); - result->op = WSP_GGML_OP_CONV_1D; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->op = GGML_OP_CONV_1D; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -// wsp_ggml_conv_2d +// ggml_conv_1d_ph + +struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d) { + return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); +} + +// ggml_conv_2d -struct wsp_ggml_tensor* wsp_ggml_conv_2d( - struct wsp_ggml_context* ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, int s0, int s1, int p0, @@ -7158,198 +7213,348 @@ struct wsp_ggml_tensor* wsp_ggml_conv_2d( int d0, int d1) { - WSP_GGML_ASSERT(b->ne[3] == 1); - WSP_GGML_ASSERT(a->ne[2] == b->ne[2]); + GGML_ASSERT(a->ne[2] == b->ne[2]); bool is_node = false; if (a->grad || b->grad) { - WSP_GGML_ASSERT(false); // TODO: implement backward + GGML_ASSERT(false); // TODO: implement backward is_node = true; } const int64_t ne[4] = { - wsp_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), - wsp_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), - a->ne[3], 1, + ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), + ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), + a->ne[3], b->ne[3], }; - struct wsp_ggml_tensor* result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 4, ne); - - wsp_ggml_scratch_save(ctx); - struct wsp_ggml_tensor* c = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 6); - ((int32_t*)c->data)[0] = s0; - ((int32_t*)c->data)[1] = s1; - ((int32_t*)c->data)[2] = p0; - ((int32_t*)c->data)[3] = p1; - ((int32_t*)c->data)[4] = d0; - ((int32_t*)c->data)[5] = d1; - wsp_ggml_scratch_load(ctx); - - result->op = WSP_GGML_OP_CONV_2D; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - return result; + int32_t params[] = { s0, s1, p0, p1, d0, d1 }; + ggml_set_op_params(result, params, sizeof(params)); -} + result->op = GGML_OP_CONV_2D; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; -// wsp_ggml_conv_1d_ph + return result; -struct wsp_ggml_tensor* wsp_ggml_conv_1d_ph( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - int s, - int d) { - return wsp_ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } -// wsp_ggml_flash_attn +// ggml_conv_2d_sk_p0 -struct wsp_ggml_tensor * wsp_ggml_flash_attn( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * q, - struct wsp_ggml_tensor * k, - struct wsp_ggml_tensor * v, - bool masked) { - WSP_GGML_ASSERT(wsp_ggml_can_mul_mat(k, q)); - // TODO: check if vT can be multiplied by (k*qT) - - bool is_node = false; +struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1); +} - if (q->grad || k->grad || v->grad) { - is_node = true; - } +// ggml_conv_2d_s1_ph - //struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, q); - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 4, q->ne); +struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1); +} - result->op = WSP_GGML_OP_FLASH_ATTN; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = wsp_ggml_new_i32(ctx, masked ? 1 : 0); +// ggml_conv_transpose_2d_p0 - return result; +static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { + return (ins - 1) * s - 2 * p + ks; } -// wsp_ggml_flash_ff - -struct wsp_ggml_tensor * wsp_ggml_flash_ff( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b0, - struct wsp_ggml_tensor * b1, - struct wsp_ggml_tensor * c0, - struct wsp_ggml_tensor * c1) { - WSP_GGML_ASSERT(wsp_ggml_can_mul_mat(b0, a)); - // TODO: more checks +struct ggml_tensor * ggml_conv_transpose_2d_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride) { + GGML_ASSERT(a->ne[3] == b->ne[2]); bool is_node = false; - if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward is_node = true; } - //struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, a); - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 4, a->ne); + const int64_t ne[4] = { + ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/), + ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/), + a->ne[2], b->ne[3], + }; + + struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + ggml_set_op_params_i32(result, 0, stride); - result->op = WSP_GGML_OP_FLASH_FF; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b0; - result->opt[0] = b1; - result->opt[1] = c0; - result->opt[2] = c1; + result->op = GGML_OP_CONV_TRANSPOSE_2D; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -// wsp_ggml_flash_attn_back - -struct wsp_ggml_tensor * wsp_ggml_flash_attn_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * q, - struct wsp_ggml_tensor * k, - struct wsp_ggml_tensor * v, - struct wsp_ggml_tensor * d, - bool masked) { - WSP_GGML_ASSERT(wsp_ggml_can_mul_mat(k, q)); - // TODO: check if vT can be multiplied by (k*qT) +// ggml_pool_* - // d shape [D,N,ne2,ne3] - // q shape [D,N,ne2,ne3] - // k shape [D,M,ne2,ne3] - // v shape [M,D,ne2,ne3] +static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { + return (ins + 2 * p - ks) / s + 1; +} - const int64_t D = q->ne[0]; - const int64_t N = q->ne[1]; - const int64_t M = k->ne[1]; - const int64_t ne2 = q->ne[2]; - const int64_t ne3 = q->ne[3]; +// ggml_pool_1d - WSP_GGML_ASSERT(k->ne[0] == D); - WSP_GGML_ASSERT(v->ne[0] == M); - WSP_GGML_ASSERT(v->ne[1] == D); - WSP_GGML_ASSERT(d->ne[0] == D); - WSP_GGML_ASSERT(d->ne[1] == N); - WSP_GGML_ASSERT(k->ne[2] == ne2); - WSP_GGML_ASSERT(k->ne[3] == ne3); - WSP_GGML_ASSERT(v->ne[2] == ne2); - WSP_GGML_ASSERT(v->ne[3] == ne3); - WSP_GGML_ASSERT(d->ne[2] == ne2); - WSP_GGML_ASSERT(d->ne[3] == ne3); +struct ggml_tensor * ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int s0, + int p0) { bool is_node = false; - if (q->grad || k->grad || v->grad) { - // when using this operation (in backwards pass) these grads are set. - // we don't want to create (big) grad of our result, so is_node is false. - is_node = false; + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; } - // store gradients of q, k and v as continuous tensors concatenated in result. - // q shape[D,N,ne2,ne3] ; k shape [D,M,ne2,ne3] ; v shape [M,D,ne2,ne3] - // gradq->data = result->data - // gradk->data = result->data + nb0*D*N*ne2*ne3 - // gradv->data = result->data + nb0*D*N*ne2*ne3 + nb0*D*M*ne2*ne3 - // note: v and gradv are actually transposed, i.e. v->ne[0] != D. - int64_t ne[4] = {D,M+N+M,ne2,ne3}; + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + a->ne[1], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 4, ne); + int32_t params[] = { op, k0, s0, p0 }; + ggml_set_op_params(result, params, sizeof(params)); - result->op = WSP_GGML_OP_FLASH_ATTN_BACK; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = d; - result->opt[2] = wsp_ggml_new_i32(ctx, masked ? 1 : 0); + result->op = GGML_OP_POOL_1D; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_win_part +// ggml_pool_2d -struct wsp_ggml_tensor * wsp_ggml_win_part( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - int w) { - WSP_GGML_ASSERT(a->ne[3] == 1); - WSP_GGML_ASSERT(a->type == WSP_GGML_TYPE_F32); +struct ggml_tensor * ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1) { bool is_node = false; if (a->grad) { - WSP_GGML_ASSERT(false); // TODO: implement backward + GGML_ASSERT(false); // TODO: implement backward is_node = true; } - // padding - const int px = (w - a->ne[1]%w)%w; - const int py = (w - a->ne[2]%w)%w; + const int64_t ne[3] = { + ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), + ggml_calc_pool_output_size(a->ne[1], k1, s1, p1), + a->ne[2], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + + int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_POOL_2D; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +// ggml_upscale + +static struct ggml_tensor * ggml_upscale_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor) { + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, + a->ne[0] * scale_factor, + a->ne[1] * scale_factor, + a->ne[2], a->ne[3]); + + result->op = GGML_OP_UPSCALE; + result->op_params[0] = scale_factor; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; + + return result; +} + +struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor) { + return ggml_upscale_impl(ctx, a, scale_factor); +} + +// ggml_flash_attn + +struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + bool masked) { + GGML_ASSERT(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + is_node = true; + } + + //struct ggml_tensor * result = ggml_dup_tensor(ctx, q); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne); + + int32_t t = masked ? 1 : 0; + ggml_set_op_params(result, &t, sizeof(t)); + + result->op = GGML_OP_FLASH_ATTN; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + + return result; +} + +// ggml_flash_ff + +struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1) { + GGML_ASSERT(ggml_can_mul_mat(b0, a)); + // TODO: more checks + + bool is_node = false; + + if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { + is_node = true; + } + + //struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne); + + result->op = GGML_OP_FLASH_FF; + result->grad = is_node ? 
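/*
 * Illustrative sketch (not part of the patch) of the pooling/upscale ops
 * introduced above, assuming ggml.h is included and that GGML_OP_POOL_MAX is
 * one of the enum ggml_op_pool values it declares.
 */
static struct ggml_tensor * pool_then_upscale(
        struct ggml_context * ctx,
        struct ggml_tensor  * x) {          // F32 tensor of shape [W, H, C]
    // 2x2 max pooling, stride 2, no padding: halves W and H (for even sizes)
    struct ggml_tensor * pooled = ggml_pool_2d(ctx, x, GGML_OP_POOL_MAX,
                                               2, 2, 2, 2, 0, 0);
    // upscale by 2 restores the original spatial extent
    return ggml_upscale(ctx, pooled, 2);
}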
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b0; + result->src[2] = b1; + result->src[3] = c0; + result->src[4] = c1; + + return result; +} + +// ggml_flash_attn_back + +struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked) { + GGML_ASSERT(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + + // d shape [D,N,ne2,ne3] + // q shape [D,N,ne2,ne3] + // k shape [D,M,ne2,ne3] + // v shape [M,D,ne2,ne3] + + const int64_t D = q->ne[0]; + const int64_t N = q->ne[1]; + const int64_t M = k->ne[1]; + const int64_t ne2 = q->ne[2]; + const int64_t ne3 = q->ne[3]; + + GGML_ASSERT(k->ne[0] == D); + GGML_ASSERT(v->ne[0] == M); + GGML_ASSERT(v->ne[1] == D); + GGML_ASSERT(d->ne[0] == D); + GGML_ASSERT(d->ne[1] == N); + GGML_ASSERT(k->ne[2] == ne2); + GGML_ASSERT(k->ne[3] == ne3); + GGML_ASSERT(v->ne[2] == ne2); + GGML_ASSERT(v->ne[3] == ne3); + GGML_ASSERT(d->ne[2] == ne2); + GGML_ASSERT(d->ne[3] == ne3); + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + // when using this operation (in backwards pass) these grads are set. + // we don't want to create (big) grad of our result, so is_node is false. + is_node = false; + } + + // store gradients of q, k and v as continuous tensors concatenated in result. + // q shape[D,N,ne2,ne3] ; k shape [D,M,ne2,ne3] ; v shape [M,D,ne2,ne3] + // gradq->data = result->data + // gradk->data = result->data + nb0*D*N*ne2*ne3 + // gradv->data = result->data + nb0*D*N*ne2*ne3 + nb0*D*M*ne2*ne3 + // note: v and gradv are actually transposed, i.e. v->ne[0] != D. + int64_t ne[4] = {D,M+N+M,ne2,ne3}; + + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + int32_t masked_i = masked ? 1 : 0; + ggml_set_op_params(result, &masked_i, sizeof(masked_i)); + + result->op = GGML_OP_FLASH_ATTN_BACK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = d; + + return result; +} + +// ggml_win_part + +struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w) { + GGML_ASSERT(a->ne[3] == 1); + GGML_ASSERT(a->type == GGML_TYPE_F32); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // padding + const int px = (w - a->ne[1]%w)%w; + const int py = (w - a->ne[2]%w)%w; const int npx = (px + a->ne[1])/w; const int npy = (py + a->ne[2])/w; @@ -7357,117 +7562,214 @@ struct wsp_ggml_tensor * wsp_ggml_win_part( const int64_t ne[4] = { a->ne[0], w, w, np, }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 4, ne); - - wsp_ggml_scratch_save(ctx); - - struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 3); - - ((int32_t *) b->data)[0] = npx; - ((int32_t *) b->data)[1] = npy; - ((int32_t *) b->data)[2] = w; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - wsp_ggml_scratch_load(ctx); + int32_t params[] = { npx, npy, w }; + ggml_set_op_params(result, params, sizeof(params)); - result->op = WSP_GGML_OP_WIN_PART; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = b; + result->op = GGML_OP_WIN_PART; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -// wsp_ggml_win_unpart +// ggml_win_unpart -struct wsp_ggml_tensor * wsp_ggml_win_unpart( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, +struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, int w0, int h0, int w) { - WSP_GGML_ASSERT(a->type == WSP_GGML_TYPE_F32); + GGML_ASSERT(a->type == GGML_TYPE_F32); bool is_node = false; if (a->grad) { - WSP_GGML_ASSERT(false); // TODO: implement backward + GGML_ASSERT(false); // TODO: implement backward is_node = true; } const int64_t ne[4] = { a->ne[0], w0, h0, 1, }; - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 3, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - wsp_ggml_scratch_save(ctx); + int32_t params[] = { w }; + ggml_set_op_params(result, params, sizeof(params)); - struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 1); + result->op = GGML_OP_WIN_UNPART; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; - ((int32_t *) b->data)[0] = w; + return result; +} + +// ggml_get_rel_pos + +struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh) { + GGML_ASSERT(qh == kh); + GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } - wsp_ggml_scratch_load(ctx); + const int64_t ne[4] = { a->ne[0], kh, qh, 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne); - result->op = WSP_GGML_OP_WIN_UNPART; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - result->opt[0] = b; + result->op = GGML_OP_GET_REL_POS; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = NULL; return result; } -// wsp_ggml_map_unary +// ggml_add_rel_pos + +static struct ggml_tensor * ggml_add_rel_pos_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph, + bool inplace) { + GGML_ASSERT(ggml_are_same_shape(pw, ph)); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_is_contiguous(pw)); + GGML_ASSERT(ggml_is_contiguous(ph)); + GGML_ASSERT(ph->type == GGML_TYPE_F32); + GGML_ASSERT(pw->type == GGML_TYPE_F32); + GGML_ASSERT(pw->ne[3] == a->ne[2]); + GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]); + GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]); -struct wsp_ggml_tensor * wsp_ggml_map_unary_impl_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - const wsp_ggml_unary_op_f32_t fun, - bool inplace) { bool is_node = false; - if (!inplace && a->grad) { + if (!inplace && (a->grad || pw->grad || ph->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_set_op_params_i32(result, 0, inplace ? 1 : 0); + + result->op = GGML_OP_ADD_REL_POS; + result->grad = is_node ? 
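/*
 * Illustrative sketch (not part of the patch): ggml_win_part splits an F32
 * tensor of shape [C, W, H, 1] into w x w windows (padding W and H up to a
 * multiple of w), and ggml_win_unpart reverses the partitioning when given
 * the original W and H. Assumes ggml.h is included.
 */
static struct ggml_tensor * window_round_trip(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,   // F32, shape [C, W, H, 1]
        int w) {
    struct ggml_tensor * parts = ggml_win_part(ctx, a, w);                   // [C, w, w, npx*npy]
    return ggml_win_unpart(ctx, parts, (int) a->ne[1], (int) a->ne[2], w);   // back to [C, W, H, 1]
}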
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = pw; + result->src[2] = ph; + + return result; +} + + +struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph) { + return ggml_add_rel_pos_impl(ctx, a, pw, ph, false); +} + +struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph) { + return ggml_add_rel_pos_impl(ctx, a, pw, ph, true); +} + +// gmml_unary + +static struct ggml_tensor * ggml_unary_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { is_node = true; } - struct wsp_ggml_tensor *result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params_i32(result, 0, (int32_t) op); + + result->op = GGML_OP_UNARY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_unary( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op) { + return ggml_unary_impl(ctx, a, op, false); +} + +struct ggml_tensor * ggml_unary_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op) { + return ggml_unary_impl(ctx, a, op, true); +} + +// ggml_map_unary + +static struct ggml_tensor * ggml_map_unary_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun, + bool inplace) { + bool is_node = false; - wsp_ggml_scratch_save(ctx); + if (!inplace && a->grad) { + is_node = true; + } - struct wsp_ggml_tensor * addr_tensor = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - wsp_ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = WSP_GGML_OP_MAP_UNARY; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; + result->op = GGML_OP_MAP_UNARY; + result->grad = is_node ? 
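/*
 * Illustrative sketch (not part of the patch): with the GGML_OP_UNARY node
 * added above, element-wise functions are selected by enum ggml_unary_op and
 * stored in op_params. GGML_UNARY_OP_GELU is assumed to be one of those enum
 * values from ggml.h.
 */
static struct ggml_tensor * gelu_node(
        struct ggml_context * ctx,
        struct ggml_tensor  * x) {
    // out-of-place: the result has the same shape and type as x
    return ggml_unary(ctx, x, GGML_UNARY_OP_GELU);
}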
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_map_unary_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - const wsp_ggml_unary_op_f32_t fun) { - return wsp_ggml_map_unary_impl_f32(ctx, a, fun, false); +struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun) { + return ggml_map_unary_impl_f32(ctx, a, fun, false); } -struct wsp_ggml_tensor * wsp_ggml_map_unary_inplace_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - const wsp_ggml_unary_op_f32_t fun) { - return wsp_ggml_map_unary_impl_f32(ctx, a, fun, true); +struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_unary_op_f32_t fun) { + return ggml_map_unary_impl_f32(ctx, a, fun, true); } -// wsp_ggml_map_binary +// ggml_map_binary -struct wsp_ggml_tensor * wsp_ggml_map_binary_impl_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - const wsp_ggml_binary_op_f32_t fun, +static struct ggml_tensor * ggml_map_binary_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun, bool inplace) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_are_same_shape(a, b)); bool is_node = false; @@ -7475,46 +7777,40 @@ struct wsp_ggml_tensor * wsp_ggml_map_binary_impl_f32( is_node = true; } - struct wsp_ggml_tensor *result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - wsp_ggml_scratch_save(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - struct wsp_ggml_tensor * addr_tensor = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - wsp_ggml_scratch_load(ctx); - - result->op = WSP_GGML_OP_MAP_BINARY; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; + result->op = GGML_OP_MAP_BINARY; + result->grad = is_node ? 
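/*
 * Illustrative sketch (not part of the patch): the rewritten
 * ggml_map_unary_impl_f32 above stores the callback pointer in op_params
 * instead of a scratch I32 tensor. The callback below follows the
 * ggml_unary_op_f32_t typedef from ggml.h, i.e. (n, dst, src).
 */
static void square_f32(const int n, float * dst, const float * src) {
    for (int i = 0; i < n; ++i) {
        dst[i] = src[i] * src[i];   // element-wise square
    }
}
// usage, assuming a context `ctx` and a contiguous F32 tensor `x`:
//   struct ggml_tensor * y = ggml_map_unary_f32(ctx, x, square_f32);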
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_map_binary_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - const wsp_ggml_binary_op_f32_t fun) { - return wsp_ggml_map_binary_impl_f32(ctx, a, b, fun, false); +struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun) { + return ggml_map_binary_impl_f32(ctx, a, b, fun, false); } -struct wsp_ggml_tensor * wsp_ggml_map_binary_inplace_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - const wsp_ggml_binary_op_f32_t fun) { - return wsp_ggml_map_binary_impl_f32(ctx, a, b, fun, true); +struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_binary_op_f32_t fun) { + return ggml_map_binary_impl_f32(ctx, a, b, fun, true); } -// wsp_ggml_map_custom1 +// ggml_map_custom1_f32 -struct wsp_ggml_tensor * wsp_ggml_map_custom1_impl_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - const wsp_ggml_custom1_op_f32_t fun, +static struct ggml_tensor * ggml_map_custom1_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun, bool inplace) { bool is_node = false; @@ -7522,203 +7818,369 @@ struct wsp_ggml_tensor * wsp_ggml_map_custom1_impl_f32( is_node = true; } - struct wsp_ggml_tensor *result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_CUSTOM1_F32; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun) { + return ggml_map_custom1_impl_f32(ctx, a, fun, false); +} + +struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_f32_t fun) { + return ggml_map_custom1_impl_f32(ctx, a, fun, true); +} + +// ggml_map_custom2_f32 - wsp_ggml_scratch_save(ctx); +static struct ggml_tensor * ggml_map_custom2_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad || b->grad)) { + is_node = true; + } - struct wsp_ggml_tensor * addr_tensor = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - wsp_ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); - result->op = WSP_GGML_OP_MAP_CUSTOM1; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; + result->op = GGML_OP_MAP_CUSTOM2_F32; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -struct wsp_ggml_tensor * wsp_ggml_map_custom1_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - const wsp_ggml_custom1_op_f32_t fun) { - return wsp_ggml_map_custom1_impl_f32(ctx, a, fun, false); +struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun) { + return ggml_map_custom2_impl_f32(ctx, a, b, fun, false); } -struct wsp_ggml_tensor * wsp_ggml_map_custom1_inplace_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - const wsp_ggml_custom1_op_f32_t fun) { - return wsp_ggml_map_custom1_impl_f32(ctx, a, fun, true); +struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_f32_t fun) { + return ggml_map_custom2_impl_f32(ctx, a, b, fun, true); } -// wsp_ggml_map_custom2 +// ggml_map_custom3_f32 -struct wsp_ggml_tensor * wsp_ggml_map_custom2_impl_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - const wsp_ggml_custom2_op_f32_t fun, +static struct ggml_tensor * ggml_map_custom3_impl_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun, bool inplace) { bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (!inplace && (a->grad || b->grad || c->grad)) { is_node = true; } - struct wsp_ggml_tensor *result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); + + result->op = GGML_OP_MAP_CUSTOM3_F32; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; + + return result; +} + +struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun) { + return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false); +} + +struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_f32_t fun) { + return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true); +} + +// ggml_map_custom1 +struct ggml_map_custom1_op_params { + ggml_custom1_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom1_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); - wsp_ggml_scratch_save(ctx); + bool is_node = false; + + if (!inplace && a->grad) { + is_node = true; + } - struct wsp_ggml_tensor * addr_tensor = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; + struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - wsp_ggml_scratch_load(ctx); + struct ggml_map_custom1_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); - result->op = WSP_GGML_OP_MAP_CUSTOM2; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; + result->op = GGML_OP_MAP_CUSTOM1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; return result; } -struct wsp_ggml_tensor * wsp_ggml_map_custom2_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - const wsp_ggml_custom2_op_f32_t fun) { - return wsp_ggml_map_custom2_impl_f32(ctx, a, b, fun, false); +struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false); } -struct wsp_ggml_tensor * wsp_ggml_map_custom2_inplace_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - const wsp_ggml_custom2_op_f32_t fun) { - return wsp_ggml_map_custom2_impl_f32(ctx, a, b, fun, true); +struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + const ggml_custom1_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true); } -// wsp_ggml_map_custom3 +// ggml_map_custom2 + +struct ggml_map_custom2_op_params { + ggml_custom2_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom2_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); -struct wsp_ggml_tensor * wsp_ggml_map_custom3_impl_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * c, - const wsp_ggml_custom3_op_f32_t fun, - bool inplace) { bool is_node = false; - if (!inplace && (a->grad || b->grad || c->grad)) { + if (!inplace && (a->grad || b->grad)) { is_node = true; } - struct wsp_ggml_tensor *result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + struct ggml_map_custom2_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); + + result->op = GGML_OP_MAP_CUSTOM2; + result->grad = is_node ? 
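/*
 * Illustrative sketch (not part of the patch): the new ggml_map_custom1 above
 * carries the callback, the requested thread count and a userdata pointer in
 * struct ggml_map_custom1_op_params. The callback below follows the
 * ggml_custom1_op_t typedef from ggml.h, i.e. (dst, src, ith, nth, userdata);
 * it must split its own work across the nth threads.
 */
static void scale_f32(struct ggml_tensor * dst, const struct ggml_tensor * src,
                      int ith, int nth, void * userdata) {
    const float   scale = *(const float *) userdata;
    const int64_t n     = ggml_nelements(dst);
    // this thread handles the element range [i0, i1) of a contiguous F32 tensor
    const int64_t per   = (n + nth - 1) / nth;
    const int64_t i0    = per * ith;
    const int64_t i1    = (i0 + per < n) ? (i0 + per) : n;
    const float * s = (const float *) src->data;
    float       * d = (float *)       dst->data;
    for (int64_t i = i0; i < i1; ++i) {
        d[i] = scale * s[i];
    }
}
// usage, assuming `factor` is a float that stays alive until the graph is computed:
//   struct ggml_tensor * y = ggml_map_custom1(ctx, x, scale_f32, GGML_N_TASKS_MAX, &factor);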
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false); +} + +struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const ggml_custom2_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true); +} + +// ggml_map_custom3 + +struct ggml_map_custom3_op_params { + ggml_custom3_op_t fun; + int n_tasks; + void * userdata; +}; + +static struct ggml_tensor * ggml_map_custom3_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata, + bool inplace) { + GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0); + + bool is_node = false; - wsp_ggml_scratch_save(ctx); + if (!inplace && (a->grad || b->grad || c->grad)) { + is_node = true; + } - struct wsp_ggml_tensor * addr_tensor = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - wsp_ggml_scratch_load(ctx); + struct ggml_map_custom3_op_params params = { + /*.fun =*/ fun, + /*.n_tasks =*/ n_tasks, + /*.userdata =*/ userdata + }; + ggml_set_op_params(result, (const void *) ¶ms, sizeof(params)); - result->op = WSP_GGML_OP_MAP_CUSTOM3; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; - result->opt[1] = c; + result->op = GGML_OP_MAP_CUSTOM3; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } -struct wsp_ggml_tensor * wsp_ggml_map_custom3_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * c, - const wsp_ggml_custom3_op_f32_t fun) { - return wsp_ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false); +struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false); } -struct wsp_ggml_tensor * wsp_ggml_map_custom3_inplace_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * c, - const wsp_ggml_custom3_op_f32_t fun) { - return wsp_ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true); +struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + const ggml_custom3_op_t fun, + int n_tasks, + void * userdata) { + return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); } -// wsp_ggml_cross_entropy_loss -struct wsp_ggml_tensor * wsp_ggml_cross_entropy_loss( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(a, b)); + +// ggml_cross_entropy_loss + +struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(ggml_are_same_shape(a, b)); bool is_node = false; if (a->grad || b->grad) { is_node = true; } - struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_1d(ctx, a->type, 1); + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); - result->op = WSP_GGML_OP_CROSS_ENTROPY_LOSS; - result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; + result->op = GGML_OP_CROSS_ENTROPY_LOSS; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; return result; } -// wsp_ggml_cross_entropy_loss_back +// ggml_cross_entropy_loss_back -struct wsp_ggml_tensor * wsp_ggml_cross_entropy_loss_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * c) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(a, b)); - WSP_GGML_ASSERT(wsp_ggml_is_scalar(c)); +struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_scalar(c)); - struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - result->op = WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK; + result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK; result->grad = NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = c; + result->src[0] = a; + result->src[1] = b; + result->src[2] = c; return result; } //////////////////////////////////////////////////////////////////////////////// -void wsp_ggml_set_param( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * tensor) { +void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor) { tensor->is_param = true; - WSP_GGML_ASSERT(tensor->grad == NULL); - tensor->grad = wsp_ggml_dup_tensor(ctx, tensor); + GGML_ASSERT(tensor->grad == NULL); + tensor->grad = ggml_dup_tensor(ctx, tensor); } -// wsp_ggml_compute_forward_dup +// ggml_compute_forward_dup -static void wsp_ggml_compute_forward_dup_same_cont( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_nelements(dst) == wsp_ggml_nelements(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst) && wsp_ggml_is_contiguous(src0)); - WSP_GGML_ASSERT(src0->type == dst->type); +static void ggml_compute_forward_dup_same_cont( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + GGML_ASSERT(src0->type == dst->type); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -7729,7 +8191,7 @@ static void wsp_ggml_compute_forward_dup_same_cont( const int nth = params->nth; // number of threads // parallelize by elements - const int ne = wsp_ggml_nelements(dst); + const int ne = ggml_nelements(dst); const int dr = (ne + nth - 1) / nth; const int ie0 = dr * ith; const int ie1 = MIN(ie0 + dr, ne); @@ -7738,27 +8200,27 @@ static void wsp_ggml_compute_forward_dup_same_cont( memcpy( ((char *) dst->data + ie0*nb0), ((char *) src0->data + ie0*nb00), - (ie1 - ie0) * WSP_GGML_TYPE_SIZE[src0->type]); + (ie1 - ie0) * ggml_type_size(src0->type)); } } -static void wsp_ggml_compute_forward_dup_f16( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_nelements(dst) == wsp_ggml_nelements(src0)); +static void ggml_compute_forward_dup_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - if (params->type == WSP_GGML_TASK_INIT || 
params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - if (wsp_ggml_is_contiguous(src0) && wsp_ggml_is_contiguous(dst) && src0->type == dst->type) { - wsp_ggml_compute_forward_dup_same_cont(params, src0, dst); + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + ggml_compute_forward_dup_same_cont(params, src0, dst); return; } @@ -7772,7 +8234,7 @@ static void wsp_ggml_compute_forward_dup_f16( if (src0->type == dst->type && ne00 == ne0 && - nb00 == WSP_GGML_TYPE_SIZE[src0->type] && nb0 == WSP_GGML_TYPE_SIZE[dst->type]) { + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -7790,9 +8252,9 @@ static void wsp_ggml_compute_forward_dup_f16( // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy - if (wsp_ggml_is_contiguous(dst)) { - if (nb00 == sizeof(wsp_ggml_fp16_t)) { - if (dst->type == WSP_GGML_TYPE_F16) { + if (ggml_is_contiguous(dst)) { + if (nb00 == sizeof(ggml_fp16_t)) { + if (dst->type == GGML_TYPE_F16) { size_t id = 0; const size_t rs = ne00 * nb00; char * dst_ptr = (char *) dst->data; @@ -7808,7 +8270,7 @@ static void wsp_ggml_compute_forward_dup_f16( id += rs * (ne01 - ir1); } } - } else if (dst->type == WSP_GGML_TYPE_F32) { + } else if (dst->type == GGML_TYPE_F32) { size_t id = 0; float * dst_ptr = (float *) dst->data; @@ -7816,31 +8278,31 @@ static void wsp_ggml_compute_forward_dup_f16( for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - dst_ptr[id] = WSP_GGML_FP16_TO_FP32(src0_ptr[i00]); + dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); id++; } } id += ne00 * (ne01 - ir1); } } - } else if (wsp_ggml_is_quantized(dst->type)) { - quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; size_t id = 0; - size_t rs = nb0 * (ne00 / WSP_GGML_BLCK_SIZE[dst->type]); + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { - const wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - src0_f32[i00] = WSP_GGML_FP16_TO_FP32(src0_ptr[i00]); + src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); } quantize_row_q(src0_f32, dst_ptr + id, ne00); @@ -7850,12 +8312,12 @@ static void wsp_ggml_compute_forward_dup_f16( } } } else { - WSP_GGML_ASSERT(false); // TODO: implement + GGML_ASSERT(false); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); - if 
(dst->type == WSP_GGML_TYPE_F32) { + if (dst->type == GGML_TYPE_F32) { size_t id = 0; float * dst_ptr = (float *) dst->data; @@ -7864,25 +8326,25 @@ static void wsp_ggml_compute_forward_dup_f16( id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = WSP_GGML_FP16_TO_FP32(*src0_ptr); + dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); id++; } } id += ne00 * (ne01 - ir1); } } - } else if (dst->type == WSP_GGML_TYPE_F16) { + } else if (dst->type == GGML_TYPE_F16) { size_t id = 0; - wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) dst->data; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += ne00 * ir0; for (int i01 = ir0; i01 < ir1; i01++) { for (int i00 = 0; i00 < ne00; i00++) { - const wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); dst_ptr[id] = *src0_ptr; id++; @@ -7892,7 +8354,7 @@ static void wsp_ggml_compute_forward_dup_f16( } } } else { - WSP_GGML_ASSERT(false); // TODO: implement + GGML_ASSERT(false); // TODO: implement } } return; @@ -7904,7 +8366,7 @@ static void wsp_ggml_compute_forward_dup_f16( int64_t i12 = 0; int64_t i13 = 0; - if (dst->type == WSP_GGML_TYPE_F16) { + if (dst->type == GGML_TYPE_F16) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { i10 += ne00 * ir0; @@ -7925,7 +8387,7 @@ static void wsp_ggml_compute_forward_dup_f16( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - memcpy(dst_ptr, src0_ptr, sizeof(wsp_ggml_fp16_t)); + memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); if (++i10 == ne00) { i10 = 0; @@ -7956,7 +8418,7 @@ static void wsp_ggml_compute_forward_dup_f16( } } } - } else if (dst->type == WSP_GGML_TYPE_F32) { + } else if (dst->type == GGML_TYPE_F32) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { i10 += ne00 * ir0; @@ -7977,7 +8439,7 @@ static void wsp_ggml_compute_forward_dup_f16( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(float *) dst_ptr = WSP_GGML_FP16_TO_FP32(*(const wsp_ggml_fp16_t *) src0_ptr); + *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -8009,27 +8471,27 @@ static void wsp_ggml_compute_forward_dup_f16( } } } else { - WSP_GGML_ASSERT(false); // TODO: implement + GGML_ASSERT(false); // TODO: implement } } -static void wsp_ggml_compute_forward_dup_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_nelements(dst) == wsp_ggml_nelements(src0)); +static void ggml_compute_forward_dup_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - if (params->type == WSP_GGML_TASK_INIT || params->type == 
WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - if (wsp_ggml_is_contiguous(src0) && wsp_ggml_is_contiguous(dst) && src0->type == dst->type) { - wsp_ggml_compute_forward_dup_same_cont(params, src0, dst); + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + ggml_compute_forward_dup_same_cont(params, src0, dst); return; } @@ -8043,7 +8505,7 @@ static void wsp_ggml_compute_forward_dup_f32( if (src0->type == dst->type && ne00 == ne0 && - nb00 == WSP_GGML_TYPE_SIZE[src0->type] && nb0 == WSP_GGML_TYPE_SIZE[dst->type]) { + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -8059,10 +8521,10 @@ static void wsp_ggml_compute_forward_dup_f32( return; } - if (wsp_ggml_is_contiguous(dst)) { + if (ggml_is_contiguous(dst)) { // TODO: simplify if (nb00 == sizeof(float)) { - if (dst->type == WSP_GGML_TYPE_F32) { + if (dst->type == GGML_TYPE_F32) { size_t id = 0; const size_t rs = ne00 * nb00; char * dst_ptr = (char *) dst->data; @@ -8078,29 +8540,11 @@ static void wsp_ggml_compute_forward_dup_f32( id += rs * (ne01 - ir1); } } - } else if (dst->type == WSP_GGML_TYPE_F16) { - size_t id = 0; - wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = WSP_GGML_FP32_TO_FP16(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (wsp_ggml_is_quantized(dst->type)) { - quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; size_t id = 0; - size_t rs = nb0 * (ne00 / WSP_GGML_BLCK_SIZE[dst->type]); + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { @@ -8115,12 +8559,12 @@ static void wsp_ggml_compute_forward_dup_f32( } } } else { - WSP_GGML_ASSERT(false); // TODO: implement + GGML_ASSERT(false); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); - if (dst->type == WSP_GGML_TYPE_F32) { + if (dst->type == GGML_TYPE_F32) { size_t id = 0; float * dst_ptr = (float *) dst->data; @@ -8138,9 +8582,9 @@ static void wsp_ggml_compute_forward_dup_f32( id += ne00 * (ne01 - ir1); } } - } else if (dst->type == WSP_GGML_TYPE_F16) { + } else if (dst->type == GGML_TYPE_F16) { size_t id = 0; - wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) dst->data; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { @@ -8149,7 +8593,7 @@ static void wsp_ggml_compute_forward_dup_f32( for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = WSP_GGML_FP32_TO_FP16(*src0_ptr); + dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); id++; } } @@ -8157,7 +8601,7 @@ static void wsp_ggml_compute_forward_dup_f32( } } } else { - 
WSP_GGML_ASSERT(false); // TODO: implement + GGML_ASSERT(false); // TODO: implement } } @@ -8171,7 +8615,7 @@ static void wsp_ggml_compute_forward_dup_f32( int64_t i12 = 0; int64_t i13 = 0; - if (dst->type == WSP_GGML_TYPE_F32) { + if (dst->type == GGML_TYPE_F32) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { i10 += ne00 * ir0; @@ -8223,7 +8667,7 @@ static void wsp_ggml_compute_forward_dup_f32( } } } - } else if (dst->type == WSP_GGML_TYPE_F16) { + } else if (dst->type == GGML_TYPE_F16) { for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { i10 += ne00 * ir0; @@ -8244,7 +8688,7 @@ static void wsp_ggml_compute_forward_dup_f32( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(wsp_ggml_fp16_t *) dst_ptr = WSP_GGML_FP32_TO_FP16(*(const float *) src0_ptr); + *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -8276,56 +8720,56 @@ static void wsp_ggml_compute_forward_dup_f32( } } } else { - WSP_GGML_ASSERT(false); // TODO: implement + GGML_ASSERT(false); // TODO: implement } } -static void wsp_ggml_compute_forward_dup( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - if (wsp_ggml_is_contiguous(src0) && wsp_ggml_is_contiguous(dst) && src0->type == dst->type) { - wsp_ggml_compute_forward_dup_same_cont(params, src0, dst); +static void ggml_compute_forward_dup( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + ggml_compute_forward_dup_same_cont(params, src0, dst); return; } switch (src0->type) { - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_dup_f16(params, src0, dst); + ggml_compute_forward_dup_f16(params, src0, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_dup_f32(params, src0, dst); + ggml_compute_forward_dup_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_add +// ggml_compute_forward_add -static void wsp_ggml_compute_forward_add_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, src1) && wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_add_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; - WSP_GGML_ASSERT( nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8336,23 +8780,23 @@ static void 
wsp_ggml_compute_forward_add_f32( if (nb10 == sizeof(float)) { for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; -#ifdef WSP_GGML_USE_ACCELERATE - vDSP_vadd( - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, - ne0); + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + +#ifdef GGML_USE_ACCELERATE + vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); #else - wsp_ggml_vec_add_f32(ne0, - (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), - (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), - (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr); #endif // } // } @@ -8360,15 +8804,20 @@ static void wsp_ggml_compute_forward_add_f32( } else { // src1 is not contiguous for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i0 = 0; i0 < ne0; i0++) { - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10); dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; } @@ -8376,30 +8825,30 @@ static void wsp_ggml_compute_forward_add_f32( } } -static void wsp_ggml_compute_forward_add_f16_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, src1) && wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_add_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int ith = params->ith; 
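/*
 * Worked example for the broadcasting change above (illustrative aside, not
 * part of the patch): the old code asserted that src0 and src1 had the same
 * shape, so row indices matched one-to-one. With ggml_can_repeat_rows, a src1
 * of shape [ne00, 1, 1, 1] can be added to a src0 of shape [ne00, 32, 1, 1]:
 * for every row i01 the source row becomes i11 = i01 % ne11 = i01 % 1 = 0,
 * i.e. the single src1 row is reused for all 32 rows of src0.
 */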
const int nth = params->nth; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; - WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT(dst->type == WSP_GGML_TYPE_F16); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); - WSP_GGML_ASSERT( nb0 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t)); + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8415,45 +8864,45 @@ static void wsp_ggml_compute_forward_add_f16_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); } } } else { // src1 is not contiguous - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } } -static void wsp_ggml_compute_forward_add_f16_f16( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, src1) && wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_add_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; - WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F16); - WSP_GGML_ASSERT(dst->type == WSP_GGML_TYPE_F16); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); - WSP_GGML_ASSERT( nb0 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t)); + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8462,62 +8911,62 @@ static void wsp_ggml_compute_forward_add_f16_f16( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - if (nb10 == sizeof(wsp_ggml_fp16_t)) { + if (nb10 == sizeof(ggml_fp16_t)) { for (int ir = ir0; ir < ir1; ++ir) { // src0, src1 and dst are same shape => same indices const int i3 = ir/(ne2*ne1); const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - wsp_ggml_fp16_t * dst_ptr = 
(wsp_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - wsp_ggml_fp16_t * src1_ptr = (wsp_ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(src0_ptr[i]) + WSP_GGML_FP16_TO_FP32(src1_ptr[i])); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i])); } } } else { // src1 is not contiguous - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } } -static void wsp_ggml_compute_forward_add_q_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, src1) && wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_add_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; - const enum wsp_ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 or src1 - WSP_GGML_ASSERT(nb00 == WSP_GGML_TYPE_SIZE[type]); - WSP_GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted - WSP_GGML_ASSERT(nb0 <= nb1); - WSP_GGML_ASSERT(nb1 <= nb2); - WSP_GGML_ASSERT(nb2 <= nb3); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); - WSP_GGML_ASSERT(wsp_ggml_is_quantized(src0->type)); - WSP_GGML_ASSERT(dst->type == src0->type); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32); + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8552,77 +9001,77 @@ static void wsp_ggml_compute_forward_add_q_f32( // unquantize row from src0 to temp buffer dequantize_row_q(src0_row, wdata, ne00); // add src1 - wsp_ggml_vec_acc_f32(ne00, wdata, src1_row); + ggml_vec_acc_f32(ne00, wdata, src1_row); // quantize row to dst quantize_row_q(wdata, dst_row, ne00); } } -static void wsp_ggml_compute_forward_add( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void 
ggml_compute_forward_add( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_add_f32(params, src0, src1, dst); + ggml_compute_forward_add_f32(params, src0, src1, dst); } break; - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - if (src1->type == WSP_GGML_TYPE_F16) { - wsp_ggml_compute_forward_add_f16_f16(params, src0, src1, dst); + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add_f16_f16(params, src0, src1, dst); } - else if (src1->type == WSP_GGML_TYPE_F32) { - wsp_ggml_compute_forward_add_f16_f32(params, src0, src1, dst); + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f16_f32(params, src0, src1, dst); } else { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } } break; - case WSP_GGML_TYPE_Q4_0: - case WSP_GGML_TYPE_Q4_1: - case WSP_GGML_TYPE_Q5_0: - case WSP_GGML_TYPE_Q5_1: - case WSP_GGML_TYPE_Q8_0: - case WSP_GGML_TYPE_Q2_K: - case WSP_GGML_TYPE_Q3_K: - case WSP_GGML_TYPE_Q4_K: - case WSP_GGML_TYPE_Q5_K: - case WSP_GGML_TYPE_Q6_K: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: { - wsp_ggml_compute_forward_add_q_f32(params, src0, src1, dst); + ggml_compute_forward_add_q_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_add1 +// ggml_compute_forward_add1 -static void wsp_ggml_compute_forward_add1_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); - WSP_GGML_ASSERT(wsp_ggml_is_scalar(src1)); +static void ggml_compute_forward_add1_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; - WSP_GGML_ASSERT( nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8637,8 +9086,8 @@ static void wsp_ggml_compute_forward_add1_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); -#ifdef WSP_GGML_USE_ACCELERATE - UNUSED(wsp_ggml_vec_add1_f32); +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_add1_f32); vDSP_vadd( (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, @@ -8646,7 +9095,7 @@ static void wsp_ggml_compute_forward_add1_f32( (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, ne0); #else - wsp_ggml_vec_add1_f32(ne0, + ggml_vec_add1_f32(ne0, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), *(float *) src1->data); @@ -8654,15 
+9103,15 @@ static void wsp_ggml_compute_forward_add1_f32( } } -static void wsp_ggml_compute_forward_add1_f16_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); - WSP_GGML_ASSERT(wsp_ggml_is_scalar(src1)); +static void ggml_compute_forward_add1_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -8672,16 +9121,16 @@ static void wsp_ggml_compute_forward_add1_f16_f32( const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; - WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT(dst->type == WSP_GGML_TYPE_F16); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F16); - WSP_GGML_ASSERT( nb0 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t)); + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8696,42 +9145,42 @@ static void wsp_ggml_compute_forward_add1_f16_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); } } } -static void wsp_ggml_compute_forward_add1_f16_f16( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); - WSP_GGML_ASSERT(wsp_ggml_is_scalar(src1)); +static void ggml_compute_forward_add1_f16_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } // scalar to add - const float v = WSP_GGML_FP16_TO_FP32(*(wsp_ggml_fp16_t *) src1->data); + const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; - WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16); - 
WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F16); - WSP_GGML_ASSERT(dst->type == WSP_GGML_TYPE_F16); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F16); - WSP_GGML_ASSERT( nb0 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t)); + GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8746,23 +9195,23 @@ static void wsp_ggml_compute_forward_add1_f16_f16( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); - wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); } } } -static void wsp_ggml_compute_forward_add1_q_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); - WSP_GGML_ASSERT(wsp_ggml_is_scalar(src1)); +static void ggml_compute_forward_add1_q_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -8772,25 +9221,25 @@ static void wsp_ggml_compute_forward_add1_q_f32( const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; - const enum wsp_ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; + ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 - WSP_GGML_ASSERT(nb00 == WSP_GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == ggml_type_size(type)); // dst cannot be transposed or permuted - WSP_GGML_ASSERT(nb0 <= nb1); - WSP_GGML_ASSERT(nb1 <= nb2); - WSP_GGML_ASSERT(nb2 <= nb3); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); - WSP_GGML_ASSERT(wsp_ggml_is_quantized(src0->type)); - WSP_GGML_ASSERT(dst->type == src0->type); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32); + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(src1->type == GGML_TYPE_F32); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8815,112 +9264,108 @@ static void wsp_ggml_compute_forward_add1_q_f32( // unquantize row from src0 to temp buffer dequantize_row_q(src0_row, wdata, ne0); // add src1 - wsp_ggml_vec_acc1_f32(ne0, wdata, v); + ggml_vec_acc1_f32(ne0, 
wdata, v); // quantize row to dst quantize_row_q(wdata, dst_row, ne0); } } -static void wsp_ggml_compute_forward_add1( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_add1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_add1_f32(params, src0, src1, dst); + ggml_compute_forward_add1_f32(params, src0, src1, dst); } break; - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - if (src1->type == WSP_GGML_TYPE_F16) { - wsp_ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); + if (src1->type == GGML_TYPE_F16) { + ggml_compute_forward_add1_f16_f16(params, src0, src1, dst); } - else if (src1->type == WSP_GGML_TYPE_F32) { - wsp_ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add1_f16_f32(params, src0, src1, dst); } else { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } } break; - case WSP_GGML_TYPE_Q4_0: - case WSP_GGML_TYPE_Q4_1: - case WSP_GGML_TYPE_Q5_0: - case WSP_GGML_TYPE_Q5_1: - case WSP_GGML_TYPE_Q8_0: - case WSP_GGML_TYPE_Q8_1: - case WSP_GGML_TYPE_Q2_K: - case WSP_GGML_TYPE_Q3_K: - case WSP_GGML_TYPE_Q4_K: - case WSP_GGML_TYPE_Q5_K: - case WSP_GGML_TYPE_Q6_K: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: { - wsp_ggml_compute_forward_add1_q_f32(params, src0, src1, dst); + ggml_compute_forward_add1_q_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_acc +// ggml_compute_forward_acc -static void wsp_ggml_compute_forward_acc_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst) && wsp_ggml_is_contiguous(src0)); - - WSP_GGML_ASSERT(opt0->type == WSP_GGML_TYPE_I32); - WSP_GGML_ASSERT(wsp_ggml_nelements(opt0) == 5); +static void ggml_compute_forward_acc_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); // view src0 and dst with these strides and data offset inbytes during acc // nb0 is implicitely element_size because src0 and dst are contiguous - size_t nb1 = ((int32_t *) opt0->data)[0]; - size_t nb2 = ((int32_t *) opt0->data)[1]; - size_t nb3 = ((int32_t *) opt0->data)[2]; - size_t offset = ((int32_t *) opt0->data)[3]; - bool inplace = (bool) ((int32_t *) opt0->data)[4]; + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - if (!inplace && (params->type == WSP_GGML_TASK_INIT)) { + if (!inplace && (params->type == 
GGML_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase memcpy( ((char *) dst->data), ((char *) src0->data), - wsp_ggml_nbytes(dst)); + ggml_nbytes(dst)); } - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(src1); + const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); // src0 and dst as viewed during acc - const size_t nb0 = wsp_ggml_element_size(src0); + const size_t nb0 = ggml_element_size(src0); const size_t nb00 = nb0; const size_t nb01 = nb1; const size_t nb02 = nb2; const size_t nb03 = nb3; - WSP_GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < wsp_ggml_nbytes(dst)); - WSP_GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < wsp_ggml_nbytes(src0)); + GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb0 + (ne11 == 0 ? 0 : ne11-1)*nb1 + (ne12 == 0 ? 0 : ne12-1)*nb2 + (ne13 == 0 ? 0 : ne13-1)*nb3 < ggml_nbytes(dst)); + GGML_ASSERT(offset + (ne10 == 0 ? 0 : ne10-1)*nb00 + (ne11 == 0 ? 0 : ne11-1)*nb01 + (ne12 == 0 ? 0 : ne12-1)*nb02 + (ne13 == 0 ? 0 : ne13-1)*nb03 < ggml_nbytes(src0)); - WSP_GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -8936,13 +9381,13 @@ static void wsp_ggml_compute_forward_acc_f32( const int i2 = (ir - i3*ne12*ne11)/ne11; const int i1 = (ir - i3*ne12*ne11 - i2*ne11); -#ifdef WSP_GGML_USE_ACCELERATE +#ifdef GGML_USE_ACCELERATE vDSP_vadd( (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), 1, (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), 1, nc); #else - wsp_ggml_vec_add_f32(nc, + ggml_vec_add_f32(nc, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + offset), (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); @@ -8950,57 +9395,56 @@ static void wsp_ggml_compute_forward_acc_f32( } } -static void wsp_ggml_compute_forward_acc( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_acc( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: - { - wsp_ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst); - } break; - case WSP_GGML_TYPE_F16: - case WSP_GGML_TYPE_Q4_0: - case WSP_GGML_TYPE_Q4_1: - case WSP_GGML_TYPE_Q5_0: - case WSP_GGML_TYPE_Q5_1: - case WSP_GGML_TYPE_Q8_0: - case WSP_GGML_TYPE_Q8_1: - case WSP_GGML_TYPE_Q2_K: - case WSP_GGML_TYPE_Q3_K: - case WSP_GGML_TYPE_Q4_K: - case WSP_GGML_TYPE_Q5_K: - case WSP_GGML_TYPE_Q6_K: + case GGML_TYPE_F32: + { + ggml_compute_forward_acc_f32(params, src0, src1, dst); + } 
break; + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_sub +// ggml_compute_forward_sub -static void wsp_ggml_compute_forward_sub_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sub_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, src1) && wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; - WSP_GGML_ASSERT( nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); if (nb10 == sizeof(float)) { for (int ir = 0; ir < nr; ++ir) { @@ -9010,14 +9454,14 @@ static void wsp_ggml_compute_forward_sub_f32( const int i1 = (ir - i3*ne2*ne1 - i2*ne1); -#ifdef WSP_GGML_USE_ACCELERATE +#ifdef GGML_USE_ACCELERATE vDSP_vsub( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, ne0); #else - wsp_ggml_vec_sub_f32(ne0, + ggml_vec_sub_f32(ne0, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); @@ -9044,54 +9488,54 @@ static void wsp_ggml_compute_forward_sub_f32( } } -static void wsp_ggml_compute_forward_sub( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sub( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_sub_f32(params, src0, src1, dst); + ggml_compute_forward_sub_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_mul +// ggml_compute_forward_mul -static void wsp_ggml_compute_forward_mul_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_can_repeat_rows(src1, src0) && wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_mul_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst)); - if 
(params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; -#ifdef WSP_GGML_USE_CLBLAST - if (src1->backend == WSP_GGML_BACKEND_GPU) { +#ifdef GGML_USE_CLBLAST + if (src1->backend == GGML_BACKEND_GPU) { if (ith == 0) { - wsp_ggml_cl_mul(src0, src1, dst); + ggml_cl_mul(src0, src1, dst); } return; } #endif - const int64_t nr = wsp_ggml_nrows(src0); + const int64_t nr = ggml_nrows(src0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; - WSP_GGML_ASSERT( nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb00 == sizeof(float)); - WSP_GGML_ASSERT(ne00 == ne10); + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(ne00 == ne10); if (nb10 == sizeof(float)) { for (int64_t ir = ith; ir < nr; ir += nth) { @@ -9108,12 +9552,12 @@ static void wsp_ggml_compute_forward_mul_f32( float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); -#ifdef WSP_GGML_USE_ACCELERATE - UNUSED(wsp_ggml_vec_mul_f32); +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_mul_f32); vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00); #else - wsp_ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr); + ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr); #endif // } // } @@ -9143,43 +9587,45 @@ static void wsp_ggml_compute_forward_mul_f32( } } -static void wsp_ggml_compute_forward_mul( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_mul( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); + switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_mul_f32(params, src0, src1, dst); + ggml_compute_forward_mul_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_div +// ggml_compute_forward_div -static void wsp_ggml_compute_forward_div_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_div_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, src1) && wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; - WSP_GGML_ASSERT( nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); if (nb10 == sizeof(float)) { for (int ir = 0; ir < nr; ++ir) { @@ -9189,14 +9635,16 @@ static void wsp_ggml_compute_forward_div_f32( const int 
i1 = (ir - i3*ne2*ne1 - i2*ne1); -#ifdef WSP_GGML_USE_ACCELERATE +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_div_f32); + vDSP_vdiv( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), 1, ne0); #else - wsp_ggml_vec_div_f32(ne0, + ggml_vec_div_f32(ne0, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ), (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); @@ -9223,176 +9671,176 @@ static void wsp_ggml_compute_forward_div_f32( } } -static void wsp_ggml_compute_forward_div( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_div( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_div_f32(params, src0, src1, dst); + ggml_compute_forward_div_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_sqr +// ggml_compute_forward_sqr -static void wsp_ggml_compute_forward_sqr_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sqr_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; assert( dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_sqr_f32(nc, + ggml_vec_sqr_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_sqr( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sqr( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_sqr_f32(params, src0, dst); + ggml_compute_forward_sqr_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_sqrt +// ggml_compute_forward_sqrt -static void wsp_ggml_compute_forward_sqrt_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sqrt_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || 
params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; assert( dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_sqrt_f32(nc, + ggml_vec_sqrt_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_sqrt( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sqrt( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_sqrt_f32(params, src0, dst); + ggml_compute_forward_sqrt_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_log +// ggml_compute_forward_log -static void wsp_ggml_compute_forward_log_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(params->ith == 0); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_log_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; - WSP_GGML_ASSERT( dst->nb[0] == sizeof(float)); - WSP_GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_log_f32(nc, + ggml_vec_log_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_log( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_log( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_log_f32(params, src0, dst); + ggml_compute_forward_log_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_sum +// ggml_compute_forward_sum -static void wsp_ggml_compute_forward_sum_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sum_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_is_scalar(dst)); + assert(ggml_is_scalar(dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - assert(wsp_ggml_is_scalar(dst)); + assert(ggml_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); - WSP_GGML_TENSOR_LOCALS(int64_t, 
ne0, src0, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); - wsp_ggml_float sum = 0; - wsp_ggml_float row_sum = 0; + ggml_float sum = 0; + ggml_float row_sum = 0; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - wsp_ggml_vec_sum_ggf(ne00, + ggml_vec_sum_f32_ggf(ne00, &row_sum, (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); sum += row_sum; @@ -9402,88 +9850,124 @@ static void wsp_ggml_compute_forward_sum_f32( ((float *) dst->data)[0] = sum; } -static void wsp_ggml_compute_forward_sum( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sum_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(ggml_is_scalar(dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(ggml_fp16_t)); + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb); + + float sum = 0; + float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f16_ggf(ne00, + &row_sum, + (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; + } + } + } + ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); +} + +static void ggml_compute_forward_sum( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: + { + ggml_compute_forward_sum_f32(params, src0, dst); + } break; + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_sum_f32(params, src0, dst); + ggml_compute_forward_sum_f16(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_sum_rows +// ggml_compute_forward_sum_rows -static void wsp_ggml_compute_forward_sum_rows_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(params->ith == 0); +static void ggml_compute_forward_sum_rows_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - WSP_GGML_ASSERT(src0->nb[0] == sizeof(float)); - WSP_GGML_ASSERT(dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(dst->nb[0] == sizeof(float)); - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; - WSP_GGML_ASSERT(ne0 == 1); - WSP_GGML_ASSERT(ne1 == ne01); - WSP_GGML_ASSERT(ne2 == ne02); - WSP_GGML_ASSERT(ne3 == ne03); + GGML_ASSERT(ne0 == 1); + GGML_ASSERT(ne1 == ne01); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); for (int64_t i3 = 0; i3 < ne03; i3++) { for (int64_t i2 = 0; i2 < ne02; i2++) { for (int64_t i1 = 0; i1 < ne01; i1++) { - float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); - float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + 
i3*nb3); + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); float row_sum = 0; - wsp_ggml_vec_sum_f32(ne00, &row_sum, src_row); + ggml_vec_sum_f32(ne00, &row_sum, src_row); dst_row[0] = row_sum; } } } } -static void wsp_ggml_compute_forward_sum_rows( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sum_rows( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_sum_rows_f32(params, src0, dst); + ggml_compute_forward_sum_rows_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_mean +// ggml_compute_forward_mean -static void wsp_ggml_compute_forward_mean_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_mean_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } assert(src0->nb[0] == sizeof(float)); - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; assert(ne0 == 1); assert(ne1 == ne01); @@ -9498,7 +9982,7 @@ static void wsp_ggml_compute_forward_mean_f32( for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - wsp_ggml_vec_sum_f32(ne00, + ggml_vec_sum_f32(ne00, (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); @@ -9508,31 +9992,31 @@ static void wsp_ggml_compute_forward_mean_f32( } } -static void wsp_ggml_compute_forward_mean( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_mean( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_mean_f32(params, src0, dst); + ggml_compute_forward_mean_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_argmax +// ggml_compute_forward_argmax -static void wsp_ggml_compute_forward_argmax_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_argmax_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -9549,51 +10033,51 @@ static void wsp_ggml_compute_forward_argmax_f32( float * src = (float *) ((char *) src0->data + i1*nb01); int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); int v = 0; - wsp_ggml_vec_argmax_f32(ne00, &v, src); + ggml_vec_argmax_f32(ne00, &v, src); 
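
As an aside for readers of these hunks: nearly every multi-threaded op in this part of the diff (add, add1, acc, gelu, silu, ...) splits work over rows with the same ceil-divide pattern, `dr = (nr + nth - 1)/nth`, `ir0 = dr*ith`, `ir1 = MIN(ir0 + dr, nr)`. The following is a minimal standalone sketch of that pattern only, not part of the upstream diff; the row/thread counts are made up for illustration.

    /* Sketch of the per-thread row partitioning used by the ops above.
     * Names (nr, nth, ith, dr, ir0, ir1) mirror the diff; the sizes are
     * hypothetical demo values. */
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        const int nr  = 10;  // total rows (hypothetical)
        const int nth = 4;   // number of threads (hypothetical)

        for (int ith = 0; ith < nth; ith++) {
            const int dr  = (nr + nth - 1)/nth;   // rows per thread, rounded up
            const int ir0 = dr*ith;               // first row for this thread
            const int ir1 = MIN(ir0 + dr, nr);    // one past the last row
            printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
        return 0;
    }

With nr = 10 and nth = 4 this yields the chunks [0,3), [3,6), [6,9), [9,10): the last thread simply gets the remainder, which is why the `MIN` clamp appears in every op.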
dst_[0] = v; } } -static void wsp_ggml_compute_forward_argmax( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_argmax( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_argmax_f32(params, src0, dst); + ggml_compute_forward_argmax_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_repeat +// ggml_compute_forward_repeat -static void wsp_ggml_compute_forward_repeat_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(params->ith == 0); - WSP_GGML_ASSERT(wsp_ggml_can_repeat(src0, dst)); +static void ggml_compute_forward_repeat_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; - // guaranteed to be an integer due to the check in wsp_ggml_can_repeat + // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne0/ne00); const int nr1 = (int)(ne1/ne01); const int nr2 = (int)(ne2/ne02); const int nr3 = (int)(ne3/ne03); // TODO: support for transposed / permuted tensors - WSP_GGML_ASSERT(nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); // TODO: maybe this is not optimal? 
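
The nested copy loop that follows tiles `src0` across `dst` `nr0 x nr1 x nr2 x nr3` times, where `nr* = ne*/ne0*` is guaranteed to be an integer by `ggml_can_repeat`. A minimal 2-D sketch of the same tiling, reduced to plain `memcpy` with hypothetical sizes (not part of the upstream diff), is:

    /* Sketch of the repeat tiling: a 2x2 src is replicated into a 4x4 dst.
     * Loop roles mirror the diff: i1/i0 index the tile, k1 indexes the row
     * inside the src tile. */
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const int ne00 = 2, ne01 = 2;   // src is 2x2 (hypothetical)
        const int ne0  = 4, ne1  = 4;   // dst is 4x4 -> repeated 2x2 times
        const int nr0 = ne0/ne00, nr1 = ne1/ne01;

        const float src[2*2] = { 1, 2,
                                 3, 4 };
        float dst[4*4];

        for (int i1 = 0; i1 < nr1; i1++)      // tile index along rows
        for (int k1 = 0; k1 < ne01; k1++)     // row within the src tile
        for (int i0 = 0; i0 < nr0; i0++)      // tile index along columns
            memcpy(&dst[(i1*ne01 + k1)*ne0 + i0*ne00],
                   &src[k1*ne00],
                   ne00*sizeof(float));

        for (int r = 0; r < ne1; r++) {
            for (int c = 0; c < ne0; c++) printf("%g ", dst[r*ne0 + c]);
            printf("\n");
        }
        return 0;
    }

The real op does the same thing over four dimensions and uses `ggml_vec_cpy_f32` per row instead of `memcpy`, which is what the TODO above refers to when it questions whether the loop order is optimal.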
for (int i3 = 0; i3 < nr3; i3++) { @@ -9603,7 +10087,7 @@ static void wsp_ggml_compute_forward_repeat_f32( for (int i1 = 0; i1 < nr1; i1++) { for (int k1 = 0; k1 < ne01; k1++) { for (int i0 = 0; i0 < nr0; i0++) { - wsp_ggml_vec_cpy_f32(ne00, + ggml_vec_cpy_f32(ne00, (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); } @@ -9615,54 +10099,54 @@ static void wsp_ggml_compute_forward_repeat_f32( } } -static void wsp_ggml_compute_forward_repeat( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_repeat( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_repeat_f32(params, src0, dst); + ggml_compute_forward_repeat_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_repeat_back +// ggml_compute_forward_repeat_back -static void wsp_ggml_compute_forward_repeat_back_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(params->ith == 0); - WSP_GGML_ASSERT(wsp_ggml_can_repeat(dst, src0)); +static void ggml_compute_forward_repeat_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(dst, src0)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; - // guaranteed to be an integer due to the check in wsp_ggml_can_repeat + // guaranteed to be an integer due to the check in ggml_can_repeat const int nr0 = (int)(ne00/ne0); const int nr1 = (int)(ne01/ne1); const int nr2 = (int)(ne02/ne2); const int nr3 = (int)(ne03/ne3); // TODO: support for transposed / permuted tensors - WSP_GGML_ASSERT(nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); - if (wsp_ggml_is_contiguous(dst)) { - wsp_ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + if (ggml_is_contiguous(dst)) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); } else { for (int k3 = 0; k3 < ne3; k3++) { for (int k2 = 0; k2 < ne2; k2++) { for (int k1 = 0; k1 < ne1; k1++) { - wsp_ggml_vec_set_f32(ne0, + ggml_vec_set_f32(ne0, (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), 0); } @@ -9678,7 +10162,7 @@ static void wsp_ggml_compute_forward_repeat_back_f32( for (int i1 = 0; i1 < nr1; i1++) { for (int k1 = 0; k1 < ne1; k1++) { for (int i0 = 0; i0 < nr0; i0++) { - wsp_ggml_vec_acc_f32(ne0, + ggml_vec_acc_f32(ne0, (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); } @@ -9690,327 +10174,393 @@ static void wsp_ggml_compute_forward_repeat_back_f32( } } -static void wsp_ggml_compute_forward_repeat_back( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_repeat_back( + 
const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_repeat_back_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_concat + +static void ggml_compute_forward_concat_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + + GGML_TENSOR_BINARY_OP_LOCALS; + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ith; i2 < ne2; i2++) { + if (i2 < ne02) { // src0 + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03); + + float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); + *y = *x; + } + } + } // src1 + else { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13); + + float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3); + *y = *x; + } + } + } + } + } +} + +static void ggml_compute_forward_concat( + const struct ggml_compute_params* params, + const struct ggml_tensor* src0, + const struct ggml_tensor* src1, + struct ggml_tensor* dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_repeat_back_f32(params, src0, dst); + ggml_compute_forward_concat_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_abs +// ggml_compute_forward_abs -static void wsp_ggml_compute_forward_abs_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_abs_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_abs_f32(nc, + ggml_vec_abs_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_abs( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_abs( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_abs_f32(params, src0, dst); + 
ggml_compute_forward_abs_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_sgn +// ggml_compute_forward_sgn -static void wsp_ggml_compute_forward_sgn_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sgn_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_sgn_f32(nc, + ggml_vec_sgn_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_sgn( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_sgn( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_sgn_f32(params, src0, dst); + ggml_compute_forward_sgn_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_neg +// ggml_compute_forward_neg -static void wsp_ggml_compute_forward_neg_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_neg_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_neg_f32(nc, + ggml_vec_neg_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_neg( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_neg( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_neg_f32(params, src0, dst); + ggml_compute_forward_neg_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_step +// ggml_compute_forward_step -static void wsp_ggml_compute_forward_step_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct 
wsp_ggml_tensor * dst) { +static void ggml_compute_forward_step_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_step_f32(nc, + ggml_vec_step_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_step( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_step( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_step_f32(params, src0, dst); + ggml_compute_forward_step_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_tanh +// ggml_compute_forward_tanh -static void wsp_ggml_compute_forward_tanh_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_tanh_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_tanh_f32(nc, + ggml_vec_tanh_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_tanh( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_tanh( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_tanh_f32(params, src0, dst); + ggml_compute_forward_tanh_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_elu +// ggml_compute_forward_elu -static void wsp_ggml_compute_forward_elu_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_elu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, dst)); - if (params->type == 
WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_elu_f32(nc, + ggml_vec_elu_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_elu( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_elu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_elu_f32(params, src0, dst); + ggml_compute_forward_elu_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_relu +// ggml_compute_forward_relu -static void wsp_ggml_compute_forward_relu_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_relu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, dst)); + assert(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; assert(dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { - wsp_ggml_vec_relu_f32(nc, + ggml_vec_relu_f32(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_relu( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_relu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_relu_f32(params, src0, dst); + ggml_compute_forward_relu_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_gelu +// ggml_compute_forward_gelu -static void wsp_ggml_compute_forward_gelu_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_gelu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == 
GGML_TASK_FINALIZE) { return; } @@ -10018,7 +10568,7 @@ static void wsp_ggml_compute_forward_gelu_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -10028,7 +10578,7 @@ static void wsp_ggml_compute_forward_gelu_f32( const int ir1 = MIN(ir0 + dr, nr); for (int i1 = ir0; i1 < ir1; i1++) { - wsp_ggml_vec_gelu_f32(nc, + ggml_vec_gelu_f32(nc, (float *) ((char *) dst->data + i1*( dst->nb[1])), (float *) ((char *) src0->data + i1*(src0->nb[1]))); @@ -10043,33 +10593,33 @@ static void wsp_ggml_compute_forward_gelu_f32( } } -static void wsp_ggml_compute_forward_gelu( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_gelu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_gelu_f32(params, src0, dst); + ggml_compute_forward_gelu_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_gelu_quick +// ggml_compute_forward_gelu_quick -static void wsp_ggml_compute_forward_gelu_quick_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_gelu_quick_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -10077,7 +10627,7 @@ static void wsp_ggml_compute_forward_gelu_quick_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -10087,7 +10637,7 @@ static void wsp_ggml_compute_forward_gelu_quick_f32( const int ir1 = MIN(ir0 + dr, nr); for (int i1 = ir0; i1 < ir1; i1++) { - wsp_ggml_vec_gelu_quick_f32(nc, + ggml_vec_gelu_quick_f32(nc, (float *) ((char *) dst->data + i1*( dst->nb[1])), (float *) ((char *) src0->data + i1*(src0->nb[1]))); @@ -10102,33 +10652,33 @@ static void wsp_ggml_compute_forward_gelu_quick_f32( } } -static void wsp_ggml_compute_forward_gelu_quick( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_gelu_quick( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_gelu_quick_f32(params, src0, dst); + ggml_compute_forward_gelu_quick_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_silu +// ggml_compute_forward_silu -static void wsp_ggml_compute_forward_silu_f32( - const struct wsp_ggml_compute_params * 
params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_silu_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -10136,7 +10686,7 @@ static void wsp_ggml_compute_forward_silu_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -10146,7 +10696,7 @@ static void wsp_ggml_compute_forward_silu_f32( const int ir1 = MIN(ir0 + dr, nr); for (int i1 = ir0; i1 < ir1; i1++) { - wsp_ggml_vec_silu_f32(nc, + ggml_vec_silu_f32(nc, (float *) ((char *) dst->data + i1*( dst->nb[1])), (float *) ((char *) src0->data + i1*(src0->nb[1]))); @@ -10161,37 +10711,36 @@ static void wsp_ggml_compute_forward_silu_f32( } } -static void wsp_ggml_compute_forward_silu( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_silu( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_silu_f32(params, src0, dst); + ggml_compute_forward_silu_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } +// ggml_compute_forward_silu_back -// wsp_ggml_compute_forward_silu_back +static void ggml_compute_forward_silu_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * grad, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src0, grad)); -static void wsp_ggml_compute_forward_silu_back_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * grad, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(grad)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, grad)); - - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -10199,7 +10748,7 @@ static void wsp_ggml_compute_forward_silu_back_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -10209,7 +10758,7 @@ static void wsp_ggml_compute_forward_silu_back_f32( const int ir1 = MIN(ir0 + dr, nr); for (int i1 = ir0; i1 < ir1; i1++) { - wsp_ggml_vec_silu_backward_f32(nc, + 
ggml_vec_silu_backward_f32(nc, (float *) ((char *) dst->data + i1*( dst->nb[1])), (float *) ((char *) src0->data + i1*(src0->nb[1])), (float *) ((char *) grad->data + i1*(grad->nb[1]))); @@ -10225,43 +10774,44 @@ static void wsp_ggml_compute_forward_silu_back_f32( } } -static void wsp_ggml_compute_forward_silu_back( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * grad, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_silu_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * grad, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_silu_back_f32(params, src0, grad, dst); + ggml_compute_forward_silu_back_f32(params, src0, grad, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_norm +// ggml_compute_forward_norm -static void wsp_ggml_compute_forward_norm_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - WSP_GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; const int nth = params->nth; - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; - const float eps = 1e-5f; // TODO: make this a parameter + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -10269,65 +10819,68 @@ static void wsp_ggml_compute_forward_norm_f32( for (int64_t i01 = ith; i01 < ne01; i01 += nth) { const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - wsp_ggml_float sum = 0.0; + ggml_float sum = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (wsp_ggml_float)x[i00]; + sum += (ggml_float)x[i00]; } float mean = sum/ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - wsp_ggml_float sum2 = 0.0; + ggml_float sum2 = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { float v = x[i00] - mean; y[i00] = v; - sum2 += (wsp_ggml_float)(v*v); + sum2 += (ggml_float)(v*v); } float variance = sum2/ne00; const float scale = 1.0f/sqrtf(variance + eps); - wsp_ggml_vec_scale_f32(ne00, y, scale); + ggml_vec_scale_f32(ne00, y, scale); } } } } -static void wsp_ggml_compute_forward_norm( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_norm_f32(params, src0, dst); + ggml_compute_forward_norm_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -static void wsp_ggml_compute_forward_rms_norm_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, 
- struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); +// ggml_compute_forward_group_rms_norm + +static void ggml_compute_forward_rms_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - WSP_GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; const int nth = params->nth; - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; - const float eps = 1e-6f; // TODO: make this a parameter + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -10335,9 +10888,9 @@ static void wsp_ggml_compute_forward_rms_norm_f32( for (int64_t i01 = ith; i01 < ne01; i01 += nth) { const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - wsp_ggml_float sum = 0.0; + ggml_float sum = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (wsp_ggml_float)(x[i00] * x[i00]); + sum += (ggml_float)(x[i00] * x[i00]); } const float mean = sum/ne00; @@ -10351,48 +10904,48 @@ static void wsp_ggml_compute_forward_rms_norm_f32( const float scale = 1.0f/sqrtf(mean + eps); - wsp_ggml_vec_scale_f32(ne00, y, scale); + ggml_vec_scale_f32(ne00, y, scale); } } } } -static void wsp_ggml_compute_forward_rms_norm( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_rms_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_rms_norm_f32(params, src0, dst); + ggml_compute_forward_rms_norm_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } +static void ggml_compute_forward_rms_norm_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); -static void wsp_ggml_compute_forward_rms_norm_back_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst) && wsp_ggml_are_same_shape(src0, src1)); - - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - WSP_GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; const int nth = params->nth; - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; - const float eps = 1e-6f; // TODO: make this a parameter + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -10406,12 +10959,12 @@ static void wsp_ggml_compute_forward_rms_norm_back_f32( const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); - 
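// Illustrative sketch, not part of the upstream diff: the norm / rms_norm hunks above
// replace a hard-coded epsilon (1e-5f / 1e-6f) with a value read out of the operator's
// op_params blob via memcpy. A minimal standalone version of that pattern, assuming only
// that op_params is a small raw int32 array on the tensor (the toy_* names are
// placeholders, not real ggml API):

#include <string.h>
#include <stdint.h>

struct toy_tensor { int32_t op_params[8]; };      // stand-in for ggml_tensor

static void toy_set_eps(struct toy_tensor * t, float eps) {
    memcpy(t->op_params, &eps, sizeof(eps));      // store the float's bit pattern
}

static float toy_get_eps(const struct toy_tensor * t) {
    float eps;
    memcpy(&eps, t->op_params, sizeof(eps));      // mirrors the memcpy in the hunks above
    return eps;
}

// memcpy (rather than a pointer cast) keeps the read free of strict-aliasing issues;
// the graph-building side is assumed to have written eps into op_params the same way.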
wsp_ggml_float sum_xx = 0.0; - wsp_ggml_float sum_xdz = 0.0; + ggml_float sum_xx = 0.0; + ggml_float sum_xdz = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { - sum_xx += (wsp_ggml_float)(x[i00] * x[i00]); - sum_xdz += (wsp_ggml_float)(x[i00] * dz[i00]); + sum_xx += (ggml_float)(x[i00] * x[i00]); + sum_xdz += (ggml_float)(x[i00] * dz[i00]); } //const float mean = (float)(sum_xx)/ne00; @@ -10419,7 +10972,7 @@ static void wsp_ggml_compute_forward_rms_norm_back_f32( const float sum_eps = (float)(sum_xx) + eps*ne00; //const float mean_xdz = (float)(sum_xdz)/ne00; // we could cache rms from forward pass to improve performance. - // to do this implement wsp_ggml_rms and compose wsp_ggml_rms_norm using wsp_ggml_rms. + // to do this implement ggml_rms and compose ggml_rms_norm using ggml_rms. //const float rms = sqrtf(mean_eps); const float rrms = 1.0f / sqrtf(mean_eps); //const float scale = -rrms/(ne00 * mean_eps); // -1/(n*rms**3) @@ -10520,283 +11073,249 @@ static void wsp_ggml_compute_forward_rms_norm_back_f32( // dx := scale(dx, rrms) float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - wsp_ggml_vec_cpy_f32 (ne00, dx, x); - // wsp_ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); - wsp_ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); - wsp_ggml_vec_acc_f32 (ne00, dx, dz); - wsp_ggml_vec_scale_f32(ne00, dx, rrms); + ggml_vec_cpy_f32 (ne00, dx, x); + // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); + ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); + ggml_vec_acc_f32 (ne00, dx, dz); + ggml_vec_scale_f32(ne00, dx, rrms); } } } } -static void wsp_ggml_compute_forward_rms_norm_back( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_rms_norm_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); + ggml_compute_forward_rms_norm_back_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } +// ggml_compute_forward_group_norm -// wsp_ggml_compute_forward_mul_mat - -#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool wsp_ggml_compute_forward_mul_mat_use_blas( - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if (wsp_ggml_is_contiguous(src0) && - wsp_ggml_is_contiguous(src1) && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { +static void ggml_compute_forward_group_norm_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; } - return false; -} -#endif - -static void wsp_ggml_compute_forward_mul_mat_f32( - const struct 
wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - int64_t t0 = wsp_ggml_perf_time_us(); - UNUSED(t0); - - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; const int nth = params->nth; - assert(ne02 == ne12); - assert(ne03 == ne13); - assert(ne2 == ne12); - assert(ne3 == ne13); - - // we don't support permuted src0 or src1 - assert(nb00 == sizeof(float)); - assert(nb10 == sizeof(float)); - - // dst cannot be transposed or permuted - assert(nb0 == sizeof(float)); - assert(nb0 <= nb1); - assert(nb1 <= nb2); - assert(nb2 <= nb3); + GGML_TENSOR_UNARY_OP_LOCALS; - assert(ne0 == ne01); - assert(ne1 == ne11); - assert(ne2 == ne02); - assert(ne3 == ne03); + const float eps = 1e-6f; // TODO: make this a parameter - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows + // TODO: optimize -#if defined(WSP_GGML_USE_CLBLAST) - if (wsp_ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == WSP_GGML_TASK_COMPUTE) { - wsp_ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + int n_channels = src0->ne[2]; + int n_groups = dst->op_params[0]; + int n_channels_per_group = (n_channels + n_groups - 1) / n_groups; + for (int i = ith; i < n_groups; i+=nth) { + int start = i * n_channels_per_group; + int end = start + n_channels_per_group; + if (end > n_channels) { + end = n_channels; } - return; - } -#endif + int step = end - start; -#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) - if (wsp_ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } + for (int64_t i03 = 0; i03 < ne03; i03++) { + ggml_float sum = 0.0; + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); - if (params->type == WSP_GGML_TASK_INIT) { - return; - } + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)x[i00]; + } + } + } + float mean = sum / (ne00 * ne01 * step); + ggml_float sum2 = 0.0; - if (params->type == WSP_GGML_TASK_FINALIZE) { - return; - } + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); + for (int64_t i00 = 0; i00 < ne00; i00++) { + float v = x[i00] - mean; + y[i00] = v; + sum2 += (ggml_float)(v * v); + } + } } - } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (wsp_ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif + float variance = sum2 / (ne00 * ne01 * step); + const float scale = 1.0f / sqrtf(variance + eps); - if (params->type == WSP_GGML_TASK_INIT) { - return; + for (int64_t i02 = start; i02 < end; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + float * y = (float *)((char *) dst->data + i01 * nb1 + i02 * nb2 + i03 * nb3); + 
ggml_vec_scale_f32(ne00, y, scale); + } + } + } } +} - if (params->type == WSP_GGML_TASK_FINALIZE) { - return; +static void ggml_compute_forward_group_norm( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_group_norm_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; } +} - // parallelize by src0 rows using wsp_ggml_vec_dot_f32 - - // total rows in src0 - const int nr = ne01*ne02*ne03; +// ggml_compute_forward_mul_mat - // rows per thread - const int dr = (nr + nth - 1)/nth; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_compute_forward_mul_mat_use_blas( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + //const int64_t ne00 = src0->ne[0]; + //const int64_t ne01 = src0->ne[1]; - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + const int64_t ne10 = src1->ne[0]; - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; - for (int64_t ic = 0; ic < ne11; ++ic) { - // src1 indices - const int i13 = i03; - const int i12 = i02; - const int i11 = ic; + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - // dst indices - const int i0 = i01; - const int i1 = i11; - const int i2 = i02; - const int i3 = i03; - - wsp_ggml_vec_dot_f32(ne00, - (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), - (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); - } + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; } - //int64_t t1 = wsp_ggml_perf_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} + return false; } +#endif -static void wsp_ggml_compute_forward_mul_mat_f16_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - int64_t t0 = wsp_ggml_perf_time_us(); +static void ggml_compute_forward_mul_mat( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; - - //const int64_t ne = ne0*ne1*ne2*ne3; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; - WSP_GGML_ASSERT(ne02 == ne12); - WSP_GGML_ASSERT(ne03 == ne13); - 
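// Illustrative sketch, not part of the upstream diff: the newly added
// ggml_compute_forward_group_norm_f32 above normalizes each group of channels by that
// group's own mean and variance. A simplified single-threaded version for one contiguous
// float tensor of shape [n_channels][rows][cols] (the real kernel walks strides, handles
// the batch dimension, and splits groups across threads; toy_group_norm is a placeholder):

#include <math.h>
#include <stddef.h>

static void toy_group_norm(float * x, int n_channels, int rows, int cols,
                           int n_groups, float eps) {
    const int    per_group = (n_channels + n_groups - 1) / n_groups;   // ceil division
    const size_t chan_sz   = (size_t)rows * cols;

    for (int g = 0; g < n_groups; g++) {
        const int start = g * per_group;
        const int end   = start + per_group > n_channels ? n_channels : start + per_group;
        const size_t n  = (size_t)(end - start) * chan_sz;
        if (n == 0) continue;

        float * grp = x + (size_t)start * chan_sz;

        double sum = 0.0;                                  // accumulate in double, like ggml_float
        for (size_t i = 0; i < n; i++) sum += grp[i];
        const float mean = (float)(sum / n);

        double sum2 = 0.0;
        for (size_t i = 0; i < n; i++) {
            const float v = grp[i] - mean;
            grp[i] = v;                                    // center in place
            sum2 += (double)v * v;
        }
        const float scale = 1.0f / sqrtf((float)(sum2 / n) + eps);

        for (size_t i = 0; i < n; i++) grp[i] *= scale;    // normalize the whole group
    }
}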
WSP_GGML_ASSERT(ne2 == ne12); - WSP_GGML_ASSERT(ne3 == ne13); + const enum ggml_type type = src0->type; + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; - // TODO: we don't support permuted src0 - WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t)); + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted - WSP_GGML_ASSERT(nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb0 <= nb1); - WSP_GGML_ASSERT(nb1 <= nb2); - WSP_GGML_ASSERT(nb2 <= nb3); + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); - WSP_GGML_ASSERT(ne0 == ne01); - WSP_GGML_ASSERT(ne1 == ne11); - WSP_GGML_ASSERT(ne2 == ne02); - WSP_GGML_ASSERT(ne3 == ne03); + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(WSP_GGML_USE_CLBLAST) - if (wsp_ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == WSP_GGML_TASK_COMPUTE) { - wsp_ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); +#if defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(src0, src1, dst)) { + // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension + // ref: https://github.com/ggerganov/ggml/pull/224 + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); } return; } #endif -#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) - if (wsp_ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - WSP_GGML_ASSERT(nb10 == sizeof(float)); - +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; } - if (params->type == WSP_GGML_TASK_INIT) { + if (params->type == GGML_TASK_INIT) { return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - float * const wdata = params->wdata; - { + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + // broadcast src0 into src1 across 2nd,3rd dimension + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + float * const wdata = params->wdata; + ggml_to_float_t const to_float = type_traits[type].to_float; + size_t id = 0; for (int64_t i01 = 0; i01 < ne01; ++i01) { - for (int64_t i00 = 0; i00 < ne00; ++i00) { - wdata[id++] = WSP_GGML_FP16_TO_FP32(*(wsp_ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - } + to_float((const char *) x + i01*nb01, wdata + id, ne00); + id += ne00; } assert(id*sizeof(float) <= params->wsize); + x = wdata; } - const float * x = wdata; - 
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, @@ -10805,343 +11324,164 @@ static void wsp_ggml_compute_forward_mul_mat_f16_f32( } } - /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (wsp_ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ + //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; } #endif - if (params->type == WSP_GGML_TASK_INIT) { - wsp_ggml_fp16_t * const wdata = params->wdata; + if (params->type == GGML_TASK_INIT) { + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); - size_t id = 0; - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - for (int64_t i10 = 0; i10 < ne10; ++i10) { - wdata[id++] = WSP_GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; } } } } - WSP_GGML_ASSERT(id*sizeof(wsp_ggml_fp16_t) <= params->wsize); - return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } - // fp16 -> half the size, so divide by 2 - // TODO: do not support transposed src1 - assert(nb10/2 == sizeof(wsp_ggml_fp16_t)); + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); - // parallelize by src0 rows using wsp_ggml_vec_dot_f16 + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne11*ne12*ne13; // src1 rows - // total rows in src0 - const int nr = ne01*ne02*ne03; + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + // distribute the thread work across the inner or outer loop based on which one is larger - wsp_ggml_fp16_t * wdata = params->wdata; + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 
1 : nth; // parallelize by src1 rows - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; - const int i13 = i03; - const int i12 = i02; + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); - wsp_ggml_fp16_t * src0_row = (wsp_ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - wsp_ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); - for (int64_t ic = 0; ic < ne11; ++ic) { - wsp_ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); - } + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + return; } - //int64_t t1 = wsp_ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); -static void wsp_ggml_compute_forward_mul_mat_q_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - int64_t t0 = wsp_ggml_perf_time_us(); - UNUSED(t0); + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; - const int ith = params->ith; - const int nth = params->nth; + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*ne11)); + const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11; + const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11); - WSP_GGML_ASSERT(ne02 == ne12); - WSP_GGML_ASSERT(ne03 == ne13); - WSP_GGML_ASSERT(ne2 == ne12); - WSP_GGML_ASSERT(ne3 == ne13); + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; - const enum wsp_ggml_type type = src0->type; - quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot; - vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; - enum wsp_ggml_type const vec_dot_type = quantize_fns[type].vec_dot_type; + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; - // we don't support permuted src0 or src1 - WSP_GGML_ASSERT(nb00 == WSP_GGML_TYPE_SIZE[type]); - WSP_GGML_ASSERT(nb10 == sizeof(float)); + const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + 
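// Illustrative sketch, not part of the upstream diff: the rewritten mul_mat above assigns
// the thread pool to either the src0-row loop (nr0) or the src1-row loop (nr1), whichever
// is larger, by factoring the flat thread index into a 2-D coordinate. The same index
// arithmetic in isolation (toy_split is a placeholder name):

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static void toy_split(int ith, int nth, int64_t nr0, int64_t nr1) {
    const int64_t nth0 = nr0 > nr1 ? nth : 1;   // threads given to the src0-row loop
    const int64_t nth1 = nr0 > nr1 ? 1 : nth;   // threads given to the src1-row loop

    const int64_t ith0 = ith % nth0;            // this thread's coordinate in each loop
    const int64_t ith1 = ith / nth0;

    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;  // rows per thread, rounded up
    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;

    const int64_t ir010 = dr0*ith0, ir011 = MIN(ir010 + dr0, nr0);  // src0 rows [ir010, ir011)
    const int64_t ir110 = dr1*ith1, ir111 = MIN(ir110 + dr1, nr1);  // src1 rows [ir110, ir111)

    printf("thread %d/%d: src0 rows [%lld,%lld), src1 rows [%lld,%lld)\n",
           ith, nth, (long long) ir010, (long long) ir011,
                     (long long) ir110, (long long) ir111);
}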
i03*nb03); - // dst cannot be transposed or permuted - WSP_GGML_ASSERT(nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb0 <= nb1); - WSP_GGML_ASSERT(nb1 <= nb2); - WSP_GGML_ASSERT(nb2 <= nb3); + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); - WSP_GGML_ASSERT(ne0 == ne01); - WSP_GGML_ASSERT(ne1 == ne11); - WSP_GGML_ASSERT(ne2 == ne02); - WSP_GGML_ASSERT(ne3 == ne03); + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} -#if defined(WSP_GGML_USE_CLBLAST) - if (wsp_ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == WSP_GGML_TASK_COMPUTE) { - wsp_ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } } - return; } -#endif +} -#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) - if (wsp_ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == WSP_GGML_TASK_INIT) { - return; - } - - if (params->type == WSP_GGML_TASK_FINALIZE) { - return; - } - - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); - id += ne00; - } - - assert(id*sizeof(float) <= params->wsize); - } - - const float * x = wdata; - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (wsp_ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif - - if (params->type == WSP_GGML_TASK_INIT) { - char * wdata = params->wdata; - const size_t row_size = ne10*WSP_GGML_TYPE_SIZE[vec_dot_type]/WSP_GGML_BLCK_SIZE[vec_dot_type]; - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += row_size; - } - } - } - - return; - } - - if (params->type == WSP_GGML_TASK_FINALIZE) { - return; - } - - // parallelize by src0 rows using wsp_ggml_vec_dot_q - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row 
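// Illustrative sketch, not part of the upstream diff: the unified mul_mat above broadcasts
// src0 into src1 across the 2nd and 3rd dimensions through the integer ratios
// r2 = ne12/ne02 and r3 = ne13/ne03 (both divisions are asserted to be exact), so several
// src1 batches can reuse a single src0 slice. The index mapping on its own
// (toy_broadcast_index is a placeholder name):

#include <stdint.h>

static void toy_broadcast_index(int64_t i12, int64_t i13,
                                int64_t ne02, int64_t ne12,
                                int64_t ne03, int64_t ne13,
                                int64_t * i02, int64_t * i03) {
    const int64_t r2 = ne12 / ne02;   // src1 slices per src0 slice along dim 2
    const int64_t r3 = ne13 / ne03;   // src1 slices per src0 slice along dim 3
    *i02 = i12 / r2;                  // src0 slice used for src1 slice i12
    *i03 = i13 / r3;
}

// e.g. with ne02 == 1 and ne12 == 8, every src1 slice maps back to src0 slice 0.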
range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - void * wdata = params->wdata; - const size_t row_size = ne00*WSP_GGML_TYPE_SIZE[vec_dot_type]/WSP_GGML_BLCK_SIZE[vec_dot_type]; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); +// ggml_compute_forward_out_prod - const int i13 = i03; - const int i12 = i02; - - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; - - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size)); - - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - - assert(ne00 % 32 == 0); - - for (int64_t ic = 0; ic < ne11; ++ic) { - vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); - } - } - - //int64_t t1 = wsp_ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} - -static void wsp_ggml_compute_forward_mul_mat( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - switch (src0->type) { - case WSP_GGML_TYPE_Q4_0: - case WSP_GGML_TYPE_Q4_1: - case WSP_GGML_TYPE_Q5_0: - case WSP_GGML_TYPE_Q5_1: - case WSP_GGML_TYPE_Q8_0: - case WSP_GGML_TYPE_Q8_1: - case WSP_GGML_TYPE_Q2_K: - case WSP_GGML_TYPE_Q3_K: - case WSP_GGML_TYPE_Q4_K: - case WSP_GGML_TYPE_Q5_K: - case WSP_GGML_TYPE_Q6_K: - { - wsp_ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); - } break; - case WSP_GGML_TYPE_F16: - { - wsp_ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); - } break; - case WSP_GGML_TYPE_F32: - { - wsp_ggml_compute_forward_mul_mat_f32(params, src0, src1, dst); - } break; - default: - { - WSP_GGML_ASSERT(false); - } break; - } -} - -// wsp_ggml_compute_forward_out_prod - - -static void wsp_ggml_compute_forward_out_prod_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - int64_t t0 = wsp_ggml_perf_time_us(); +static void ggml_compute_forward_out_prod_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; - WSP_GGML_ASSERT(ne02 == ne12); - WSP_GGML_ASSERT(ne03 == ne13); - WSP_GGML_ASSERT(ne2 == ne12); - WSP_GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); // we don't support permuted src0 or src1 - WSP_GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); // dst cannot be transposed or permuted - WSP_GGML_ASSERT(nb0 == sizeof(float)); - // WSP_GGML_ASSERT(nb0 <= 
nb1); - // WSP_GGML_ASSERT(nb1 <= nb2); - // WSP_GGML_ASSERT(nb2 <= nb3); + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); - WSP_GGML_ASSERT(ne0 == ne00); - WSP_GGML_ASSERT(ne1 == ne10); - WSP_GGML_ASSERT(ne2 == ne02); - WSP_GGML_ASSERT(ne3 == ne03); + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(WSP_GGML_USE_CUBLAS) wsp_ggml_cuda_out_prod - // TODO: #if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) || defined(WSP_GGML_USE_CLBLAST) + // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) - if (params->type == WSP_GGML_TASK_INIT) { - wsp_ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + if (params->type == GGML_TASK_INIT) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } @@ -11184,14 +11524,14 @@ static void wsp_ggml_compute_forward_out_prod_f32( float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); - wsp_ggml_vec_mad_f32(ne0, d, s0, *s1); + ggml_vec_mad_f32(ne0, d, s0, *s1); // for (int64_t i0 = 0; i0 < ne0; ++i0) { // d[i0] += s0[i0] * s1[i1]; // } } } - //int64_t t1 = wsp_ggml_perf_time_us(); + //int64_t t1 = ggml_perf_time_us(); //static int64_t acc = 0; //acc += t1 - t0; //if (t1 - t0 > 10) { @@ -11205,51 +11545,51 @@ static void wsp_ggml_compute_forward_out_prod_f32( //} } -static void wsp_ggml_compute_forward_out_prod( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_out_prod( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_Q4_0: - case WSP_GGML_TYPE_Q4_1: - case WSP_GGML_TYPE_Q5_0: - case WSP_GGML_TYPE_Q5_1: - case WSP_GGML_TYPE_Q8_0: - case WSP_GGML_TYPE_Q8_1: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: { - WSP_GGML_ASSERT(false); // todo - // wsp_ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - WSP_GGML_ASSERT(false); // todo - // wsp_ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); + GGML_ASSERT(false); // todo + // ggml_compute_forward_out_prod_f16_f32(params, src0, src1, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_out_prod_f32(params, src0, src1, dst); + ggml_compute_forward_out_prod_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_scale +// ggml_compute_forward_scale -static void wsp_ggml_compute_forward_scale_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - 
WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); - WSP_GGML_ASSERT(wsp_ggml_is_scalar(src1)); +static void ggml_compute_forward_scale_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -11260,7 +11600,7 @@ static void wsp_ggml_compute_forward_scale_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -11279,82 +11619,78 @@ static void wsp_ggml_compute_forward_scale_f32( // src0 is same shape as dst => same indices memcpy((char *)dst->data + i1*nb1, (char *)src0->data + i1*nb01, nc * sizeof(float)); } - wsp_ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); + ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*nb1), v); } } -static void wsp_ggml_compute_forward_scale( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_scale( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_scale_f32(params, src0, src1, dst); + ggml_compute_forward_scale_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_set +// ggml_compute_forward_set -static void wsp_ggml_compute_forward_set_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst) && wsp_ggml_is_contiguous(src0)); - - WSP_GGML_ASSERT(opt0->type == WSP_GGML_TYPE_I32); - WSP_GGML_ASSERT(wsp_ggml_nelements(opt0) == 5); +static void ggml_compute_forward_set_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); // view src0 and dst with these strides and data offset inbytes during set // nb0 is implicitely element_size because src0 and dst are contiguous - size_t nb1 = ((int32_t *) opt0->data)[0]; - size_t nb2 = ((int32_t *) opt0->data)[1]; - size_t nb3 = ((int32_t *) opt0->data)[2]; - size_t offset = ((int32_t *) opt0->data)[3]; - bool inplace = (bool) ((int32_t *) opt0->data)[4]; + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - if (!inplace && (params->type == WSP_GGML_TASK_INIT)) { + if 
(!inplace && (params->type == GGML_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase memcpy( ((char *) dst->data), ((char *) src0->data), - wsp_ggml_nbytes(dst)); + ggml_nbytes(dst)); } - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(src1); + const int nr = ggml_nrows(src1); const int nc = src1->ne[0]; - WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); // src0 and dst as viewed during set - const size_t nb0 = wsp_ggml_element_size(src0); + const size_t nb0 = ggml_element_size(src0); const int im0 = (ne10 == 0 ? 0 : ne10-1); const int im1 = (ne11 == 0 ? 0 : ne11-1); const int im2 = (ne12 == 0 ? 0 : ne12-1); const int im3 = (ne13 == 0 ? 0 : ne13-1); - WSP_GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= wsp_ggml_nbytes(dst)); + GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst)); - WSP_GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -11370,124 +11706,123 @@ static void wsp_ggml_compute_forward_set_f32( const int i2 = (ir - i3*ne12*ne11)/ne11; const int i1 = (ir - i3*ne12*ne11 - i2*ne11); - wsp_ggml_vec_cpy_f32(nc, + ggml_vec_cpy_f32(nc, (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); } } -static void wsp_ggml_compute_forward_set( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_set( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: - { - wsp_ggml_compute_forward_set_f32(params, src0, src1, opt0, dst); - } break; - case WSP_GGML_TYPE_F16: - case WSP_GGML_TYPE_Q4_0: - case WSP_GGML_TYPE_Q4_1: - case WSP_GGML_TYPE_Q5_0: - case WSP_GGML_TYPE_Q5_1: - case WSP_GGML_TYPE_Q8_0: - case WSP_GGML_TYPE_Q8_1: - case WSP_GGML_TYPE_Q2_K: - case WSP_GGML_TYPE_Q3_K: - case WSP_GGML_TYPE_Q4_K: - case WSP_GGML_TYPE_Q5_K: - case WSP_GGML_TYPE_Q6_K: + case GGML_TYPE_F32: + { + ggml_compute_forward_set_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_cpy +// ggml_compute_forward_cpy -static void wsp_ggml_compute_forward_cpy( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - wsp_ggml_compute_forward_dup(params, src0, dst); +static void ggml_compute_forward_cpy( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + ggml_compute_forward_dup(params, src0, dst); } -// 
wsp_ggml_compute_forward_cont +// ggml_compute_forward_cont -static void wsp_ggml_compute_forward_cont( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - wsp_ggml_compute_forward_dup(params, src0, dst); +static void ggml_compute_forward_cont( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + ggml_compute_forward_dup(params, src0, dst); } -// wsp_ggml_compute_forward_reshape +// ggml_compute_forward_reshape -static void wsp_ggml_compute_forward_reshape( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_reshape( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { // NOP UNUSED(params); UNUSED(src0); UNUSED(dst); } -// wsp_ggml_compute_forward_view +// ggml_compute_forward_view -static void wsp_ggml_compute_forward_view( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0) { +static void ggml_compute_forward_view( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { // NOP UNUSED(params); UNUSED(src0); } -// wsp_ggml_compute_forward_permute +// ggml_compute_forward_permute -static void wsp_ggml_compute_forward_permute( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0) { +static void ggml_compute_forward_permute( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { // NOP UNUSED(params); UNUSED(src0); } -// wsp_ggml_compute_forward_transpose +// ggml_compute_forward_transpose -static void wsp_ggml_compute_forward_transpose( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0) { +static void ggml_compute_forward_transpose( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0) { // NOP UNUSED(params); UNUSED(src0); } -// wsp_ggml_compute_forward_get_rows +// ggml_compute_forward_get_rows -static void wsp_ggml_compute_forward_get_rows_q( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_get_rows_q( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { assert(params->ith == 0); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int nc = src0->ne[0]; - const int nr = wsp_ggml_nelements(src1); - const enum wsp_ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + const int nr = ggml_nelements(src1); + const enum ggml_type type = src0->type; + ggml_to_float_t const dequantize_row_q = type_traits[type].to_float; assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); - assert(src0->nb[0] == WSP_GGML_TYPE_SIZE[type]); + assert(src0->nb[0] == ggml_type_size(type)); for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; @@ -11498,47 +11833,47 @@ static void wsp_ggml_compute_forward_get_rows_q( } } -static void wsp_ggml_compute_forward_get_rows_f16( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct 
wsp_ggml_tensor * dst) { +static void ggml_compute_forward_get_rows_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { assert(params->ith == 0); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int nc = src0->ne[0]; - const int nr = wsp_ggml_nelements(src1); + const int nr = ggml_nelements(src1); assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); - assert(src0->nb[0] == sizeof(wsp_ggml_fp16_t)); + assert(src0->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; for (int j = 0; j < nc; ++j) { - wsp_ggml_fp16_t v = ((wsp_ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = WSP_GGML_FP16_TO_FP32(v); + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v); } } } -static void wsp_ggml_compute_forward_get_rows_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_get_rows_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { assert(params->ith == 0); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int nc = src0->ne[0]; - const int nr = wsp_ggml_nelements(src1); + const int nr = ggml_nelements(src1); assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); @@ -11547,43 +11882,43 @@ static void wsp_ggml_compute_forward_get_rows_f32( for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; - wsp_ggml_vec_cpy_f32(nc, + ggml_vec_cpy_f32(nc, (float *) ((char *) dst->data + i*dst->nb[1]), (float *) ((char *) src0->data + r*src0->nb[1])); } } -static void wsp_ggml_compute_forward_get_rows( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_get_rows( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_Q4_0: - case WSP_GGML_TYPE_Q4_1: - case WSP_GGML_TYPE_Q5_0: - case WSP_GGML_TYPE_Q5_1: - case WSP_GGML_TYPE_Q8_0: - case WSP_GGML_TYPE_Q8_1: - case WSP_GGML_TYPE_Q2_K: - case WSP_GGML_TYPE_Q3_K: - case WSP_GGML_TYPE_Q4_K: - case WSP_GGML_TYPE_Q5_K: - case WSP_GGML_TYPE_Q6_K: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: { - wsp_ggml_compute_forward_get_rows_q(params, src0, src1, dst); + ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_get_rows_f16(params, src0, src1, dst); + ggml_compute_forward_get_rows_f16(params, src0, src1, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - 
wsp_ggml_compute_forward_get_rows_f32(params, src0, src1, dst); + ggml_compute_forward_get_rows_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } @@ -11606,72 +11941,72 @@ static void wsp_ggml_compute_forward_get_rows( //} } -// wsp_ggml_compute_forward_get_rows_back +// ggml_compute_forward_get_rows_back -static void wsp_ggml_compute_forward_get_rows_back_f32_f16( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(params->ith == 0); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(opt0, dst)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(opt0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst)); +static void ggml_compute_forward_get_rows_back_f32_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(opt0, dst)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_is_contiguous(dst)); - wsp_ggml_compute_forward_dup_same_cont(params, opt0, dst); + ggml_compute_forward_dup_same_cont(params, opt0, dst); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int nc = src0->ne[0]; - const int nr = wsp_ggml_nelements(src1); + const int nr = ggml_nelements(src1); - WSP_GGML_ASSERT( dst->ne[0] == nc); - WSP_GGML_ASSERT(src0->nb[0] == sizeof(wsp_ggml_fp16_t)); + GGML_ASSERT( dst->ne[0] == nc); + GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; for (int j = 0; j < nc; ++j) { - wsp_ggml_fp16_t v = ((wsp_ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += WSP_GGML_FP16_TO_FP32(v); + ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); } } } -static void wsp_ggml_compute_forward_get_rows_back_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(params->ith == 0); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(opt0, dst)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(opt0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst)); +static void ggml_compute_forward_get_rows_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(opt0, dst)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_is_contiguous(dst)); - // wsp_ggml_compute_forward_dup_same_cont(params, opt0, dst); + // ggml_compute_forward_dup_same_cont(params, opt0, dst); - if (params->type == WSP_GGML_TASK_INIT) { - memset(dst->data, 0, wsp_ggml_nbytes(dst)); + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); } - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } const int 
nc = src0->ne[0]; - const int nr = wsp_ggml_nelements(src1); + const int nr = ggml_nelements(src1); - WSP_GGML_ASSERT( dst->ne[0] == nc); - WSP_GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT( dst->ne[0] == nc); + GGML_ASSERT(src0->nb[0] == sizeof(float)); for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; - wsp_ggml_vec_add_f32(nc, + ggml_vec_add_f32(nc, (float *) ((char *) dst->data + r*dst->nb[1]), (float *) ((char *) dst->data + r*dst->nb[1]), (float *) ((char *) src0->data + i*src0->nb[1])); @@ -11679,24 +12014,24 @@ static void wsp_ggml_compute_forward_get_rows_back_f32( } -static void wsp_ggml_compute_forward_get_rows_back( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_get_rows_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, opt0, dst); + ggml_compute_forward_get_rows_back_f32_f16(params, src0, src1, opt0, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_get_rows_back_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_get_rows_back_f32(params, src0, src1, opt0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } @@ -11719,30 +12054,30 @@ static void wsp_ggml_compute_forward_get_rows_back( //} } -// wsp_ggml_compute_forward_diag +// ggml_compute_forward_diag -static void wsp_ggml_compute_forward_diag_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(params->ith == 0); +static void ggml_compute_forward_diag_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } // TODO: handle transposed/permuted matrices - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; - WSP_GGML_ASSERT(ne00 == ne0); - WSP_GGML_ASSERT(ne00 == ne1); - WSP_GGML_ASSERT(ne01 == 1); - WSP_GGML_ASSERT(ne02 == ne2); - WSP_GGML_ASSERT(ne03 == ne3); + GGML_ASSERT(ne00 == ne0); + GGML_ASSERT(ne00 == ne1); + GGML_ASSERT(ne01 == 1); + GGML_ASSERT(ne02 == ne2); + GGML_ASSERT(ne03 == ne3); - WSP_GGML_ASSERT(nb00 == sizeof(float)); - WSP_GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); for (int i3 = 0; i3 < ne3; i3++) { for (int i2 = 0; i2 < ne2; i2++) { @@ -11761,65 +12096,62 @@ static void wsp_ggml_compute_forward_diag_f32( } } -static void wsp_ggml_compute_forward_diag( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_diag( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_diag_f32(params, src0, dst); + ggml_compute_forward_diag_f32(params, src0, dst); } break; 
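For orientation while reading the get_rows_back hunks above: the kernel reduces to a row-wise scatter-add, where source row i is accumulated into the destination row named by src1[i]. The following is a minimal standalone sketch of that accumulation only; it is illustrative, not part of the patch, and uses plain float buffers and an invented helper name in place of ggml tensors.

#include <stdio.h>

/* scatter-add: for each source row i, dst[rows[i]] += src[i] */
static void get_rows_back_f32_sketch(const float *src, const int *rows,
                                     float *dst, int nr, int nc) {
    for (int i = 0; i < nr; ++i) {
        const int r = rows[i];
        for (int j = 0; j < nc; ++j) {
            dst[r*nc + j] += src[i*nc + j];
        }
    }
}

int main(void) {
    const float src[2][3] = { {1, 2, 3}, {4, 5, 6} };
    const int   rows[2]   = { 1, 1 };   /* both source rows accumulate into dst row 1 */
    float dst[2][3] = { {0} };
    get_rows_back_f32_sketch(&src[0][0], rows, &dst[0][0], 2, 3);
    printf("%.0f %.0f %.0f\n", dst[1][0], dst[1][1], dst[1][2]);  /* prints: 5 7 9 */
    return 0;
}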
default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_diag_mask_inf +// ggml_compute_forward_diag_mask_inf -static void wsp_ggml_compute_forward_diag_mask_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst, +static void ggml_compute_forward_diag_mask_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, const float value) { - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_I32); - WSP_GGML_ASSERT(wsp_ggml_nelements(src1) == 2); const int ith = params->ith; const int nth = params->nth; - const int n_past = ((int32_t *) src1->data)[0]; - const bool inplace = (bool)((int32_t *) src1->data)[1]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const bool inplace = src0->data == dst->data; - WSP_GGML_ASSERT(n_past >= 0); + GGML_ASSERT(n_past >= 0); - if (!inplace && (params->type == WSP_GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. // => do it in INIT phase - WSP_GGML_ASSERT(wsp_ggml_nelements(dst) == wsp_ggml_nelements(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst) && wsp_ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); memcpy( ((char *) dst->data), ((char *) src0->data), - wsp_ggml_nbytes(dst)); + ggml_nbytes(dst)); } - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } // TODO: handle transposed/permuted matrices - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; const int nr = src0->ne[1]; const int nz = n/nr; - WSP_GGML_ASSERT( dst->nb[0] == sizeof(float)); - WSP_GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); for (int k = 0; k < nz; k++) { for (int j = ith; j < nr; j += nth) { @@ -11832,51 +12164,49 @@ static void wsp_ggml_compute_forward_diag_mask_f32( } } -static void wsp_ggml_compute_forward_diag_mask_inf( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_diag_mask_inf( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY); + ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -static void wsp_ggml_compute_forward_diag_mask_zero( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_diag_mask_zero( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0); + ggml_compute_forward_diag_mask_f32(params, src0, dst, 0); } break; default: { - 
WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_soft_max +// ggml_compute_forward_soft_max -static void wsp_ggml_compute_forward_soft_max_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_soft_max_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -11886,7 +12216,7 @@ static void wsp_ggml_compute_forward_soft_max_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -11907,9 +12237,9 @@ static void wsp_ggml_compute_forward_soft_max_f32( #endif float max = -INFINITY; - wsp_ggml_vec_max_f32(nc, &max, sp); + ggml_vec_max_f32(nc, &max, sp); - wsp_ggml_float sum = 0.0; + ggml_float sum = 0.0; uint16_t scvt; for (int i = 0; i < nc; i++) { @@ -11917,10 +12247,10 @@ static void wsp_ggml_compute_forward_soft_max_f32( dp[i] = 0.0f; } else { // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max); - wsp_ggml_fp16_t s = WSP_GGML_FP32_TO_FP16(sp[i] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(sp[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = WSP_GGML_FP16_TO_FP32(table_exp_f16[scvt]); - sum += (wsp_ggml_float)val; + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + sum += (ggml_float)val; dp[i] = val; } } @@ -11928,7 +12258,7 @@ static void wsp_ggml_compute_forward_soft_max_f32( assert(sum > 0.0); sum = 1.0/sum; - wsp_ggml_vec_scale_f32(nc, dp, sum); + ggml_vec_scale_f32(nc, dp, sum); #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -11939,36 +12269,36 @@ static void wsp_ggml_compute_forward_soft_max_f32( } } -static void wsp_ggml_compute_forward_soft_max( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_soft_max( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_soft_max_f32(params, src0, dst); + ggml_compute_forward_soft_max_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_soft_max_back +// ggml_compute_forward_soft_max_back -static void wsp_ggml_compute_forward_soft_max_back_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src1)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src1, dst)); +static void ggml_compute_forward_soft_max_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + 
const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_are_same_shape(src1, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -11978,7 +12308,7 @@ static void wsp_ggml_compute_forward_soft_max_back_f32( const int nth = params->nth; const int nc = src0->ne[0]; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -12005,6 +12335,7 @@ static void wsp_ggml_compute_forward_soft_max_back_f32( // dx = J * dy // dxk = sum_i(Jki * dyi) // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk // dxk = sum_i(-yk*yi * dyi) + yk*dyk // dxk = -yk * sum_i(yi * dyi) + yk*dyk // dxk = -yk * dot(y, dy) + yk*dyk @@ -12019,10 +12350,10 @@ static void wsp_ggml_compute_forward_soft_max_back_f32( // linear runtime, no additional memory float dot_y_dy = 0; - wsp_ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); - wsp_ggml_vec_cpy_f32 (nc, dx, dy); - wsp_ggml_vec_acc1_f32(nc, dx, -dot_y_dy); - wsp_ggml_vec_mul_f32 (nc, dx, dx, y); + ggml_vec_dot_f32 (nc, &dot_y_dy, y, dy); + ggml_vec_cpy_f32 (nc, dx, dy); + ggml_vec_acc1_f32(nc, dx, -dot_y_dy); + ggml_vec_mul_f32 (nc, dx, dx, y); #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -12033,51 +12364,48 @@ static void wsp_ggml_compute_forward_soft_max_back_f32( } } -static void wsp_ggml_compute_forward_soft_max_back( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_soft_max_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); + ggml_compute_forward_soft_max_back_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_alibi +// ggml_compute_forward_alibi -static void wsp_ggml_compute_forward_alibi_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_alibi_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_I32); - WSP_GGML_ASSERT(wsp_ggml_nelements(src1) == 3); - - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; - const float max_bias = ((float *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); assert(n_past >= 0); const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = 
src0->ne[1]; // seq_len_without_past - //const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int ne2_ne3 = n/ne1; // ne2*ne3 const int nb0 = src0->nb[0]; @@ -12085,8 +12413,9 @@ static void wsp_ggml_compute_forward_alibi_f32( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(float)); - assert(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(ne1 + n_past == ne0); + GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -12110,39 +12439,36 @@ static void wsp_ggml_compute_forward_alibi_f32( m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); } - pdst[0] = (i-ne0+1) * m_k + src[0]; + pdst[0] = i * m_k + src[0]; } } } } -static void wsp_ggml_compute_forward_alibi_f16( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_alibi_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_I32); - WSP_GGML_ASSERT(wsp_ggml_nelements(src1) == 3); - - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; - const float max_bias = ((float *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); assert(n_past >= 0); const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int ne1 = src0->ne[1]; // seq_len_without_past - //const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne2 = src0->ne[2]; // n_head -> this is k //const int ne3 = src0->ne[3]; // 1 -> bsz - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int ne2_ne3 = n/ne1; // ne2*ne3 const int nb0 = src0->nb[0]; @@ -12150,8 +12476,9 @@ static void wsp_ggml_compute_forward_alibi_f16( const int nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; - assert(nb0 == sizeof(wsp_ggml_fp16_t)); - assert(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(ne1 + n_past == ne0); (void) n_past; + GGML_ASSERT(n_head == ne2); // add alibi to src0 (KQ_scaled) const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -12162,7 +12489,7 @@ static void wsp_ggml_compute_forward_alibi_f16( for (int i = 0; i < ne0; i++) { for (int j = 0; j < ne1; j++) { for (int k = 0; k < ne2_ne3; k++) { - wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); // TODO: k*nb2 or k*nb3 @@ -12176,72 +12503,68 @@ static void wsp_ggml_compute_forward_alibi_f16( } // we return F32 - pdst[0] = (i-ne0+1) * m_k + WSP_GGML_FP16_TO_FP32(src[0]); + pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]); } } } } -static void wsp_ggml_compute_forward_alibi( - const struct 
wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_alibi( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_alibi_f16(params, src0, src1, dst); + ggml_compute_forward_alibi_f16(params, src0, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_alibi_f32(params, src0, src1, dst); + ggml_compute_forward_alibi_f32(params, src0, dst); } break; - case WSP_GGML_TYPE_Q4_0: - case WSP_GGML_TYPE_Q4_1: - case WSP_GGML_TYPE_Q5_0: - case WSP_GGML_TYPE_Q5_1: - case WSP_GGML_TYPE_Q8_0: - case WSP_GGML_TYPE_Q8_1: - case WSP_GGML_TYPE_Q2_K: - case WSP_GGML_TYPE_Q3_K: - case WSP_GGML_TYPE_Q4_K: - case WSP_GGML_TYPE_Q5_K: - case WSP_GGML_TYPE_Q6_K: - case WSP_GGML_TYPE_Q8_K: - case WSP_GGML_TYPE_I8: - case WSP_GGML_TYPE_I16: - case WSP_GGML_TYPE_I32: - case WSP_GGML_TYPE_COUNT: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } +// ggml_compute_forward_clamp -// wsp_ggml_compute_forward_clamp - -static void wsp_ggml_compute_forward_clamp_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_clamp_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { assert(params->ith == 0); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT(wsp_ggml_nelements(src1) == 2); - - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const float min = ((float *) src1->data)[0]; - const float max = ((float *) src1->data)[1]; + float min; + float max; + memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); const int ith = params->ith; const int nth = params->nth; - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; const size_t nb00 = src0->nb[0]; @@ -12250,8 +12573,8 @@ static void wsp_ggml_compute_forward_clamp_f32( const size_t nb0 = dst->nb[0]; const size_t nb1 = dst->nb[1]; - WSP_GGML_ASSERT( nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); for (int j = ith; j < n; j += nth) { float * dst_ptr = (float *) ((char *) dst->data + j*nb1); @@ -12263,74 +12586,81 @@ static void wsp_ggml_compute_forward_clamp_f32( } } -static void wsp_ggml_compute_forward_clamp( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_clamp( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case 
WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_clamp_f32(params, src0, src1, dst); + ggml_compute_forward_clamp_f32(params, src0, dst); } break; - case WSP_GGML_TYPE_F16: - case WSP_GGML_TYPE_Q4_0: - case WSP_GGML_TYPE_Q4_1: - case WSP_GGML_TYPE_Q5_0: - case WSP_GGML_TYPE_Q5_1: - case WSP_GGML_TYPE_Q8_0: - case WSP_GGML_TYPE_Q8_1: - case WSP_GGML_TYPE_Q2_K: - case WSP_GGML_TYPE_Q3_K: - case WSP_GGML_TYPE_Q4_K: - case WSP_GGML_TYPE_Q5_K: - case WSP_GGML_TYPE_Q6_K: - case WSP_GGML_TYPE_Q8_K: - case WSP_GGML_TYPE_I8: - case WSP_GGML_TYPE_I16: - case WSP_GGML_TYPE_I32: - case WSP_GGML_TYPE_COUNT: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_rope +// ggml_compute_forward_rope -static void wsp_ggml_compute_forward_rope_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_I32); - WSP_GGML_ASSERT(wsp_ggml_nelements(src1) == 4); +static void ggml_compute_forward_rope_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; + float freq_base; + float freq_scale; + + // these two only relevant for xPos RoPE: + float xpos_base; + bool xpos_down; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); assert(n_past >= 0); - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - WSP_GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(dst); + const int nr = ggml_nrows(dst); - WSP_GGML_ASSERT(n_dims <= ne0); - WSP_GGML_ASSERT(n_dims % 2 == 0); + GGML_ASSERT(n_dims <= ne0); + GGML_ASSERT(n_dims % 2 == 0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -12342,7 +12672,7 @@ static void wsp_ggml_compute_forward_rope_f32( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12354,7 +12684,7 @@ static void 
wsp_ggml_compute_forward_rope_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (is_glm) { theta = MIN(p, n_ctx - 2); @@ -12385,6 +12715,9 @@ static void wsp_ggml_compute_forward_rope_f32( for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); + // zeta scaling for xPos only: + float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; theta *= theta_scale; @@ -12394,11 +12727,11 @@ static void wsp_ggml_compute_forward_rope_f32( const float x0 = src[0]; const float x1 = src[1]; - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[1] = x0*sin_theta + x1*cos_theta; + dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta; + dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; } } else { - // TODO: this is probably wrong, but I can't figure it out .. + // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { @@ -12425,39 +12758,41 @@ static void wsp_ggml_compute_forward_rope_f32( } } -static void wsp_ggml_compute_forward_rope_f16( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_I32); - WSP_GGML_ASSERT(wsp_ggml_nelements(src1) == 4); +static void ggml_compute_forward_rope_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; + float freq_base; + float freq_scale; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); assert(n_past >= 0); - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - WSP_GGML_ASSERT(nb0 == sizeof(wsp_ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(dst); + const int nr = ggml_nrows(dst); - WSP_GGML_ASSERT(n_dims <= ne0); - WSP_GGML_ASSERT(n_dims % 2 == 0); + GGML_ASSERT(n_dims <= ne0); + GGML_ASSERT(n_dims % 2 == 0); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -12469,7 +12804,7 @@ static void wsp_ggml_compute_forward_rope_f16( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; const bool is_glm = mode & 4; @@ -12481,7 +12816,7 @@ static void 
wsp_ggml_compute_forward_rope_f16( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (is_glm) { theta = MIN(p, n_ctx - 2); @@ -12495,18 +12830,18 @@ static void wsp_ggml_compute_forward_rope_f16( theta *= theta_scale; block_theta *= theta_scale; - const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = WSP_GGML_FP16_TO_FP32(src[0]); - const float x1 = WSP_GGML_FP16_TO_FP32(src[n_dims/2]); - const float x2 = WSP_GGML_FP16_TO_FP32(src[n_dims]); - const float x3 = WSP_GGML_FP16_TO_FP32(src[n_dims/2*3]); + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + const float x2 = GGML_FP16_TO_FP32(src[n_dims]); + const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]); - dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); - dst_data[n_dims] = WSP_GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta); - dst_data[n_dims/2*3] = WSP_GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta); + dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); } } if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { @@ -12515,17 +12850,17 @@ static void wsp_ggml_compute_forward_rope_f16( theta *= theta_scale; - const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = WSP_GGML_FP16_TO_FP32(src[0]); - const float x1 = WSP_GGML_FP16_TO_FP32(src[1]); + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[1]); - dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[1] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { - // TODO: this is probably wrong, but I can't figure it out .. 
+ // TODO: this might be wrong for ne0 != n_dims - need double check // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ic = 0; ic < n_dims; ic += 2) { @@ -12536,14 +12871,14 @@ static void wsp_ggml_compute_forward_rope_f16( const int64_t i0 = ib*n_dims + ic/2; - const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = WSP_GGML_FP16_TO_FP32(src[0]); - const float x1 = WSP_GGML_FP16_TO_FP32(src[n_dims/2]); + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); - dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } } @@ -12552,38 +12887,34 @@ static void wsp_ggml_compute_forward_rope_f16( } } -static void wsp_ggml_compute_forward_rope( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_rope( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_rope_f16(params, src0, src1, dst); + ggml_compute_forward_rope_f16(params, src0, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_rope_f32(params, src0, src1, dst); + ggml_compute_forward_rope_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_rope_back +// ggml_compute_forward_rope_back -static void wsp_ggml_compute_forward_rope_back_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - assert(src1->type == WSP_GGML_TYPE_I32); - assert(wsp_ggml_nelements(src1) == 3); +static void ggml_compute_forward_rope_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12591,13 +12922,25 @@ static void wsp_ggml_compute_forward_rope_back_f32( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + float freq_base; + float freq_scale; + + // these two only relevant for xPos RoPE: + float xpos_base; + bool xpos_down; + + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx); + 
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool)); assert(n_past >= 0); - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); @@ -12607,7 +12950,7 @@ static void wsp_ggml_compute_forward_rope_back_f32( const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(dst); + const int nr = ggml_nrows(dst); // rows per thread const int dr = (nr + nth - 1)/nth; @@ -12619,7 +12962,7 @@ static void wsp_ggml_compute_forward_rope_back_f32( // row index used to determine which thread to use int ir = 0; - const float theta_scale = powf(10000.0, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f/n_dims); const bool is_neox = mode & 2; @@ -12630,12 +12973,15 @@ static void wsp_ggml_compute_forward_rope_back_f32( if (ir++ < ir0) continue; if (ir > ir1) break; - float theta = (float)p; + float theta = freq_scale * (float)p; if (!is_neox) { for (int64_t i0 = 0; i0 < ne0; i0 += 2) { const float cos_theta = cosf(theta); const float sin_theta = sinf(theta); + // zeta scaling for xPos only: + float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), (n_past + i2) / xpos_base) : 1.0f; + if (xpos_down) zeta = 1.0f / zeta; theta *= theta_scale; @@ -12645,8 +12991,8 @@ static void wsp_ggml_compute_forward_rope_back_f32( const float dy0 = dy[0]; const float dy1 = dy[1]; - dx[0] = dy0*cos_theta + dy1*sin_theta; - dx[1] = - dy0*sin_theta + dy1*cos_theta; + dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta; + dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta; } } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { @@ -12674,15 +13020,12 @@ static void wsp_ggml_compute_forward_rope_back_f32( } } -static void wsp_ggml_compute_forward_rope_back_f16( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - assert(src1->type == WSP_GGML_TYPE_I32); - assert(wsp_ggml_nelements(src1) == 3); +static void ggml_compute_forward_rope_back_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } @@ -12690,23 +13033,23 @@ static void wsp_ggml_compute_forward_rope_back_f16( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; assert(n_past >= 0); - WSP_GGML_TENSOR_UNARY_OP_LOCALS; + GGML_TENSOR_UNARY_OP_LOCALS; //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - assert(nb0 == sizeof(wsp_ggml_fp16_t)); + assert(nb0 == sizeof(ggml_fp16_t)); const int ith = params->ith; const int nth = params->nth; - const int nr = wsp_ggml_nrows(dst); + const int nr = ggml_nrows(dst); // rows per thread const int dr = (nr + nth - 
1)/nth; @@ -12738,14 +13081,14 @@ static void wsp_ggml_compute_forward_rope_back_f16( theta *= theta_scale; - const wsp_ggml_fp16_t * const dy = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - wsp_ggml_fp16_t * dx = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float dy0 = WSP_GGML_FP16_TO_FP32(dy[0]); - const float dy1 = WSP_GGML_FP16_TO_FP32(dy[1]); + const float dy0 = GGML_FP16_TO_FP32(dy[0]); + const float dy1 = GGML_FP16_TO_FP32(dy[1]); - dx[0] = WSP_GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); - dx[1] = WSP_GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); + dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); + dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); } } else { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { @@ -12757,14 +13100,14 @@ static void wsp_ggml_compute_forward_rope_back_f16( const int64_t i0 = ib*n_dims + ic/2; - const wsp_ggml_fp16_t * const dy = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - wsp_ggml_fp16_t * dx = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float dy0 = WSP_GGML_FP16_TO_FP32(dy[0]); - const float dy1 = WSP_GGML_FP16_TO_FP32(dy[n_dims/2]); + const float dy0 = GGML_FP16_TO_FP32(dy[0]); + const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]); - dx[0] = WSP_GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); - dx[n_dims/2] = WSP_GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); + dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta); + dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta); } } } @@ -12773,42 +13116,41 @@ static void wsp_ggml_compute_forward_rope_back_f16( } } -static void wsp_ggml_compute_forward_rope_back( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_rope_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_rope_back_f16(params, src0, src1, dst); + ggml_compute_forward_rope_back_f16(params, src0, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_rope_back_f32(params, src0, src1, dst); + ggml_compute_forward_rope_back_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_conv_1d +// ggml_compute_forward_conv_1d -static void wsp_ggml_compute_forward_conv_1d_s1_ph_f16_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32); +static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const 
struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = wsp_ggml_perf_time_us(); + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -12816,24 +13158,24 @@ static void wsp_ggml_compute_forward_conv_1d_s1_ph_f16_f32( const int nk = ne00; const int nh = nk/2; - const int ew0 = wsp_ggml_up32(ne01); + const int ew0 = ggml_up32(ne01); - WSP_GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == WSP_GGML_TASK_INIT) { + if (params->type == GGML_TASK_INIT) { // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); // prepare kernel data (src0) { - wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - wsp_ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; for (int64_t i00 = 0; i00 < ne00; i00++) { dst_data[i00*ew0 + i01] = src[i00]; } @@ -12843,13 +13185,13 @@ static void wsp_ggml_compute_forward_conv_1d_s1_ph_f16_f32( // prepare source data (src1) { - wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); - wsp_ggml_fp16_t * dst_data = wdata; + ggml_fp16_t * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = WSP_GGML_FP32_TO_FP16(src[i10]); + dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); } } } @@ -12857,7 +13199,7 @@ static void wsp_ggml_compute_forward_conv_1d_s1_ph_f16_f32( return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } @@ -12877,9 +13219,9 @@ static void wsp_ggml_compute_forward_conv_1d_s1_ph_f16_f32( dst_data[i0] = 0; for (int k = -nh; k <= nh; k++) { float v = 0.0f; - wsp_ggml_vec_dot_f16(ew0, &v, - (wsp_ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (wsp_ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + ggml_vec_dot_f16(ew0, &v, + (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); dst_data[i0] += v; } @@ -12887,19 +13229,19 @@ static void wsp_ggml_compute_forward_conv_1d_s1_ph_f16_f32( } } -static void wsp_ggml_compute_forward_conv_1d_s1_ph_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32); +static void 
ggml_compute_forward_conv_1d_s1_ph_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = wsp_ggml_perf_time_us(); + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -12907,13 +13249,13 @@ static void wsp_ggml_compute_forward_conv_1d_s1_ph_f32( const int nk = ne00; const int nh = nk/2; - const int ew0 = wsp_ggml_up32(ne01); + const int ew0 = ggml_up32(ne01); - WSP_GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - WSP_GGML_ASSERT(nb00 == sizeof(float)); - WSP_GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == WSP_GGML_TASK_INIT) { + if (params->type == GGML_TASK_INIT) { // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); @@ -12948,7 +13290,7 @@ static void wsp_ggml_compute_forward_conv_1d_s1_ph_f32( return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } @@ -12968,7 +13310,7 @@ static void wsp_ggml_compute_forward_conv_1d_s1_ph_f32( dst_data[i0] = 0; for (int k = -nh; k <= nh; k++) { float v = 0.0f; - wsp_ggml_vec_dot_f32(ew0, &v, + ggml_vec_dot_f32(ew0, &v, (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); @@ -12978,40 +13320,40 @@ static void wsp_ggml_compute_forward_conv_1d_s1_ph_f32( } } -static void wsp_ggml_compute_forward_conv_1d_s1_ph( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_conv_1d_s1_ph( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -static void wsp_ggml_compute_forward_conv_1d_s2_ph_f16_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32); +static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = wsp_ggml_perf_time_us(); + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + 
GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13019,24 +13361,24 @@ static void wsp_ggml_compute_forward_conv_1d_s2_ph_f16_f32( const int nk = ne00; const int nh = nk/2; - const int ew0 = wsp_ggml_up32(ne01); + const int ew0 = ggml_up32(ne01); - WSP_GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == WSP_GGML_TASK_INIT) { + if (params->type == GGML_TASK_INIT) { // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); // prepare kernel data (src0) { - wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = 0; i01 < ne01; i01++) { - const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - wsp_ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); + ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; for (int64_t i00 = 0; i00 < ne00; i00++) { dst_data[i00*ew0 + i01] = src[i00]; } @@ -13046,13 +13388,13 @@ static void wsp_ggml_compute_forward_conv_1d_s2_ph_f16_f32( // prepare source data (src1) { - wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + ne02*ew0*ne00; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); - wsp_ggml_fp16_t * dst_data = wdata; + ggml_fp16_t * dst_data = wdata; for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = WSP_GGML_FP32_TO_FP16(src[i10]); + dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); } } } @@ -13060,7 +13402,7 @@ static void wsp_ggml_compute_forward_conv_1d_s2_ph_f16_f32( return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } @@ -13080,9 +13422,9 @@ static void wsp_ggml_compute_forward_conv_1d_s2_ph_f16_f32( dst_data[i0/2] = 0; for (int k = -nh; k <= nh; k++) { float v = 0.0f; - wsp_ggml_vec_dot_f16(ew0, &v, - (wsp_ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (wsp_ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); + ggml_vec_dot_f16(ew0, &v, + (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, + (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); dst_data[i0/2] += v; } @@ -13090,19 +13432,19 @@ static void wsp_ggml_compute_forward_conv_1d_s2_ph_f16_f32( } } -static void wsp_ggml_compute_forward_conv_1d_s2_ph_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32); +static void ggml_compute_forward_conv_1d_s2_ph_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); 
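For orientation in the conv_1d_s1_ph / conv_1d_s2_ph hunks being renamed here: after the INIT phase has staged kernel and source rows into wdata, the hot loop is a half-padded, correlation-style 1-D pass with stride 1 or 2 per output element. Below is a minimal single-channel sketch of that inner computation under those assumptions; it is illustrative only, not part of the patch, with invented names and plain float arrays standing in for ggml tensors and the wdata staging buffers.

#include <stdio.h>

/* 1-D correlation, odd kernel, half padding (nh = nk/2), stride s in {1, 2};
 * output length is (n + s - 1) / s, out-of-range taps read as zero. */
static void conv_1d_ph_sketch(const float *x, int n,
                              const float *k, int nk,
                              float *y, int s) {
    const int nh = nk / 2;
    for (int i = 0; i < n; i += s) {
        float acc = 0.0f;
        for (int j = -nh; j <= nh; ++j) {
            const int idx = i + j;
            if (idx >= 0 && idx < n) {   /* zero padding outside the signal */
                acc += k[j + nh] * x[idx];
            }
        }
        y[i / s] = acc;
    }
}

int main(void) {
    const float x[6] = { 1, 2, 3, 4, 5, 6 };
    const float k[3] = { 1, 0, -1 };             /* simple difference kernel */
    float y[3];
    conv_1d_ph_sketch(x, 6, k, 3, y, 2);         /* stride 2, as in the s2_ph path */
    printf("%.0f %.0f %.0f\n", y[0], y[1], y[2]);  /* prints: -2 -2 -2 */
    return 0;
}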
- int64_t t0 = wsp_ggml_perf_time_us(); + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13110,13 +13452,13 @@ static void wsp_ggml_compute_forward_conv_1d_s2_ph_f32( const int nk = ne00; const int nh = nk/2; - const int ew0 = wsp_ggml_up32(ne01); + const int ew0 = ggml_up32(ne01); - WSP_GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - WSP_GGML_ASSERT(nb00 == sizeof(float)); - WSP_GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == WSP_GGML_TASK_INIT) { + if (params->type == GGML_TASK_INIT) { // TODO: fix this memset (wsize is overestimated) memset(params->wdata, 0, params->wsize); @@ -13151,7 +13493,7 @@ static void wsp_ggml_compute_forward_conv_1d_s2_ph_f32( return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } @@ -13171,7 +13513,7 @@ static void wsp_ggml_compute_forward_conv_1d_s2_ph_f32( dst_data[i0/2] = 0; for (int k = -nh; k <= nh; k++) { float v = 0.0f; - wsp_ggml_vec_dot_f32(ew0, &v, + ggml_vec_dot_f32(ew0, &v, (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); @@ -13181,64 +13523,63 @@ static void wsp_ggml_compute_forward_conv_1d_s2_ph_f32( } } -static void wsp_ggml_compute_forward_conv_1d_s2_ph( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_conv_1d_s2_ph( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_conv_1d +// ggml_compute_forward_conv_1d -static void wsp_ggml_compute_forward_conv_1d( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { - const int32_t s0 = ((const int32_t*)(opt0->data))[0]; - const int32_t p0 = ((const int32_t*)(opt0->data))[1]; - const int32_t d0 = ((const int32_t*)(opt0->data))[2]; - WSP_GGML_ASSERT(d0 == 1); // dilation not supported - WSP_GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported +static void ggml_compute_forward_conv_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; + GGML_ASSERT(d0 == 1); // dilation not supported + GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported if (s0 == 1) { - wsp_ggml_compute_forward_conv_1d_s1_ph(params, src0, 
src1, dst); + ggml_compute_forward_conv_1d_s1_ph(params, src0, src1, dst); } else if (s0 == 2) { - wsp_ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s2_ph(params, src0, src1, dst); } else { - WSP_GGML_ASSERT(false); // only stride 1 and 2 supported + GGML_ASSERT(false); // only stride 1 and 2 supported }; } -// wsp_ggml_compute_forward_conv_2d_sk_p0 +// ggml_compute_forward_conv_2d -static void wsp_ggml_compute_forward_conv_2d_sk_p0_f16_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16); - WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32); - WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32); +static void ggml_compute_forward_conv_2d_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); - int64_t t0 = wsp_ggml_perf_time_us(); + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; @@ -13249,27 +13590,38 @@ static void wsp_ggml_compute_forward_conv_2d_sk_p0_f16_f32( // size of the convolution row - the kernel size unrolled across all channels const int ew0 = nk0*nk1*ne02; - WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nb10 == sizeof(float)); + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - if (params->type == WSP_GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { memset(params->wdata, 0, params->wsize); // prepare source data (src1) { - wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; for (int i12 = 0; i12 < ne12; i12++) { const float * const src = (float *)((char *) src1->data + i12*nb12); - wsp_ggml_fp16_t * dst_data = wdata; + ggml_fp16_t * dst_data = wdata; for (int i1 = 0; i1 < ne1; i1++) { for (int i0 = 0; i0 < ne0; i0++) { for (int ik1 = 0; ik1 < nk1; ik1++) { for (int ik0 = 0; ik0 < nk0; ik0++) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - WSP_GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]); + const int idx0 = i0*s0 + ik0*d0 - p0; + const int idx1 = i1*s1 + ik1*d1 - p1; + + if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { + dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + } } } } @@ -13280,7 +13632,7 @@ static void wsp_ggml_compute_forward_conv_2d_sk_p0_f16_f32( return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } @@ -13294,236 +13646,527 @@ static void wsp_ggml_compute_forward_conv_2d_sk_p0_f16_f32( const int ip0 = dp*ith; const int ip1 = MIN(ip0 + dp, np); - wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) 
params->wdata + 0; - - for (int i2 = ip0; i2 < ip1; i2++) { - float * dst_data = (float *)((char *) dst->data + i2*nb2); + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i1 = 0; i1 < ne1; ++i1) { - for (int i0 = 0; i0 < ne0; ++i0) { - wsp_ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, - (wsp_ggml_fp16_t *) ((char *) src0->data + i2*nb03), - (wsp_ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0); + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ip0; i2 < ip1; i2++) { + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); + + for (int i1 = 0; i1 < ne1; ++i1) { + for (int i0 = 0; i0 < ne0; ++i0) { + ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, + (ggml_fp16_t *) ((char *) src0->data + i2*nb03), + (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); + } } } } } -static void wsp_ggml_compute_forward_conv_2d_sk_p0( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_conv_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - //wsp_ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst); - WSP_GGML_ASSERT(false); + //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); + GGML_ASSERT(false); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_conv_2d - -static void wsp_ggml_compute_forward_conv_2d( - const struct wsp_ggml_compute_params* params, - const struct wsp_ggml_tensor* src0, - const struct wsp_ggml_tensor* src1, - const struct wsp_ggml_tensor* opt0, - struct wsp_ggml_tensor* dst) { - const int32_t s0 = ((const int32_t*)(opt0->data))[0]; - const int32_t s1 = ((const int32_t*)(opt0->data))[1]; - const int32_t p0 = ((const int32_t*)(opt0->data))[2]; - const int32_t p1 = ((const int32_t*)(opt0->data))[3]; - const int32_t d0 = ((const int32_t*)(opt0->data))[4]; - const int32_t d1 = ((const int32_t*)(opt0->data))[5]; - WSP_GGML_ASSERT(d0 == 1); // dilation not supported - WSP_GGML_ASSERT(d1 == 1); - WSP_GGML_ASSERT(p0 == 0); // padding not supported - WSP_GGML_ASSERT(p1 == 0); - - if (s0 == src0->ne[0] && s1 == src0->ne[1]) { - wsp_ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst); - } - else { - WSP_GGML_ASSERT(false); // only stride equal to kernel size is supported - }; -} - +// ggml_compute_forward_conv_transpose_2d -// wsp_ggml_compute_forward_flash_attn +static void ggml_compute_forward_conv_transpose_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); -static void wsp_ggml_compute_forward_flash_attn_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * q, - const struct wsp_ggml_tensor * k, - const struct wsp_ggml_tensor * v, - const bool masked, - struct wsp_ggml_tensor * dst) { - int64_t t0 = wsp_ggml_perf_time_us(); + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - 
WSP_GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; const int nth = params->nth; - const int64_t D = neq0; - const int64_t N = neq1; - const int64_t P = nek1 - N; - const int64_t M = P + N; - - const int Mup = wsp_ggml_up(M, WSP_GGML_SOFT_MAX_UNROLL); + const int nk = ne00*ne01*ne02*ne03; - WSP_GGML_ASSERT(ne0 == D); - WSP_GGML_ASSERT(ne1 == N); - WSP_GGML_ASSERT(P >= 0); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); - WSP_GGML_ASSERT(nbq0 == sizeof(float)); - WSP_GGML_ASSERT(nbk0 == sizeof(float)); - WSP_GGML_ASSERT(nbv0 == sizeof(float)); - - WSP_GGML_ASSERT(neq0 == D); - WSP_GGML_ASSERT(nek0 == D); - WSP_GGML_ASSERT(nev1 == D); + if (params->type == GGML_TASK_INIT) { + memset(params->wdata, 0, params->wsize); - WSP_GGML_ASSERT(neq1 == N); - WSP_GGML_ASSERT(nek1 == N + P); - WSP_GGML_ASSERT(nev1 == D); + // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i03*nb03 + i02*nb02); + ggml_fp16_t * dst_data = wdata + i02*ne01*ne00*ne03; + for (int64_t i01 = 0; i01 < ne01; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + dst_data[i01*ne00*ne03 + i00*ne03 + i03] = src[i01 * ne00 + i00]; + } + } + } + } + } - // dst cannot be transposed or permuted - WSP_GGML_ASSERT(nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb0 <= nb1); - WSP_GGML_ASSERT(nb1 <= nb2); - WSP_GGML_ASSERT(nb2 <= nb3); + // permute source data (src1) from (Sw x Sh x Cin) to (Cin x Sw x Sh) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + nk; + for (int i12 = 0; i12 < ne12; i12++) { + for (int i11 = 0; i11 < ne11; i11++) { + const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); + ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; + for (int i10 = 0; i10 < ne10; i10++) { + dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); + } + } + } + } - if (params->type == WSP_GGML_TASK_INIT) { return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } - // parallelize by q rows using wsp_ggml_vec_dot_f32 + const int32_t stride = ggml_get_op_params_i32(dst, 0); - // total rows in q - const int nr = neq1*neq2*neq3; + // total patches in dst + const int np = ne2; - // rows per thread - const int dr = (nr + nth - 1)/nth; + // patches per thread + const int dp = (np + nth - 1)/nth; - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); - const float scale = 1.0f/sqrtf(D); + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + ggml_fp16_t * const wdata_src = wdata + nk; - //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + for (int i2 = ip0; i2 < ip1; i2++) { // Cout + float * dst_data = (float *)((char *) dst->data + i2*nb2); + ggml_fp16_t * wdata_kernel = wdata + i2*ne01*ne00*ne03; + for (int i11 = 0; i11 < ne11; i11++) { + for (int i10 = 0; i10 < ne10; 
i10++) { + const int i1n = i11*ne10*ne12 + i10*ne12; + for (int i01 = 0; i01 < ne01; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + float v = 0; + ggml_vec_dot_f16(ne03, &v, + wdata_src + i1n, + wdata_kernel + i01*ne00*ne03 + i00*ne03); + dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; + } + } + } + } + } +} - for (int ir = ir0; ir < ir1; ++ir) { - // q indices - const int iq3 = ir/(neq2*neq1); - const int iq2 = (ir - iq3*neq2*neq1)/neq1; - const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); +// ggml_compute_forward_pool_1d_sk_p0 - float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32); +static void ggml_compute_forward_pool_1d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } - for (int64_t ic = 0; ic < nek1; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2; - const int ik1 = ic; + const char * cdata = (const char *)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + float * drow = (float *)dst->data; - // S indices - const int i1 = ik1; + const int64_t rs = dst->ne[0]; - wsp_ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } + while (cdata < data_end) { + const float * const srow = (const float *)cdata; - // scale - wsp_ggml_vec_scale_f32(nek1, S, scale); + int j = 0; - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; + for (int64_t i = 0; i < rs; ++i) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] = 0; break; + case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + for (int ki = 0; ki < k; ++ki) { + switch (op) { + case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; } + ++j; + } + switch (op) { + case GGML_OP_POOL_AVG: drow[i] /= k; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; } } - // softmax - { - float max = -INFINITY; - wsp_ggml_vec_max_f32(M, &max, S); + cdata += src->nb[1]; + drow += rs; + } +} - wsp_ggml_float sum = 0.0; - { -#ifdef WSP_GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(S, 1, &max, S, 1, Mup); - vvexpf(S, S, &Mup); - wsp_ggml_vec_sum_f32(Mup, &sum, S); -#else - uint16_t scvt[WSP_GGML_SOFT_MAX_UNROLL]; - wsp_ggml_float sump[WSP_GGML_SOFT_MAX_UNROLL] = { 0.0 }; +// ggml_compute_forward_pool_1d - for (int i = 0; i < Mup; i += WSP_GGML_SOFT_MAX_UNROLL) { - float * SS = S + i; +static void ggml_compute_forward_pool_1d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { - for (int j = 0; j < WSP_GGML_SOFT_MAX_UNROLL; ++j) { - if (SS[j] == -INFINITY) { - SS[j] = 0.0f; - } else { - wsp_ggml_fp16_t s = WSP_GGML_FP32_TO_FP16(SS[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = WSP_GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += (wsp_ggml_float)val; - SS[j] = val; + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int s0 = opts[2]; + 
const int p0 = opts[3]; + GGML_ASSERT(p0 == 0); // padding not supported + GGML_ASSERT(k0 == s0); // only s = k supported + + ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst); +} + +// ggml_compute_forward_pool_2d_sk_p0 + +static void ggml_compute_forward_pool_2d_sk_p0( + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k0, + const int k1, + struct ggml_tensor * dst) { + assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const char * cdata = (const char*)src->data; + const char * const data_end = cdata + ggml_nbytes(src); + + const int64_t px = dst->ne[0]; + const int64_t py = dst->ne[1]; + const int64_t pa = px * py; + + float * dplane = (float *)dst->data; + + const int ka = k0 * k1; + + while (cdata < data_end) { + for (int oy = 0; oy < py; ++oy) { + float * const drow = dplane + oy * px; + for (int ox = 0; ox < px; ++ox) { + float * const out = drow + ox; + switch (op) { + case GGML_OP_POOL_AVG: *out = 0; break; + case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + } + + const int ix = ox * k0; + const int iy = oy * k1; + + for (int ky = 0; ky < k1; ++ky) { + const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + for (int kx = 0; kx < k0; ++kx) { + int j = ix + kx; + switch (op) { + case GGML_OP_POOL_AVG: *out += srow[j]; break; + case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; } } } - - for (int i = 0; i < WSP_GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; + switch (op) { + case GGML_OP_POOL_AVG: *out /= ka; break; + case GGML_OP_POOL_MAX: break; + case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; } -#endif } + } - assert(sum > 0.0); + cdata += src->nb[2]; + dplane += pa; + } +} - sum = 1.0/sum; - wsp_ggml_vec_scale_f32(M, S, sum); +// ggml_compute_forward_pool_2d -#ifndef NDEBUG - for (int i = 0; i < M; ++i) { - assert(!isnan(S[i])); - assert(!isinf(S[i])); - } -#endif - } +static void ggml_compute_forward_pool_2d( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = opts[0]; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + GGML_ASSERT(p0 == 0); + GGML_ASSERT(p1 == 0); // padding not supported + GGML_ASSERT(k0 == s0); + GGML_ASSERT(k1 == s1); // only s = k supported + + ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst); +} + +// ggml_compute_forward_upscale + +static void ggml_compute_forward_upscale_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + + GGML_TENSOR_UNARY_OP_LOCALS; + + const int scale_factor = dst->op_params[0]; + + // TODO: optimize + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = ith; i02 < ne02; i02++) { + for (int m = 0; m < dst->ne[1]; m++) { + int i01 = m / scale_factor; + for (int n = 0; n < dst->ne[0]; n++) { + int i00 = n / scale_factor; + + const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + 
i02 * nb02 + i03 * nb03); + + float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]); + + *y = *x; + } + } + } + } +} + +static void ggml_compute_forward_upscale( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_upscale_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_flash_attn + +static void ggml_compute_forward_flash_attn_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const bool masked, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + const int64_t P = nek1 - N; + const int64_t M = P + N; + + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + const float scale = 1.0f/sqrtf(D); + + //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); + + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32); + + for (int i = M; i < Mup; ++i) { + S[i] = -INFINITY; + } + + for (int64_t ic = 0; ic < nek1; ++ic) { + // k indices + const int ik3 = iq3; + const int ik2 = iq2; + const int ik1 = ic; + + // S indices + const int i1 = ik1; + + ggml_vec_dot_f32(neq0, + S + i1, + (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + } + + // scale + ggml_vec_scale_f32(nek1, S, scale); + + if (masked) { + for (int64_t i = P; i < M; i++) { + if (i > P + iq1) { + S[i] = -INFINITY; + } + } + } + + // softmax + { + float max = -INFINITY; + ggml_vec_max_f32(M, &max, S); + + ggml_float sum = 0.0; + { +#ifdef GGML_SOFT_MAX_ACCELERATE + max = -max; + vDSP_vsadd(S, 1, &max, S, 1, Mup); + vvexpf(S, S, &Mup); + ggml_vec_sum_f32(Mup, &sum, S); +#else + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; 
UNUSED(scvt); + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; + + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { + float * SS = S + i; + + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { + if (SS[j] == -INFINITY) { + SS[j] = 0.0f; + } else { +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SS[j] - max); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); + memcpy(&scvt[j], &s, sizeof(uint16_t)); + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif + sump[j] += (ggml_float)val; + SS[j] = val; + } + } + } + + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { + sum += sump[i]; + } +#endif + } + + assert(sum > 0.0); + + sum = 1.0/sum; + ggml_vec_scale_f32(M, S, sum); + +#ifndef NDEBUG + for (int i = 0; i < M; ++i) { + assert(!isnan(S[i])); + assert(!isinf(S[i])); + } +#endif + } for (int64_t ic = 0; ic < nev1; ++ic) { // dst indices @@ -13531,7 +14174,7 @@ static void wsp_ggml_compute_forward_flash_attn_f32( const int i2 = iq2; const int i3 = iq3; - wsp_ggml_vec_dot_f32(nek1, + ggml_vec_dot_f32(nek1, (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), S); @@ -13539,24 +14182,24 @@ static void wsp_ggml_compute_forward_flash_attn_f32( } } -static void wsp_ggml_compute_forward_flash_attn_f16( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * q, - const struct wsp_ggml_tensor * k, - const struct wsp_ggml_tensor * v, +static void ggml_compute_forward_flash_attn_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, const bool masked, - struct wsp_ggml_tensor * dst) { - int64_t t0 = wsp_ggml_perf_time_us(); + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -13566,39 +14209,39 @@ static void wsp_ggml_compute_forward_flash_attn_f16( const int64_t P = nek1 - N; const int64_t M = P + N; - const int Mup = wsp_ggml_up(M, WSP_GGML_SOFT_MAX_UNROLL); + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); - WSP_GGML_ASSERT(ne0 == D); - WSP_GGML_ASSERT(ne1 == N); - WSP_GGML_ASSERT(P >= 0); + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); - WSP_GGML_ASSERT(nbq0 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nbk0 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nbv0 == sizeof(wsp_ggml_fp16_t)); + GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t)); - WSP_GGML_ASSERT(neq0 == D); - WSP_GGML_ASSERT(nek0 == D); - WSP_GGML_ASSERT(nev1 == D); + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); - WSP_GGML_ASSERT(neq1 == N); - WSP_GGML_ASSERT(nek1 == N + P); - 
WSP_GGML_ASSERT(nev1 == D); + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); // dst cannot be transposed or permuted - WSP_GGML_ASSERT(nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb0 <= nb1); - WSP_GGML_ASSERT(nb1 <= nb2); - WSP_GGML_ASSERT(nb2 <= nb3); + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); - if (params->type == WSP_GGML_TASK_INIT) { + if (params->type == GGML_TASK_INIT) { return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } - // parallelize by q rows using wsp_ggml_vec_dot_f32 + // parallelize by q rows using ggml_vec_dot_f32 // total rows in q const int nr = neq1*neq2*neq3; @@ -13626,7 +14269,7 @@ static void wsp_ggml_compute_forward_flash_attn_f16( S[i] = -INFINITY; } - if (WSP_GGML_VEC_DOT_UNROLL > 2 || nek1 % WSP_GGML_VEC_DOT_UNROLL != 0) { + if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) { for (int64_t ic = 0; ic < nek1; ++ic) { // k indices const int ik3 = iq3; @@ -13636,13 +14279,13 @@ static void wsp_ggml_compute_forward_flash_attn_f16( // S indices const int i1 = ik1; - wsp_ggml_vec_dot_f16(neq0, + ggml_vec_dot_f16(neq0, S + i1, - (wsp_ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (wsp_ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), + (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); } } else { - for (int64_t ic = 0; ic < nek1; ic += WSP_GGML_VEC_DOT_UNROLL) { + for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { // k indices const int ik3 = iq3; const int ik2 = iq2; @@ -13651,15 +14294,15 @@ static void wsp_ggml_compute_forward_flash_attn_f16( // S indices const int i1 = ik1; - wsp_ggml_vec_dot_f16_unroll(neq0, nbk1, + ggml_vec_dot_f16_unroll(neq0, nbk1, S + i1, ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (wsp_ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); + (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); } } // scale - wsp_ggml_vec_scale_f32(nek1, S, scale); + ggml_vec_scale_f32(nek1, S, scale); if (masked) { for (int64_t i = P; i < M; i++) { @@ -13672,36 +14315,36 @@ static void wsp_ggml_compute_forward_flash_attn_f16( // softmax { float max = -INFINITY; - wsp_ggml_vec_max_f32(M, &max, S); + ggml_vec_max_f32(M, &max, S); - wsp_ggml_float sum = 0.0; + ggml_float sum = 0.0; { -#ifdef WSP_GGML_SOFT_MAX_ACCELERATE +#ifdef GGML_SOFT_MAX_ACCELERATE max = -max; vDSP_vsadd(S, 1, &max, S, 1, Mup); vvexpf(S, S, &Mup); - wsp_ggml_vec_sum_f32(Mup, &sum, S); + ggml_vec_sum_f32(Mup, &sum, S); #else - uint16_t scvt[WSP_GGML_SOFT_MAX_UNROLL]; - wsp_ggml_float sump[WSP_GGML_SOFT_MAX_UNROLL] = { 0.0 }; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - for (int i = 0; i < Mup; i += WSP_GGML_SOFT_MAX_UNROLL) { + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { float * SS = S + i; - for (int j = 0; j < WSP_GGML_SOFT_MAX_UNROLL; ++j) { + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { - wsp_ggml_fp16_t s = WSP_GGML_FP32_TO_FP16(SS[j] - max); + ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = WSP_GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += (wsp_ggml_float)val; + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); + sump[j] += 
(ggml_float)val; SS[j] = val; } } } - for (int i = 0; i < WSP_GGML_SOFT_MAX_UNROLL; i++) { + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { sum += sump[i]; } #endif @@ -13710,7 +14353,7 @@ static void wsp_ggml_compute_forward_flash_attn_f16( assert(sum > 0.0); sum = 1.0/sum; - wsp_ggml_vec_scale_f32(M, S, sum); + ggml_vec_scale_f32(M, S, sum); #ifndef NDEBUG for (int i = 0; i < M; ++i) { @@ -13720,32 +14363,32 @@ static void wsp_ggml_compute_forward_flash_attn_f16( #endif } - wsp_ggml_fp16_t * S16 = (wsp_ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); + ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); for (int64_t i = 0; i < M; i++) { - S16[i] = WSP_GGML_FP32_TO_FP16(S[i]); + S16[i] = GGML_FP32_TO_FP16(S[i]); } - if (WSP_GGML_VEC_DOT_UNROLL == 1 || (nev1 % WSP_GGML_VEC_DOT_UNROLL != 0)) { + if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) { for (int64_t ic = 0; ic < nev1; ++ic) { // dst indices const int i1 = iq1; const int i2 = iq2; const int i3 = iq3; - wsp_ggml_vec_dot_f16(nek1, + ggml_vec_dot_f16(nek1, (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (wsp_ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), + (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), S16); } } else { - for (int64_t ic = 0; ic < nev1; ic += WSP_GGML_VEC_DOT_UNROLL) { + for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) { // dst indices const int i1 = iq1; const int i2 = iq2; const int i3 = iq3; - wsp_ggml_vec_dot_f16_unroll(nek1, nbv1, + ggml_vec_dot_f16_unroll(nek1, nbv1, (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), S16); @@ -13754,54 +14397,54 @@ static void wsp_ggml_compute_forward_flash_attn_f16( } } -static void wsp_ggml_compute_forward_flash_attn( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * q, - const struct wsp_ggml_tensor * k, - const struct wsp_ggml_tensor * v, +static void ggml_compute_forward_flash_attn( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, const bool masked, - struct wsp_ggml_tensor * dst) { + struct ggml_tensor * dst) { switch (q->type) { - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); + ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); + ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_flash_ff +// ggml_compute_forward_flash_ff -static void wsp_ggml_compute_forward_flash_ff_f16( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * a, // F16 - const struct wsp_ggml_tensor * b0, // F16 fc_w - const struct wsp_ggml_tensor * b1, // F32 fc_b - const struct wsp_ggml_tensor * c0, // F16 proj_w - const struct wsp_ggml_tensor * c1, // F32 proj_b - struct wsp_ggml_tensor * dst) { - int64_t t0 = wsp_ggml_perf_time_us(); +static void ggml_compute_forward_flash_ff_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, // F16 + const struct ggml_tensor * b0, // F16 fc_w + const struct ggml_tensor * b1, // F32 fc_b + const 
struct ggml_tensor * c0, // F16 proj_w + const struct ggml_tensor * c1, // F32 proj_b + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_LOCALS(int64_t, nea, a, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nba, a, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, nea, a, ne); + GGML_TENSOR_LOCALS(size_t, nba, a, nb); + GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne); + GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb); + GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne); + GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb); + GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne); + GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb); + GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne); + GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -13810,41 +14453,41 @@ static void wsp_ggml_compute_forward_flash_ff_f16( //const int64_t N = nea1; const int64_t M = neb01; - WSP_GGML_ASSERT(ne0 == nea0); - WSP_GGML_ASSERT(ne1 == nea1); - WSP_GGML_ASSERT(ne2 == nea2); + GGML_ASSERT(ne0 == nea0); + GGML_ASSERT(ne1 == nea1); + GGML_ASSERT(ne2 == nea2); - WSP_GGML_ASSERT(nba0 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nbb00 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nbb10 == sizeof(float)); - WSP_GGML_ASSERT(nbc00 == sizeof(wsp_ggml_fp16_t)); - WSP_GGML_ASSERT(nbc10 == sizeof(float)); + GGML_ASSERT(nba0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbb10 == sizeof(float)); + GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbc10 == sizeof(float)); - WSP_GGML_ASSERT(neb00 == D); - WSP_GGML_ASSERT(neb01 == M); - WSP_GGML_ASSERT(neb10 == M); - WSP_GGML_ASSERT(neb11 == 1); + GGML_ASSERT(neb00 == D); + GGML_ASSERT(neb01 == M); + GGML_ASSERT(neb10 == M); + GGML_ASSERT(neb11 == 1); - WSP_GGML_ASSERT(nec00 == M); - WSP_GGML_ASSERT(nec01 == D); - WSP_GGML_ASSERT(nec10 == D); - WSP_GGML_ASSERT(nec11 == 1); + GGML_ASSERT(nec00 == M); + GGML_ASSERT(nec01 == D); + GGML_ASSERT(nec10 == D); + GGML_ASSERT(nec11 == 1); // dst cannot be transposed or permuted - WSP_GGML_ASSERT(nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb0 <= nb1); - WSP_GGML_ASSERT(nb1 <= nb2); - WSP_GGML_ASSERT(nb2 <= nb3); + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); - if (params->type == WSP_GGML_TASK_INIT) { + if (params->type == GGML_TASK_INIT) { return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } - // parallelize by a rows using wsp_ggml_vec_dot_f32 + // parallelize by a rows using ggml_vec_dot_f32 // total rows in a const int nr = nea1*nea2*nea3; @@ -13873,22 +14516,22 @@ static void wsp_ggml_compute_forward_flash_ff_f16( // S indices const int i1 = ib01; - wsp_ggml_vec_dot_f16(nea0, + ggml_vec_dot_f16(nea0, S + i1, - (wsp_ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), - (wsp_ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); + (ggml_fp16_t *) 
((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), + (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); } - wsp_ggml_vec_add_f32(neb01, S, S, (float *) b1->data); - //wsp_ggml_vec_gelu_f32(neb01, S, S); + ggml_vec_add_f32(neb01, S, S, (float *) b1->data); + //ggml_vec_gelu_f32(neb01, S, S); - wsp_ggml_fp16_t * S16 = (wsp_ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); + ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); for (int64_t i = 0; i < M; i++) { - S16[i] = WSP_GGML_FP32_TO_FP16(S[i]); + S16[i] = GGML_FP32_TO_FP16(S[i]); } - wsp_ggml_vec_gelu_f16(neb01, S16, S16); + ggml_vec_gelu_f16(neb01, S16, S16); { // dst indices @@ -13898,13 +14541,13 @@ static void wsp_ggml_compute_forward_flash_ff_f16( for (int64_t ic = 0; ic < nec01; ++ic) { - wsp_ggml_vec_dot_f16(neb01, + ggml_vec_dot_f16(neb01, (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (wsp_ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), + (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), S16); } - wsp_ggml_vec_add_f32(nec01, + ggml_vec_add_f32(nec01, (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), (float *) c1->data); @@ -13912,53 +14555,53 @@ static void wsp_ggml_compute_forward_flash_ff_f16( } } -static void wsp_ggml_compute_forward_flash_ff( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * a, - const struct wsp_ggml_tensor * b0, - const struct wsp_ggml_tensor * b1, - const struct wsp_ggml_tensor * c0, - const struct wsp_ggml_tensor * c1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_flash_ff( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b0, + const struct ggml_tensor * b1, + const struct ggml_tensor * c0, + const struct ggml_tensor * c1, + struct ggml_tensor * dst) { switch (b0->type) { - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - wsp_ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); + ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - WSP_GGML_ASSERT(false); // TODO + GGML_ASSERT(false); // TODO } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_flash_attn_back +// ggml_compute_forward_flash_attn_back -static void wsp_ggml_compute_forward_flash_attn_back_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * q, - const struct wsp_ggml_tensor * k, - const struct wsp_ggml_tensor * v, - const struct wsp_ggml_tensor * d, +static void ggml_compute_forward_flash_attn_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, const bool masked, - struct wsp_ggml_tensor * dst) { - int64_t t0 = wsp_ggml_perf_time_us(); + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - WSP_GGML_TENSOR_LOCALS(int64_t, neq, q, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbq, q, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, nek, k, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbk, k, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, nev, v, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbv, v, nb); - WSP_GGML_TENSOR_LOCALS(int64_t, ned, d, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nbd, d, 
nb); - WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb); + GGML_TENSOR_LOCALS(int64_t, neq, q, ne); + GGML_TENSOR_LOCALS(size_t, nbq, q, nb); + GGML_TENSOR_LOCALS(int64_t, nek, k, ne); + GGML_TENSOR_LOCALS(size_t, nbk, k, nb); + GGML_TENSOR_LOCALS(int64_t, nev, v, ne); + GGML_TENSOR_LOCALS(size_t, nbv, v, nb); + GGML_TENSOR_LOCALS(int64_t, ned, d, ne); + GGML_TENSOR_LOCALS(size_t, nbd, d, nb); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(size_t, nb, dst, nb); const int ith = params->ith; const int nth = params->nth; @@ -13968,45 +14611,45 @@ static void wsp_ggml_compute_forward_flash_attn_back_f32( const int64_t P = nek1 - N; const int64_t M = P + N; - const int Mup = wsp_ggml_up(M, WSP_GGML_SOFT_MAX_UNROLL); + const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); const int mxDM = MAX(D, Mup); - // WSP_GGML_ASSERT(ne0 == D); - // WSP_GGML_ASSERT(ne1 == N); - WSP_GGML_ASSERT(P >= 0); + // GGML_ASSERT(ne0 == D); + // GGML_ASSERT(ne1 == N); + GGML_ASSERT(P >= 0); - WSP_GGML_ASSERT(nbq0 == sizeof(float)); - WSP_GGML_ASSERT(nbk0 == sizeof(float)); - WSP_GGML_ASSERT(nbv0 == sizeof(float)); + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(float)); + GGML_ASSERT(nbv0 == sizeof(float)); - WSP_GGML_ASSERT(neq0 == D); - WSP_GGML_ASSERT(nek0 == D); - WSP_GGML_ASSERT(nev1 == D); - WSP_GGML_ASSERT(ned0 == D); + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned0 == D); - WSP_GGML_ASSERT(neq1 == N); - WSP_GGML_ASSERT(nek1 == N + P); - WSP_GGML_ASSERT(nev1 == D); - WSP_GGML_ASSERT(ned1 == N); + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nek1 == N + P); + GGML_ASSERT(nev1 == D); + GGML_ASSERT(ned1 == N); // dst cannot be transposed or permuted - WSP_GGML_ASSERT(nb0 == sizeof(float)); - WSP_GGML_ASSERT(nb0 <= nb1); - WSP_GGML_ASSERT(nb1 <= nb2); - WSP_GGML_ASSERT(nb2 <= nb3); + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); - if (params->type == WSP_GGML_TASK_INIT) { + if (params->type == GGML_TASK_INIT) { if (ith == 0) { memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); } return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { return; } - // parallelize by q rows using wsp_ggml_vec_dot_f32 + // parallelize by q rows using ggml_vec_dot_f32 // total rows in q const int nr = neq2*neq3; @@ -14047,14 +14690,14 @@ static void wsp_ggml_compute_forward_flash_attn_back_f32( // S indices const int i1 = ik1; - wsp_ggml_vec_dot_f32(neq0, + ggml_vec_dot_f32(neq0, S + i1, (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); } // scale - wsp_ggml_vec_scale_f32(nek1, S, scale); + ggml_vec_scale_f32(nek1, S, scale); if (masked) { for (int64_t i = P; i < M; i++) { @@ -14067,37 +14710,41 @@ static void wsp_ggml_compute_forward_flash_attn_back_f32( // softmax { float max = -INFINITY; - wsp_ggml_vec_max_f32(M, &max, S); + ggml_vec_max_f32(M, &max, S); - wsp_ggml_float sum = 0.0; + ggml_float sum = 0.0; { -#ifdef WSP_GGML_SOFT_MAX_ACCELERATE +#ifdef GGML_SOFT_MAX_ACCELERATE max = -max; vDSP_vsadd(SM, 1, &max, SM, 1, Mup); vvexpf(SM, SM, &Mup); - wsp_ggml_vec_sum_f32(Mup, &sum, SM); + ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t scvt[WSP_GGML_SOFT_MAX_UNROLL]; - wsp_ggml_float sump[WSP_GGML_SOFT_MAX_UNROLL] = { 0.0 }; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); + ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - for 
(int i = 0; i < Mup; i += WSP_GGML_SOFT_MAX_UNROLL) { + for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { float * SR = S + i; float * SW = SM + i; - for (int j = 0; j < WSP_GGML_SOFT_MAX_UNROLL; ++j) { + for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { if (SR[j] == -INFINITY) { SW[j] = 0.0f; } else { - wsp_ggml_fp16_t s = WSP_GGML_FP32_TO_FP16(SR[j] - max); +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SR[j] - max); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = WSP_GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += (wsp_ggml_float)val; + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif + sump[j] += (ggml_float)val; SW[j] = val; } } } - for (int i = 0; i < WSP_GGML_SOFT_MAX_UNROLL; i++) { + for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { sum += sump[i]; } #endif @@ -14106,7 +14753,7 @@ static void wsp_ggml_compute_forward_flash_attn_back_f32( assert(sum > 0.0); sum = 1.0/sum; - wsp_ggml_vec_scale_f32(M, SM, sum); + ggml_vec_scale_f32(M, SM, sum); } @@ -14174,14 +14821,14 @@ static void wsp_ggml_compute_forward_flash_attn_back_f32( // S = gradSM = d[:D,iq1,iq2,iq3] @ vcur // S = d[:D,iq1,iq2,iq3] @ vcur // S[:M] += vcur[:M,ic] * d[ic,iq1,iq2,iq3] - wsp_ggml_vec_set_f32(M, S, 0); + ggml_vec_set_f32(M, S, 0); for (int64_t ic = 0; ic < D; ++ic) { // dst indices const int i1 = iq1; const int i2 = iq2; const int i3 = iq3; - wsp_ggml_vec_mad_f32(M, + ggml_vec_mad_f32(M, S, (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); @@ -14189,9 +14836,9 @@ static void wsp_ggml_compute_forward_flash_attn_back_f32( // S = SM * (S - dot(SM, S)) float dot_SM_gradSM = 0; - wsp_ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); - wsp_ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); - wsp_ggml_vec_mul_f32 (M, S, S, SM); + ggml_vec_dot_f32 (M, &dot_SM_gradSM, SM, S); + ggml_vec_acc1_f32(M, S, -dot_SM_gradSM); + ggml_vec_mul_f32 (M, S, S, SM); // S = diag_mask_zero(S, P) * scale if (masked) { @@ -14204,7 +14851,7 @@ static void wsp_ggml_compute_forward_flash_attn_back_f32( } } } - wsp_ggml_vec_scale_f32(M, S, scale); + ggml_vec_scale_f32(M, S, scale); void * grad_q = (char *) dst->data; void * grad_k = (char *) dst->data + nb0*D*N*neq2*neq3; @@ -14240,7 +14887,7 @@ static void wsp_ggml_compute_forward_flash_attn_back_f32( const int i2 = iq2; const int i3 = iq3; - wsp_ggml_vec_mad_f32(D, + ggml_vec_mad_f32(D, (float *) ((char *) grad_q + (i1*nbgq1 + i2*nbgq2 + i3*nbgq3)), (float *) ((char *) k->data + (ic*nbk1 + i2*nbk2 + i3*nbk3)), S[ic]); @@ -14255,10 +14902,10 @@ static void wsp_ggml_compute_forward_flash_attn_back_f32( const int i2 = iq2; const int i3 = iq3; - // wsp_ggml_vec_set_f32(D, + // ggml_vec_set_f32(D, // (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), // 0); - wsp_ggml_vec_mad_f32(D, + ggml_vec_mad_f32(D, (float *) ((char *) grad_k + (ic*nbgk1 + i2*nbgk2 + i3*nbgk3)), (float *) ((char *) q->data + (i1*nbq1 + i2*nbq2 + i3*nbq3)), S[ic]); @@ -14273,10 +14920,10 @@ static void wsp_ggml_compute_forward_flash_attn_back_f32( const int i2 = iq2; const int i3 = iq3; - // wsp_ggml_vec_set_f32(M, + // ggml_vec_set_f32(M, // (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), // 0); - wsp_ggml_vec_mad_f32(M, + ggml_vec_mad_f32(M, (float *) ((char *) grad_v + ( ic*nbgv1 + i2*nbgv2 + i3*nbgv3)), SM, *(float *) ((char *) d->data + (ic*nbd0 + i1*nbd1 + i2*nbd2 + i3*nbd3))); @@ -14285,43 +14932,42 @@ static 
void wsp_ggml_compute_forward_flash_attn_back_f32( } } -static void wsp_ggml_compute_forward_flash_attn_back( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * q, - const struct wsp_ggml_tensor * k, - const struct wsp_ggml_tensor * v, - const struct wsp_ggml_tensor * d, +static void ggml_compute_forward_flash_attn_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * d, const bool masked, - struct wsp_ggml_tensor * dst) { + struct ggml_tensor * dst) { switch (q->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); + ggml_compute_forward_flash_attn_back_f32(params, q, k, v, d, masked, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_win_part +// ggml_compute_forward_win_part -static void wsp_ggml_compute_forward_win_part_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { +static void ggml_compute_forward_win_part_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - const int32_t nep0 = ((const int32_t *)(opt0->data))[0]; - const int32_t nep1 = ((const int32_t *)(opt0->data))[1]; - const int32_t w = ((const int32_t *)(opt0->data))[2]; + const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t w = ((const int32_t *)(dst->op_params))[2]; assert(ne00 == ne0); assert(ne3 == nep0*nep1); @@ -14352,38 +14998,36 @@ static void wsp_ggml_compute_forward_win_part_f32( } } -static void wsp_ggml_compute_forward_win_part( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_win_part( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_win_part_f32(params, src0, opt0, dst); + ggml_compute_forward_win_part_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_win_unpart +// ggml_compute_forward_win_unpart -static void wsp_ggml_compute_forward_win_unpart_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { +static void ggml_compute_forward_win_unpart_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); - 
WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - const int32_t w = ((const int32_t *)(opt0->data))[0]; + const int32_t w = ((const int32_t *)(dst->op_params))[0]; // padding const int px = (w - ne1%w)%w; @@ -14414,235 +15058,413 @@ static void wsp_ggml_compute_forward_win_unpart_f32( } } -static void wsp_ggml_compute_forward_win_unpart( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_win_unpart( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst); + ggml_compute_forward_win_unpart_f32(params, src0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_map_unary +//gmml_compute_forward_unary + +static void ggml_compute_forward_unary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + const enum ggml_unary_op op = ggml_get_unary_op(dst); + + switch (op) { + case GGML_UNARY_OP_ABS: + { + ggml_compute_forward_abs(params, src0, dst); + } break; + case GGML_UNARY_OP_SGN: + { + ggml_compute_forward_sgn(params, src0, dst); + } break; + case GGML_UNARY_OP_NEG: + { + ggml_compute_forward_neg(params, src0, dst); + } break; + case GGML_UNARY_OP_STEP: + { + ggml_compute_forward_step(params, src0, dst); + } break; + case GGML_UNARY_OP_TANH: + { + ggml_compute_forward_tanh(params, src0, dst); + } break; + case GGML_UNARY_OP_ELU: + { + ggml_compute_forward_elu(params, src0, dst); + } break; + case GGML_UNARY_OP_RELU: + { + ggml_compute_forward_relu(params, src0, dst); + } break; + case GGML_UNARY_OP_GELU: + { + ggml_compute_forward_gelu(params, src0, dst); + } break; + case GGML_UNARY_OP_GELU_QUICK: + { + ggml_compute_forward_gelu_quick(params, src0, dst); + } break; + case GGML_UNARY_OP_SILU: + { + ggml_compute_forward_silu(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} -static void wsp_ggml_compute_forward_map_unary_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst, - const wsp_ggml_unary_op_f32_t fun) { - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst)); +// ggml_compute_forward_get_rel_pos - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { +static void ggml_compute_forward_get_rel_pos_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); - const int nc = src0->ne[0]; + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); + GGML_TENSOR_UNARY_OP_LOCALS; - for (int i = 0; i < n; i++) { - fun(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); + const int64_t w = ne1; + + ggml_fp16_t * src0_data = (ggml_fp16_t *) src0->data; + ggml_fp16_t * dst_data = (ggml_fp16_t *) dst->data; + + for (int64_t i2 = 0; i2 < ne2; ++i2) { + 
for (int64_t i1 = 0; i1 < ne1; ++i1) { + const int64_t pos = (w - i1 - 1) + i2; + for (int64_t i0 = 0; i0 < ne0; ++i0) { + dst_data[i2*ne1*ne0 + i1*ne0 + i0] = src0_data[pos*ne00 + i0]; + } + } + } +} + +static void ggml_compute_forward_get_rel_pos( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_get_rel_pos_f16(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; } } +// ggml_compute_forward_add_rel_pos + +static void ggml_compute_forward_add_rel_pos_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + + const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; + if (!inplace && params->type == GGML_TASK_INIT) { + memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); + return; + } + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 + + float * src1_data = (float *) src1->data; + float * src2_data = (float *) src2->data; + float * dst_data = (float *) dst->data; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int ith = params->ith; + const int nth = params->nth; + + // total patches in dst + const int np = ne13; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + + for (int64_t i13 = ip0; i13 < ip1; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + const int64_t jp1 = i13*ne12*ne11*ne10 + i12*ne11*ne10 + i11*ne10; + for (int64_t i10 = 0; i10 < ne10; ++i10) { + const int64_t jp0 = jp1 + i10; + const float src1_e = src1_data[jp0]; + const float src2_e = src2_data[jp0]; + + const int64_t jdh = jp0 * ne10; + const int64_t jdw = jdh - (ne10 - 1) * i10; + + for (int64_t j = 0; j < ne10; ++j) { + dst_data[jdh + j ] += src2_e; + dst_data[jdw + j*ne10] += src1_e; + } + } + } + } + } +} -static void wsp_ggml_compute_forward_map_unary( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - struct wsp_ggml_tensor * dst, - const wsp_ggml_unary_op_f32_t fun) { +static void ggml_compute_forward_add_rel_pos( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_map_unary_f32(params, src0, dst, fun); + ggml_compute_forward_add_rel_pos_f32(params, src0, src1, src2, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_map_binary +// ggml_compute_forward_map_unary -static void wsp_ggml_compute_forward_map_binary_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst, - const wsp_ggml_binary_op_f32_t fun) { - assert(params->ith == 0); - assert(wsp_ggml_are_same_shape(src0, src1) && 
wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_map_unary_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n = wsp_ggml_nrows(src0); + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; assert( dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); - assert(src1->nb[0] == sizeof(float)); for (int i = 0; i < n; i++) { fun(nc, (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), - (float *) ((char *) src1->data + i*(src1->nb[1]))); + (float *) ((char *) src0->data + i*(src0->nb[1]))); } } -static void wsp_ggml_compute_forward_map_binary( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst, - const wsp_ggml_binary_op_f32_t fun) { +static void ggml_compute_forward_map_unary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst, + const ggml_unary_op_f32_t fun) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); + ggml_compute_forward_map_unary_f32(params, src0, dst, fun); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_map_custom1 +// ggml_compute_forward_map_binary -static void wsp_ggml_compute_forward_map_custom1_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * dst, - const wsp_ggml_custom1_op_f32_t fun) { +static void ggml_compute_forward_map_binary_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - fun(dst, a); + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + assert( dst->nb[0] == sizeof(float)); + assert(src0->nb[0] == sizeof(float)); + assert(src1->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + fun(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1])), + (float *) ((char *) src1->data + i*(src1->nb[1]))); + } } -static void wsp_ggml_compute_forward_map_custom1( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * dst, - const wsp_ggml_custom1_op_f32_t fun) { - switch (a->type) { - case WSP_GGML_TYPE_F32: +static void ggml_compute_forward_map_binary( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const ggml_binary_op_f32_t fun) { + switch (src0->type) { + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_map_custom1_f32(params, a, dst, fun); + ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); } break; default: { - 
WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_map_custom2 +// ggml_compute_forward_map_custom1 -static void wsp_ggml_compute_forward_map_custom2_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * a, - const struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * dst, - const wsp_ggml_custom2_op_f32_t fun) { +static void ggml_compute_forward_map_custom1_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst, + const ggml_custom1_op_f32_t fun) { assert(params->ith == 0); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - fun(dst, a, b); + fun(dst, a); } +// ggml_compute_forward_map_custom2 -static void wsp_ggml_compute_forward_map_custom2( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * a, - const struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * dst, - const wsp_ggml_custom2_op_f32_t fun) { - switch (a->type) { - case WSP_GGML_TYPE_F32: - { - wsp_ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun); - } break; - default: - { - WSP_GGML_ASSERT(false); - } break; +static void ggml_compute_forward_map_custom2_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + struct ggml_tensor * dst, + const ggml_custom2_op_f32_t fun) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; } + + fun(dst, a, b); } -// wsp_ggml_compute_forward_map_custom3 -static void wsp_ggml_compute_forward_map_custom3_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * a, - const struct wsp_ggml_tensor * b, - const struct wsp_ggml_tensor * c, - struct wsp_ggml_tensor * dst, - const wsp_ggml_custom3_op_f32_t fun) { +// ggml_compute_forward_map_custom3 + +static void ggml_compute_forward_map_custom3_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + const struct ggml_tensor * c, + struct ggml_tensor * dst, + const ggml_custom3_op_f32_t fun) { assert(params->ith == 0); - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } fun(dst, a, b, c); } +// ggml_compute_forward_map_custom1 -static void wsp_ggml_compute_forward_map_custom3( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * a, - const struct wsp_ggml_tensor * b, - const struct wsp_ggml_tensor * c, - struct wsp_ggml_tensor * dst, - const wsp_ggml_custom3_op_f32_t fun) { - switch (a->type) { - case WSP_GGML_TYPE_F32: - { - wsp_ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun); - } break; - default: - { - WSP_GGML_ASSERT(false); - } break; +static void ggml_compute_forward_map_custom1( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params; + + p->fun(dst, a, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom2 + +static void ggml_compute_forward_map_custom2( + const struct ggml_compute_params * params, + const struct 
ggml_tensor * a, + const struct ggml_tensor * b, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params; + + p->fun(dst, a, b, params->ith, params->nth, p->userdata); +} + +// ggml_compute_forward_map_custom3 + +static void ggml_compute_forward_map_custom3( + const struct ggml_compute_params * params, + const struct ggml_tensor * a, + const struct ggml_tensor * b, + const struct ggml_tensor * c, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; } + + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params; + + p->fun(dst, a, b, c, params->ith, params->nth, p->userdata); } -// wsp_ggml_compute_forward_cross_entropy_loss +// ggml_compute_forward_cross_entropy_loss -static void wsp_ggml_compute_forward_cross_entropy_loss_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src1)); - WSP_GGML_ASSERT(wsp_ggml_is_scalar(dst)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, src1)); +static void ggml_compute_forward_cross_entropy_loss_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); const int ith = params->ith; const int nth = params->nth; @@ -14651,20 +15473,22 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_f32( // TODO: handle transposed/permuted matrices const int nc = src0->ne[0]; - const int nr = wsp_ggml_nrows(src0); + const int nr = ggml_nrows(src0); - if (params->type == WSP_GGML_TASK_INIT) { + GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); + + if (params->type == GGML_TASK_INIT) { if (ith == 0) { memset(sums, 0, sizeof(float) * (nth + nth * nc)); } return; } - if (params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_FINALIZE) { if (ith == 0) { float * dp = (float *) dst->data; - wsp_ggml_vec_sum_f32(nth, dp, sums); - dp[0] *= -1.0f; + ggml_vec_sum_f32(nth, dp, sums); + dp[0] *= -1.0f / (float) nr; } return; } @@ -14681,7 +15505,7 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_f32( for (int i1 = ir0; i1 < ir1; i1++) { float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * st = (float *) params->wdata + nth + ith*nc; + float * st = ((float *) params->wdata) + nth + ith*nc; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14691,21 +15515,25 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_f32( } #endif // soft_max - wsp_ggml_float sum = 0.0; + ggml_float sum = 0.0; { float max = -INFINITY; - wsp_ggml_vec_max_f32(nc, &max, s0); + ggml_vec_max_f32(nc, &max, s0); - uint16_t scvt; + uint16_t scvt; UNUSED(scvt); for (int i = 0; i < nc; i++) { if (s0[i] == -INFINITY) { st[i] = 0.0f; } else { - // const float val = (s0[i] == -INFINITY) ? 
0.0 : exp(s0[i] - max); - wsp_ggml_fp16_t s = WSP_GGML_FP32_TO_FP16(s0[i] - max); +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = WSP_GGML_FP16_TO_FP32(table_exp_f16[scvt]); - sum += (wsp_ggml_float)val; + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif + sum += (ggml_float)val; st[i] = val; } } @@ -14715,12 +15543,14 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_f32( } // avoid log(0) by rescaling from [0..1] to [eps..1] sum = (1.0 - eps) / sum; - wsp_ggml_vec_scale_f32(nc, st, sum); - wsp_ggml_vec_add1_f32(nc, st, st, eps); - wsp_ggml_vec_log_f32(nc, st, st); - wsp_ggml_vec_mul_f32(nc, st, st, s1); + ggml_vec_scale_f32(nc, st, sum); + ggml_vec_add1_f32(nc, st, st, eps); + ggml_vec_log_f32(nc, st, st); + ggml_vec_mul_f32(nc, st, st, s1); - wsp_ggml_vec_sum_f32(nc, sums + ith, st); + float st_sum = 0; + ggml_vec_sum_f32(nc, &st_sum, st); + sums[ith] += st_sum; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14732,49 +15562,49 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_f32( } -static void wsp_ggml_compute_forward_cross_entropy_loss( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_cross_entropy_loss( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); + ggml_compute_forward_cross_entropy_loss_f32(params, src0, src1, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -// wsp_ggml_compute_forward_cross_entropy_loss_back +// ggml_compute_forward_cross_entropy_loss_back -static void wsp_ggml_compute_forward_cross_entropy_loss_back_f32( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src1)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(opt0)); - WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, src1) && wsp_ggml_are_same_shape(src0, dst)); +static void ggml_compute_forward_cross_entropy_loss_back_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(opt0)); + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); const int64_t ith = params->ith; const int64_t nth = params->nth; - if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const float eps = 1e-9f; + const double eps = 1e-9; // TODO: handle transposed/permuted matrices const int64_t nc = src0->ne[0]; - const int64_t nr = wsp_ggml_nrows(src0); + const int64_t nr = ggml_nrows(src0); // rows per thread const int64_t dr = 
(nr + nth - 1)/nth; @@ -14789,7 +15619,6 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_back_f32( float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * sm = (float *) params->wdata + ith*nc; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -14798,94 +15627,44 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_back_f32( assert(!isnan(s1[i])); } #endif - // step by step explanation: - { - //float * sums = (float *) params->wdata; - - // forward pass with annotated gradients from backward pass - // (built by going in reverse operation order, adding to gradients of current operation args) - // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum - // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) - // wsp_ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps) - // wsp_ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3] - // wsp_ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3 - // wsp_ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1 - // wsp_ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]] - // wsp_ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel] - - // substitute into grad[st1], because we can reuse softmax_back from this point on - // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps)) - // postorder: - // grad[st1] := softmax(s0) - // grad[st1] := grad[st1]*(1.0 - eps) - // grad[st1] := grad[st1] + eps - // grad[st1] := s1 / grad[st1] - // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel] - - // src0 gradients by going through softmax_back - // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) - // from softmax_back: - // dxk = yk * (dyk - dot(y, dy)) - // dot_y_dy := dot(y, dy) - // dx := dy - // dx := dx - dot_y_dy - // dx := dx * y - // postorder: - // dot_st1_dst1 := dot(st1, grad[st1]) - // grad[s0] := grad[st1] - // grad[s0] := grad[s0] - dot_st1_dst1 - // grad[s0] := grad[s0] * st1 - - // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1] - // sm := softmax(s0) - // grad[s0] := sm*(1.0 - eps) - // grad[s0] := grad[s0] + eps - // grad[s0] := s1 / grad[s0] - // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel] - // dot_st1_dst1 := dot(sm, grad[s0]) - // grad[s0] := grad[s0] - dot_st1_dst1 - // grad[s0] := grad[s0] * sm - } // soft_max - wsp_ggml_float sum = 0.0; + ggml_float sum = 0.0; { float max = -INFINITY; - wsp_ggml_vec_max_f32(nc, &max, s0); + ggml_vec_max_f32(nc, &max, s0); - uint16_t scvt; + uint16_t scvt; UNUSED(scvt); for (int i = 0; i < nc; i++) { if (s0[i] == -INFINITY) { - sm[i] = 0.0f; + ds0[i] = 0.0f; } else { - // const float val = (s0[i] == -INFINITY) ? 
0.0 : exp(s0[i] - max); - wsp_ggml_fp16_t s = WSP_GGML_FP32_TO_FP16(s0[i] - max); +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else + ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); - const float val = WSP_GGML_FP16_TO_FP32(table_exp_f16[scvt]); - sum += (wsp_ggml_float)val; - sm[i] = val; + const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif + sum += (ggml_float)val; + ds0[i] = val; } } assert(sum > 0.0); - sum = 1.0/sum; + sum = (1.0 - eps)/sum; } - float dot_st1_dst1 = 0; - wsp_ggml_vec_scale_f32(nc, sm, sum); - wsp_ggml_vec_cpy_f32 (nc, ds0, sm); - wsp_ggml_vec_scale_f32(nc, ds0, (1.0f - eps)); - wsp_ggml_vec_add1_f32 (nc, ds0, ds0, eps); - wsp_ggml_vec_div_f32 (nc, ds0, s1, ds0); - wsp_ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]); - wsp_ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0); - wsp_ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1); - wsp_ggml_vec_mul_f32 (nc, ds0, ds0, sm); + // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr + ggml_vec_scale_f32(nc, ds0, sum); + ggml_vec_add1_f32(nc, ds0, ds0, eps); + ggml_vec_sub_f32(nc, ds0, ds0, s1); + ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); + #ifndef NDEBUG for (int i = 0; i < nc; ++i) { - assert(!isnan(sm[i])); - assert(!isinf(sm[i])); assert(!isnan(ds0[i])); assert(!isinf(ds0[i])); } @@ -14893,20 +15672,20 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_back_f32( } } -static void wsp_ggml_compute_forward_cross_entropy_loss_back( - const struct wsp_ggml_compute_params * params, - const struct wsp_ggml_tensor * src0, - const struct wsp_ggml_tensor * src1, - const struct wsp_ggml_tensor * opt0, - struct wsp_ggml_tensor * dst) { +static void ggml_compute_forward_cross_entropy_loss_back( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { switch (src0->type) { - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { - wsp_ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_cross_entropy_loss_back_f32(params, src0, src1, opt0, dst); } break; default: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } @@ -14914,356 +15693,370 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// -static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * tensor) { - WSP_GGML_ASSERT(params); +static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + GGML_ASSERT(params); -#ifdef WSP_GGML_USE_CUBLAS - bool skip_cpu = wsp_ggml_cuda_compute_forward(params, tensor); +#ifdef GGML_USE_CUBLAS + bool skip_cpu = ggml_cuda_compute_forward(params, tensor); if (skip_cpu) { return; } - WSP_GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == WSP_GGML_BACKEND_CPU); - WSP_GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == WSP_GGML_BACKEND_CPU); -#endif // WSP_GGML_USE_CUBLAS + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); +#endif // GGML_USE_CUBLAS switch (tensor->op) { - case WSP_GGML_OP_DUP: + case GGML_OP_DUP: { - wsp_ggml_compute_forward_dup(params, tensor->src0, tensor); + ggml_compute_forward_dup(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_ADD: + case 
GGML_OP_ADD: { - wsp_ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_add(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_ADD1: + case GGML_OP_ADD1: { - wsp_ggml_compute_forward_add1(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_add1(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_ACC: + case GGML_OP_ACC: { - wsp_ggml_compute_forward_acc(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_SUB: + case GGML_OP_SUB: { - wsp_ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_sub(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_MUL: + case GGML_OP_MUL: { - wsp_ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_mul(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_DIV: + case GGML_OP_DIV: { - wsp_ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_div(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_SQR: + case GGML_OP_SQR: { - wsp_ggml_compute_forward_sqr(params, tensor->src0, tensor); + ggml_compute_forward_sqr(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_SQRT: + case GGML_OP_SQRT: { - wsp_ggml_compute_forward_sqrt(params, tensor->src0, tensor); + ggml_compute_forward_sqrt(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_LOG: + case GGML_OP_LOG: { - wsp_ggml_compute_forward_log(params, tensor->src0, tensor); + ggml_compute_forward_log(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_SUM: + case GGML_OP_SUM: { - wsp_ggml_compute_forward_sum(params, tensor->src0, tensor); + ggml_compute_forward_sum(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_SUM_ROWS: + case GGML_OP_SUM_ROWS: { - wsp_ggml_compute_forward_sum_rows(params, tensor->src0, tensor); + ggml_compute_forward_sum_rows(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_MEAN: + case GGML_OP_MEAN: { - wsp_ggml_compute_forward_mean(params, tensor->src0, tensor); + ggml_compute_forward_mean(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_ARGMAX: + case GGML_OP_ARGMAX: { - wsp_ggml_compute_forward_argmax(params, tensor->src0, tensor); + ggml_compute_forward_argmax(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_REPEAT: + case GGML_OP_REPEAT: { - wsp_ggml_compute_forward_repeat(params, tensor->src0, tensor); + ggml_compute_forward_repeat(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_REPEAT_BACK: + case GGML_OP_REPEAT_BACK: { - wsp_ggml_compute_forward_repeat_back(params, tensor->src0, tensor); + ggml_compute_forward_repeat_back(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_ABS: + case GGML_OP_CONCAT: { - wsp_ggml_compute_forward_abs(params, tensor->src0, tensor); + ggml_compute_forward_concat(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_SGN: + case GGML_OP_SILU_BACK: { - wsp_ggml_compute_forward_sgn(params, tensor->src0, tensor); + ggml_compute_forward_silu_back(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_NEG: + case GGML_OP_NORM: { - wsp_ggml_compute_forward_neg(params, tensor->src0, tensor); + ggml_compute_forward_norm(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_STEP: + case GGML_OP_RMS_NORM: { - 
wsp_ggml_compute_forward_step(params, tensor->src0, tensor); + ggml_compute_forward_rms_norm(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_TANH: + case GGML_OP_RMS_NORM_BACK: { - wsp_ggml_compute_forward_tanh(params, tensor->src0, tensor); + ggml_compute_forward_rms_norm_back(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_ELU: + case GGML_OP_GROUP_NORM: { - wsp_ggml_compute_forward_elu(params, tensor->src0, tensor); + ggml_compute_forward_group_norm(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_RELU: + case GGML_OP_MUL_MAT: { - wsp_ggml_compute_forward_relu(params, tensor->src0, tensor); + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_GELU: + case GGML_OP_OUT_PROD: { - wsp_ggml_compute_forward_gelu(params, tensor->src0, tensor); + ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_GELU_QUICK: + case GGML_OP_SCALE: { - wsp_ggml_compute_forward_gelu_quick(params, tensor->src0, tensor); + ggml_compute_forward_scale(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_SILU: + case GGML_OP_SET: { - wsp_ggml_compute_forward_silu(params, tensor->src0, tensor); + ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_SILU_BACK: + case GGML_OP_CPY: { - wsp_ggml_compute_forward_silu_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_cpy(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_NORM: + case GGML_OP_CONT: { - wsp_ggml_compute_forward_norm(params, tensor->src0, tensor); + ggml_compute_forward_cont(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_RMS_NORM: + case GGML_OP_RESHAPE: { - wsp_ggml_compute_forward_rms_norm(params, tensor->src0, tensor); + ggml_compute_forward_reshape(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_RMS_NORM_BACK: + case GGML_OP_VIEW: { - wsp_ggml_compute_forward_rms_norm_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_view(params, tensor->src[0]); } break; - case WSP_GGML_OP_MUL_MAT: + case GGML_OP_PERMUTE: { - wsp_ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_permute(params, tensor->src[0]); } break; - case WSP_GGML_OP_OUT_PROD: + case GGML_OP_TRANSPOSE: { - wsp_ggml_compute_forward_out_prod(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_transpose(params, tensor->src[0]); } break; - case WSP_GGML_OP_SCALE: + case GGML_OP_GET_ROWS: { - wsp_ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_get_rows(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_SET: + case GGML_OP_GET_ROWS_BACK: { - wsp_ggml_compute_forward_set(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_get_rows_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; - case WSP_GGML_OP_CPY: + case GGML_OP_DIAG: { - wsp_ggml_compute_forward_cpy(params, tensor->src0, tensor); + ggml_compute_forward_diag(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_CONT: + case GGML_OP_DIAG_MASK_INF: { - wsp_ggml_compute_forward_cont(params, tensor->src0, tensor); + ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_RESHAPE: + case GGML_OP_DIAG_MASK_ZERO: { - wsp_ggml_compute_forward_reshape(params, tensor->src0, tensor); + 
ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_VIEW: + case GGML_OP_SOFT_MAX: { - wsp_ggml_compute_forward_view(params, tensor->src0); + ggml_compute_forward_soft_max(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_PERMUTE: + case GGML_OP_SOFT_MAX_BACK: { - wsp_ggml_compute_forward_permute(params, tensor->src0); + ggml_compute_forward_soft_max_back(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_TRANSPOSE: + case GGML_OP_ROPE: { - wsp_ggml_compute_forward_transpose(params, tensor->src0); + ggml_compute_forward_rope(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_GET_ROWS: + case GGML_OP_ROPE_BACK: { - wsp_ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_rope_back(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_GET_ROWS_BACK: + case GGML_OP_ALIBI: { - wsp_ggml_compute_forward_get_rows_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_alibi(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_DIAG: + case GGML_OP_CLAMP: { - wsp_ggml_compute_forward_diag(params, tensor->src0, tensor); + ggml_compute_forward_clamp(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_DIAG_MASK_INF: + case GGML_OP_CONV_1D: { - wsp_ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_CONV_2D: { - wsp_ggml_compute_forward_diag_mask_zero(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_SOFT_MAX: + case GGML_OP_CONV_TRANSPOSE_2D: { - wsp_ggml_compute_forward_soft_max(params, tensor->src0, tensor); + ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_SOFT_MAX_BACK: + case GGML_OP_POOL_1D: { - wsp_ggml_compute_forward_soft_max_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_pool_1d(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_ROPE: + case GGML_OP_POOL_2D: { - wsp_ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_pool_2d(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_ROPE_BACK: + case GGML_OP_UPSCALE: { - wsp_ggml_compute_forward_rope_back(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_upscale(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_ALIBI: + case GGML_OP_FLASH_ATTN: { - wsp_ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); + const int32_t t = ggml_get_op_params_i32(tensor, 0); + GGML_ASSERT(t == 0 || t == 1); + const bool masked = t != 0; + ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor); } break; - case WSP_GGML_OP_CLAMP: + case GGML_OP_FLASH_FF: { - wsp_ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); } break; - case WSP_GGML_OP_CONV_1D: + case GGML_OP_FLASH_ATTN_BACK: { - wsp_ggml_compute_forward_conv_1d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + int32_t t = ggml_get_op_params_i32(tensor, 0); + GGML_ASSERT(t == 0 || t == 1); + bool masked = t != 0; + 
ggml_compute_forward_flash_attn_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], masked, tensor); } break; - case WSP_GGML_OP_CONV_2D: + case GGML_OP_WIN_PART: { - wsp_ggml_compute_forward_conv_2d(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_win_part(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_FLASH_ATTN: + case GGML_OP_WIN_UNPART: { - const int32_t t = wsp_ggml_get_i32_1d(tensor->opt[1], 0); - WSP_GGML_ASSERT(t == 0 || t == 1); - const bool masked = t != 0; - wsp_ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); + ggml_compute_forward_win_unpart(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_FLASH_FF: + case GGML_OP_UNARY: { - wsp_ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); + ggml_compute_forward_unary(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_FLASH_ATTN_BACK: + case GGML_OP_GET_REL_POS: { - int32_t t = wsp_ggml_get_i32_1d(tensor->opt[2], 0); - WSP_GGML_ASSERT(t == 0 || t == 1); - bool masked = t != 0; - wsp_ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor); + ggml_compute_forward_get_rel_pos(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_WIN_PART: + case GGML_OP_ADD_REL_POS: { - wsp_ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor); + ggml_compute_forward_add_rel_pos(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; - case WSP_GGML_OP_WIN_UNPART: + case GGML_OP_MAP_UNARY: { - wsp_ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor); - } break; - case WSP_GGML_OP_MAP_UNARY: + ggml_unary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); + } + break; + case GGML_OP_MAP_BINARY: + { + ggml_binary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM1_F32: + { + ggml_custom1_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun); + } + break; + case GGML_OP_MAP_CUSTOM2_F32: { - const wsp_ggml_unary_op_f32_t fun = *((wsp_ggml_unary_op_f32_t *)tensor->opt[0]->data); - wsp_ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun); + ggml_custom2_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun); } break; - case WSP_GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM3_F32: { - const wsp_ggml_binary_op_f32_t fun = *((wsp_ggml_binary_op_f32_t *)tensor->opt[0]->data); - wsp_ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); + ggml_custom3_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); } break; - case WSP_GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM1: { - const wsp_ggml_custom1_op_f32_t fun = *((wsp_ggml_custom1_op_f32_t *)tensor->opt[0]->data); - wsp_ggml_compute_forward_map_custom1(params, tensor->src0, tensor, fun); + ggml_compute_forward_map_custom1(params, tensor->src[0], tensor); } break; - case WSP_GGML_OP_MAP_CUSTOM2: + case 
GGML_OP_MAP_CUSTOM2: { - const wsp_ggml_custom2_op_f32_t fun = *((wsp_ggml_custom2_op_f32_t *)tensor->opt[0]->data); - wsp_ggml_compute_forward_map_custom2(params, tensor->src0, tensor->src1, tensor, fun); + ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_MAP_CUSTOM3: + case GGML_OP_MAP_CUSTOM3: { - const wsp_ggml_custom3_op_f32_t fun = *((wsp_ggml_custom3_op_f32_t *)tensor->opt[0]->data); - wsp_ggml_compute_forward_map_custom3(params, tensor->src0, tensor->src1, tensor->opt[1], tensor, fun); + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; - case WSP_GGML_OP_CROSS_ENTROPY_LOSS: + case GGML_OP_CROSS_ENTROPY_LOSS: { - wsp_ggml_compute_forward_cross_entropy_loss(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_cross_entropy_loss(params, tensor->src[0], tensor->src[1], tensor); } break; - case WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK: + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - wsp_ggml_compute_forward_cross_entropy_loss_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor); + ggml_compute_forward_cross_entropy_loss_back(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); } break; - case WSP_GGML_OP_NONE: + case GGML_OP_NONE: { // nop } break; - case WSP_GGML_OP_COUNT: + case GGML_OP_COUNT: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } //////////////////////////////////////////////////////////////////////////////// -static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * tensor, bool inplace) { - struct wsp_ggml_tensor * src0 = tensor->src0; - struct wsp_ggml_tensor * src1 = tensor->src1; +static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; switch (tensor->op) { - case WSP_GGML_OP_DUP: + case GGML_OP_DUP: { if (src0->grad) { - src0->grad = wsp_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } } break; - case WSP_GGML_OP_ADD: + case GGML_OP_ADD: { if (src0->grad) { - src0->grad = wsp_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - src1->grad = wsp_ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); + src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); } } break; - case WSP_GGML_OP_ADD1: + case GGML_OP_ADD1: { if (src0->grad) { - src0->grad = wsp_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - src1->grad = wsp_ggml_add_impl(ctx, + src1->grad = ggml_add_impl(ctx, src1->grad, - wsp_ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean + ggml_mean(ctx, tensor->grad), // TODO: should probably be sum instead of mean inplace); } } break; - case WSP_GGML_OP_ACC: + case GGML_OP_ACC: { if (src0->grad) { - src0->grad = wsp_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - WSP_GGML_ASSERT(wsp_ggml_nelements(tensor->opt[0]) == 5); - WSP_GGML_ASSERT(tensor->opt[0]->type == WSP_GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; - const size_t nb3 = (( int32_t 
* ) tensor->opt[0]->data)[2]; - const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; - - struct wsp_ggml_tensor * tensor_grad_view = wsp_ggml_view_4d(ctx, + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; + + struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, tensor->grad, src1->grad->ne[0], src1->grad->ne[1], @@ -15272,234 +16065,178 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ nb1, nb2, nb3, offset); src1->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src1->grad, - wsp_ggml_reshape(ctx, - wsp_ggml_cont(ctx, tensor_grad_view), + ggml_reshape(ctx, + ggml_cont(ctx, tensor_grad_view), src1->grad), inplace); } } break; - case WSP_GGML_OP_SUB: + case GGML_OP_SUB: { if (src0->grad) { - src0->grad = wsp_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - src1->grad = wsp_ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace); + src1->grad = ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace); } } break; - case WSP_GGML_OP_MUL: + case GGML_OP_MUL: { if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src0->grad, - wsp_ggml_mul(ctx, src1, tensor->grad), + ggml_mul(ctx, src1, tensor->grad), inplace); } if (src1->grad) { src1->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src1->grad, - wsp_ggml_mul(ctx, src0, tensor->grad), + ggml_mul(ctx, src0, tensor->grad), inplace); } } break; - case WSP_GGML_OP_DIV: + case GGML_OP_DIV: { if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src0->grad, - wsp_ggml_div(ctx, tensor->grad, src1), + ggml_div(ctx, tensor->grad, src1), inplace); } if (src1->grad) { src1->grad = - wsp_ggml_sub_impl(ctx, + ggml_sub_impl(ctx, src1->grad, - wsp_ggml_mul(ctx, + ggml_mul(ctx, tensor->grad, - wsp_ggml_div(ctx, tensor, src1)), + ggml_div(ctx, tensor, src1)), inplace); } } break; - case WSP_GGML_OP_SQR: + case GGML_OP_SQR: { if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src0->grad, - wsp_ggml_scale(ctx, - wsp_ggml_mul(ctx, src0, tensor->grad), - wsp_ggml_new_f32(ctx, 2.0f)), + ggml_scale(ctx, + ggml_mul(ctx, src0, tensor->grad), + ggml_new_f32(ctx, 2.0f)), inplace); } } break; - case WSP_GGML_OP_SQRT: + case GGML_OP_SQRT: { if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src0->grad, - wsp_ggml_scale(ctx, - wsp_ggml_div(ctx, + ggml_scale(ctx, + ggml_div(ctx, tensor->grad, tensor), - wsp_ggml_new_f32(ctx, 0.5f)), + ggml_new_f32(ctx, 0.5f)), inplace); } } break; - case WSP_GGML_OP_LOG: + case GGML_OP_LOG: { if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src0->grad, - wsp_ggml_div(ctx, + ggml_div(ctx, tensor->grad, src0), inplace); } } break; - case WSP_GGML_OP_SUM: + case GGML_OP_SUM: { if (src0->grad) { src0->grad = - wsp_ggml_add1_impl(ctx, + ggml_add1_impl(ctx, src0->grad, tensor->grad, inplace); } } break; - case WSP_GGML_OP_SUM_ROWS: + case GGML_OP_SUM_ROWS: { if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src0->grad, - wsp_ggml_repeat(ctx, + ggml_repeat(ctx, tensor->grad, src0->grad), inplace); } } break; - case WSP_GGML_OP_MEAN: - case WSP_GGML_OP_ARGMAX: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: { - WSP_GGML_ASSERT(false); // TODO: implement + GGML_ASSERT(false); 
// TODO: implement } break; - case WSP_GGML_OP_REPEAT: + case GGML_OP_REPEAT: { // necessary for llama if (src0->grad) { - src0->grad = wsp_ggml_add_impl(ctx, + src0->grad = ggml_add_impl(ctx, src0->grad, - wsp_ggml_repeat_back(ctx, tensor->grad, src0->grad), + ggml_repeat_back(ctx, tensor->grad, src0->grad), inplace); } } break; - case WSP_GGML_OP_REPEAT_BACK: + case GGML_OP_REPEAT_BACK: { if (src0->grad) { // TODO: test this - src0->grad = wsp_ggml_add_impl(ctx, + src0->grad = ggml_add_impl(ctx, src0->grad, - wsp_ggml_repeat(ctx, tensor->grad, src0->grad), + ggml_repeat(ctx, tensor->grad, src0->grad), inplace); } } break; - case WSP_GGML_OP_ABS: - { - if (src0->grad) { - src0->grad = - wsp_ggml_add_impl(ctx, - src0->grad, - wsp_ggml_mul(ctx, - wsp_ggml_sgn(ctx, src0), - tensor->grad), - inplace); - } - } break; - case WSP_GGML_OP_SGN: - { - if (src0->grad) { - // noop - } - } break; - case WSP_GGML_OP_NEG: - { - if (src0->grad) { - src0->grad = wsp_ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace); - } - } break; - case WSP_GGML_OP_STEP: - { - if (src0->grad) { - // noop - } - } break; - case WSP_GGML_OP_TANH: + case GGML_OP_CONCAT: { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: implement } break; - case WSP_GGML_OP_ELU: + case GGML_OP_SILU_BACK: { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_RELU: + case GGML_OP_NORM: { - if (src0->grad) { - src0->grad = wsp_ggml_sub_impl(ctx, - src0->grad, - wsp_ggml_mul(ctx, - wsp_ggml_step(ctx, src0), - tensor->grad), - inplace); - } - } break; - case WSP_GGML_OP_GELU: - { - WSP_GGML_ASSERT(false); // TODO: not implemented - } break; - case WSP_GGML_OP_GELU_QUICK: - { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_SILU: + case GGML_OP_RMS_NORM: { // necessary for llama if (src0->grad) { - src0->grad = wsp_ggml_add_impl(ctx, + float eps; + memcpy(&eps, tensor->op_params, sizeof(float)); + + src0->grad = ggml_add_impl(ctx, src0->grad, - wsp_ggml_silu_back(ctx, src0, tensor->grad), + ggml_rms_norm_back(ctx, src0, tensor->grad, eps), inplace); } } break; - case WSP_GGML_OP_SILU_BACK: - { - WSP_GGML_ASSERT(false); // TODO: not implemented - } break; - case WSP_GGML_OP_NORM: - { - WSP_GGML_ASSERT(false); // TODO: not implemented - } break; - case WSP_GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: { - // necessary for llama - if (src0->grad) { - src0->grad = wsp_ggml_add_impl(ctx, - src0->grad, - wsp_ggml_rms_norm_back(ctx, src0, tensor->grad), - inplace); - } + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_RMS_NORM_BACK: + case GGML_OP_GROUP_NORM: { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT: { // https://cs231n.github.io/optimization-2/#staged // # forward pass @@ -15519,71 +16256,69 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ // necessary for llama if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src0->grad, - wsp_ggml_out_prod(ctx, // [n,m] + ggml_out_prod(ctx, // [n,m] src1, // [n,p] tensor->grad), // [m,p] inplace); } if (src1->grad) { src1->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src1->grad, - // wsp_ggml_mul_mat(ctx, // [n,p] - // wsp_ggml_cont(ctx, // [m,n] - // wsp_ggml_transpose(ctx, src0)), // [m,n] + // ggml_mul_mat(ctx, // 
[n,p] + // ggml_cont(ctx, // [m,n] + // ggml_transpose(ctx, src0)), // [m,n] // tensor->grad), // [m,p] // // when src0 is bigger than tensor->grad (this is mostly the case in llama), // // avoid transpose of src0, rather transpose smaller tensor->grad - // // and then use wsp_ggml_out_prod - wsp_ggml_out_prod(ctx, // [n,p] + // // and then use ggml_out_prod + ggml_out_prod(ctx, // [n,p] src0, // [n,m] - wsp_ggml_transpose(ctx, // [p,m] + ggml_transpose(ctx, // [p,m] tensor->grad)), // [m,p] inplace); } } break; - case WSP_GGML_OP_OUT_PROD: + case GGML_OP_OUT_PROD: { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_SCALE: + case GGML_OP_SCALE: { // necessary for llama if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src0->grad, - wsp_ggml_scale_impl(ctx, tensor->grad, src1, false), + ggml_scale_impl(ctx, tensor->grad, src1, false), inplace); } if (src1->grad) { src1->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src1->grad, - wsp_ggml_sum(ctx, wsp_ggml_mul_impl(ctx, tensor->grad, src0, false)), + ggml_sum(ctx, ggml_mul_impl(ctx, tensor->grad, src0, false)), inplace); } } break; - case WSP_GGML_OP_SET: + case GGML_OP_SET: { - WSP_GGML_ASSERT(wsp_ggml_nelements(tensor->opt[0]) == 5); - WSP_GGML_ASSERT(tensor->opt[0]->type == WSP_GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->opt[0]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->opt[0]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->opt[0]->data)[2]; - const size_t offset = (( int32_t * ) tensor->opt[0]->data)[3]; + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; - struct wsp_ggml_tensor * tensor_grad_view = NULL; + struct ggml_tensor * tensor_grad_view = NULL; if (src0->grad || src1->grad) { - WSP_GGML_ASSERT(src0->type == tensor->type); - WSP_GGML_ASSERT(tensor->grad->type == tensor->type); - WSP_GGML_ASSERT(tensor->grad->type == src1->grad->type); + GGML_ASSERT(src0->type == tensor->type); + GGML_ASSERT(tensor->grad->type == tensor->type); + GGML_ASSERT(tensor->grad->type == src1->grad->type); - tensor_grad_view = wsp_ggml_view_4d(ctx, + tensor_grad_view = ggml_view_4d(ctx, tensor->grad, src1->grad->ne[0], src1->grad->ne[1], @@ -15593,26 +16328,26 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ } if (src0->grad) { - src0->grad = wsp_ggml_add_impl(ctx, + src0->grad = ggml_add_impl(ctx, src0->grad, - wsp_ggml_acc_impl(ctx, + ggml_acc_impl(ctx, tensor->grad, - wsp_ggml_neg(ctx, tensor_grad_view), + ggml_neg(ctx, tensor_grad_view), nb1, nb2, nb3, offset, false), inplace); } if (src1->grad) { src1->grad = - wsp_ggml_add_impl(ctx, + ggml_add_impl(ctx, src1->grad, - wsp_ggml_reshape(ctx, - wsp_ggml_cont(ctx, tensor_grad_view), + ggml_reshape(ctx, + ggml_cont(ctx, tensor_grad_view), src1->grad), inplace); } } break; - case WSP_GGML_OP_CPY: + case GGML_OP_CPY: { // necessary for llama // cpy overwrites value of src1 by src0 and returns view(src1) @@ -15620,39 +16355,38 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ // tensor = src0 * 1 + src1 * 0 if (src0->grad) { // dsrc0 = dtensor * 1 - src0->grad = wsp_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { // dsrc1 = 
dtensor * 0 -> noop } } break; - case WSP_GGML_OP_CONT: + case GGML_OP_CONT: { // same as cpy if (src0->grad) { - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0->grad)); - WSP_GGML_ASSERT(wsp_ggml_is_contiguous(tensor->grad)); - src0->grad = wsp_ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); + GGML_ASSERT(ggml_is_contiguous(src0->grad)); + GGML_ASSERT(ggml_is_contiguous(tensor->grad)); + src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } } break; - case WSP_GGML_OP_RESHAPE: + case GGML_OP_RESHAPE: { // necessary for llama if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, src0->grad, - wsp_ggml_reshape(ctx, tensor->grad, src0->grad), + ggml_add_impl(ctx, src0->grad, + ggml_reshape(ctx, tensor->grad, src0->grad), inplace); } } break; - case WSP_GGML_OP_VIEW: + case GGML_OP_VIEW: { // necessary for llama if (src0->grad) { size_t offset; - WSP_GGML_ASSERT(sizeof(offset) <= wsp_ggml_nbytes(tensor->opt[0])); - memcpy(&offset, tensor->opt[0]->data, sizeof(offset)); + memcpy(&offset, tensor->op_params, sizeof(offset)); size_t nb1 = tensor->nb[1]; size_t nb2 = tensor->nb[2]; @@ -15660,26 +16394,26 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ if (src0->type != src0->grad->type) { // gradient is typically F32, but src0 could be other type - size_t ng = wsp_ggml_element_size(src0->grad); - size_t n0 = wsp_ggml_element_size(src0); - WSP_GGML_ASSERT(offset % n0 == 0); - WSP_GGML_ASSERT(nb1 % n0 == 0); - WSP_GGML_ASSERT(nb2 % n0 == 0); - WSP_GGML_ASSERT(nb3 % n0 == 0); + size_t ng = ggml_element_size(src0->grad); + size_t n0 = ggml_element_size(src0); + GGML_ASSERT(offset % n0 == 0); + GGML_ASSERT(nb1 % n0 == 0); + GGML_ASSERT(nb2 % n0 == 0); + GGML_ASSERT(nb3 % n0 == 0); offset = (offset / n0) * ng; nb1 = (nb1 / n0) * ng; nb2 = (nb2 / n0) * ng; nb3 = (nb3 / n0) * ng; } - src0->grad = wsp_ggml_acc_impl(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, inplace); + src0->grad = ggml_acc_impl(ctx, src0->grad, tensor->grad, nb1, nb2, nb3, offset, inplace); } } break; - case WSP_GGML_OP_PERMUTE: + case GGML_OP_PERMUTE: { // necessary for llama if (src0->grad) { - int32_t * axes = (int32_t *) tensor->opt[0]->data; + int32_t * axes = (int32_t *) tensor->op_params; int axis0 = axes[0] & 0x3; int axis1 = axes[1] & 0x3; int axis2 = axes[2] & 0x3; @@ -15690,8 +16424,8 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ axes_backward[axis2] = 2; axes_backward[axis3] = 3; src0->grad = - wsp_ggml_add_impl(ctx, src0->grad, - wsp_ggml_permute(ctx, + ggml_add_impl(ctx, src0->grad, + ggml_permute(ctx, tensor->grad, axes_backward[0], axes_backward[1], @@ -15700,169 +16434,194 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ inplace); } } break; - case WSP_GGML_OP_TRANSPOSE: + case GGML_OP_TRANSPOSE: { // necessary for llama if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, src0->grad, - wsp_ggml_transpose(ctx, tensor->grad), + ggml_add_impl(ctx, src0->grad, + ggml_transpose(ctx, tensor->grad), inplace); } } break; - case WSP_GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS: { // necessary for llama (only for tokenizer) if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, src0->grad, - wsp_ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), + ggml_add_impl(ctx, src0->grad, + ggml_get_rows_back(ctx, tensor->grad, src1, src0->grad), inplace); } if (src1->grad) { // noop } } break; - case WSP_GGML_OP_GET_ROWS_BACK: + case GGML_OP_GET_ROWS_BACK: { - WSP_GGML_ASSERT(false); // TODO: 
not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_DIAG: + case GGML_OP_DIAG: { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_DIAG_MASK_INF: + case GGML_OP_DIAG_MASK_INF: { // necessary for llama if (src0->grad) { - assert(src1->type == WSP_GGML_TYPE_I32); - assert(wsp_ggml_nelements(src1) == 2); - const int n_past = ((int32_t *) src1->data)[0]; + const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = - wsp_ggml_add_impl(ctx, src0->grad, - wsp_ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + ggml_add_impl(ctx, src0->grad, + ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), inplace); } - if (src1->grad) { - // noop - } } break; - case WSP_GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_ZERO: { // necessary for llama if (src0->grad) { - assert(src1->type == WSP_GGML_TYPE_I32); - assert(wsp_ggml_nelements(src1) == 2); - const int n_past = ((int32_t *) src1->data)[0]; + const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = - wsp_ggml_add_impl(ctx, src0->grad, - wsp_ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), + ggml_add_impl(ctx, src0->grad, + ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), inplace); } - if (src1->grad) { - // noop - } } break; - case WSP_GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX: { // necessary for llama if (src0->grad) { src0->grad = - wsp_ggml_add_impl(ctx, src0->grad, - wsp_ggml_soft_max_back(ctx, tensor->grad, tensor), + ggml_add_impl(ctx, src0->grad, + ggml_soft_max_back(ctx, tensor->grad, tensor), inplace); } } break; - case WSP_GGML_OP_SOFT_MAX_BACK: + case GGML_OP_SOFT_MAX_BACK: { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_ROPE: + case GGML_OP_ROPE: { // necessary for llama if (src0->grad) { - assert(src1->type == WSP_GGML_TYPE_I32); - assert(wsp_ggml_nelements(src1) == 4); - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - src0->grad = wsp_ggml_add_impl(ctx, + const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + float freq_base; + float freq_scale; + float xpos_base; + bool xpos_down; + memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + + src0->grad = ggml_add_impl(ctx, src0->grad, - wsp_ggml_rope_back(ctx, + ggml_rope_back(ctx, tensor->grad, n_past, n_dims, - mode), + mode, + n_ctx, + freq_base, + freq_scale, + xpos_base, + xpos_down), inplace); } - if (src1->grad) { - // noop - } } break; - case WSP_GGML_OP_ROPE_BACK: + case GGML_OP_ROPE_BACK: { if (src0->grad) { - assert(src1->type == WSP_GGML_TYPE_I32); - assert(wsp_ggml_nelements(src1) == 4); - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; - src0->grad = wsp_ggml_add_impl(ctx, + const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + 
const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; + float freq_base; + float freq_scale; + float xpos_base; + bool xpos_down; + memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); + memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); + memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); + + src0->grad = ggml_add_impl(ctx, src0->grad, - wsp_ggml_rope(ctx, + ggml_rope_impl(ctx, tensor->grad, n_past, n_dims, mode, - n_ctx), + n_ctx, + freq_base, + freq_scale, + xpos_base, + xpos_down, + false), inplace); } - if (src1->grad) { - // noop - } } break; - case WSP_GGML_OP_ALIBI: + case GGML_OP_ALIBI: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CLAMP: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_TRANSPOSE_2D: { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_CLAMP: + case GGML_OP_POOL_1D: { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_CONV_1D: + case GGML_OP_POOL_2D: { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_CONV_2D: + case GGML_OP_UPSCALE: { - WSP_GGML_ASSERT(false); // TODO: not implemented + GGML_ASSERT(false); // TODO: not implemented } break; - case WSP_GGML_OP_FLASH_ATTN: + case GGML_OP_FLASH_ATTN: { - struct wsp_ggml_tensor * flash_grad = NULL; - if (src0->grad || src1->grad || tensor->opt[0]->grad) { - int32_t t = wsp_ggml_get_i32_1d(tensor->opt[1], 0); - WSP_GGML_ASSERT(t == 0 || t == 1); + struct ggml_tensor * flash_grad = NULL; + if (src0->grad || src1->grad || tensor->src[2]->grad) { + int32_t t = ggml_get_op_params_i32(tensor, 0); + GGML_ASSERT(t == 0 || t == 1); bool masked = t != 0; flash_grad = - wsp_ggml_flash_attn_back(ctx, + ggml_flash_attn_back(ctx, src0, src1, - tensor->opt[0], + tensor->src[2], tensor->grad, masked); } if (src0->grad) { - struct wsp_ggml_tensor * grad_q = NULL; + struct ggml_tensor * grad_q = NULL; const size_t nb0 = flash_grad->nb[0]; const size_t offset = 0; switch(src0->n_dims) { case 2: { - grad_q = wsp_ggml_view_2d(ctx, + grad_q = ggml_view_2d(ctx, flash_grad, src0->ne[0], src0->ne[1], @@ -15871,7 +16630,7 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ } break; case 3: { - grad_q = wsp_ggml_view_3d(ctx, + grad_q = ggml_view_3d(ctx, flash_grad, src0->ne[0], src0->ne[1], @@ -15882,7 +16641,7 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ } break; case 4: { - grad_q = wsp_ggml_view_4d(ctx, + grad_q = ggml_view_4d(ctx, flash_grad, src0->ne[0], src0->ne[1], @@ -15895,20 +16654,20 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ } break; } - src0->grad = wsp_ggml_add_impl(ctx, + src0->grad = ggml_add_impl(ctx, src0->grad, grad_q, inplace); } if (src1->grad) { - struct wsp_ggml_tensor * grad_k = NULL; + struct ggml_tensor * grad_k = NULL; const size_t nb0 = flash_grad->nb[0]; const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3]; switch(src1->n_dims) { case 2: { - grad_k = 
wsp_ggml_view_2d(ctx, + grad_k = ggml_view_2d(ctx, flash_grad, src1->ne[0], src1->ne[1], @@ -15917,7 +16676,7 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ } break; case 3: { - grad_k = wsp_ggml_view_3d(ctx, + grad_k = ggml_view_3d(ctx, flash_grad, src1->ne[0], src1->ne[1], @@ -15928,7 +16687,7 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ } break; case 4: { - grad_k = wsp_ggml_view_4d(ctx, + grad_k = ggml_view_4d(ctx, flash_grad, src1->ne[0], src1->ne[1], @@ -15941,23 +16700,23 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ } break; } - src1->grad = wsp_ggml_add_impl(ctx, + src1->grad = ggml_add_impl(ctx, src1->grad, grad_k, inplace); } - struct wsp_ggml_tensor * opt0 = tensor->opt[0]; + struct ggml_tensor * opt0 = tensor->src[2]; if (opt0->grad) { - struct wsp_ggml_tensor * grad_v = NULL; + struct ggml_tensor * grad_v = NULL; const size_t nb0 = flash_grad->nb[0]; const size_t offset = nb0*src0->ne[0]*src0->ne[1]*src0->ne[2]*src0->ne[3] + nb0*src1->ne[0]*src1->ne[1]*src1->ne[2]*src1->ne[3]; switch(opt0->n_dims) { case 2: { - grad_v = wsp_ggml_view_2d(ctx, + grad_v = ggml_view_2d(ctx, flash_grad, opt0->ne[0], opt0->ne[1], @@ -15966,7 +16725,7 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ } break; case 3: { - grad_v = wsp_ggml_view_3d(ctx, + grad_v = ggml_view_3d(ctx, flash_grad, opt0->ne[0], opt0->ne[1], @@ -15977,7 +16736,7 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ } break; case 4: { - grad_v = wsp_ggml_view_4d(ctx, + grad_v = ggml_view_4d(ctx, flash_grad, opt0->ne[0], opt0->ne[1], @@ -15990,108 +16749,199 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_ } break; } - opt0->grad = wsp_ggml_add_impl(ctx, + opt0->grad = ggml_add_impl(ctx, opt0->grad, grad_v, inplace); } } break; - case WSP_GGML_OP_FLASH_FF: + case GGML_OP_FLASH_FF: + { + GGML_ASSERT(false); // not supported + } break; + case GGML_OP_FLASH_ATTN_BACK: { - WSP_GGML_ASSERT(false); // not supported + GGML_ASSERT(false); // not supported } break; - case WSP_GGML_OP_FLASH_ATTN_BACK: + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_UNARY: { - WSP_GGML_ASSERT(false); // not supported + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_ABS: + { + if (src0->grad) { + src0->grad = + ggml_add_impl(ctx, + src0->grad, + ggml_mul(ctx, + ggml_sgn(ctx, src0), + tensor->grad), + inplace); + } + } break; + case GGML_UNARY_OP_SGN: + { + if (src0->grad) { + // noop + } + } break; + case GGML_UNARY_OP_NEG: + { + if (src0->grad) { + src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace); + } + } break; + case GGML_UNARY_OP_STEP: + { + if (src0->grad) { + // noop + } + } break; + case GGML_UNARY_OP_TANH: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_ELU: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_RELU: + { + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_mul(ctx, + ggml_step(ctx, src0), + tensor->grad), + inplace); + } + } break; + case GGML_UNARY_OP_GELU: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_GELU_QUICK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_UNARY_OP_SILU: + { + // necessary for llama + if (src0->grad) { + src0->grad = ggml_add_impl(ctx, + src0->grad, + ggml_silu_back(ctx, src0, tensor->grad), + inplace); + 
} + } break; + default: + GGML_ASSERT(false); + } } break; - case WSP_GGML_OP_WIN_PART: - case WSP_GGML_OP_WIN_UNPART: - case WSP_GGML_OP_MAP_UNARY: - case WSP_GGML_OP_MAP_BINARY: - case WSP_GGML_OP_MAP_CUSTOM1: - case WSP_GGML_OP_MAP_CUSTOM2: - case WSP_GGML_OP_MAP_CUSTOM3: + case GGML_OP_GET_REL_POS: + case GGML_OP_ADD_REL_POS: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: { - WSP_GGML_ASSERT(false); // not supported + GGML_ASSERT(false); // not supported } break; - case WSP_GGML_OP_CROSS_ENTROPY_LOSS: + case GGML_OP_CROSS_ENTROPY_LOSS: { if (src0->grad) { - src0->grad = wsp_ggml_add_impl(ctx, + src0->grad = ggml_add_impl(ctx, src0->grad, - wsp_ggml_cross_entropy_loss_back(ctx, + ggml_cross_entropy_loss_back(ctx, src0, src1, tensor->grad), inplace); } } break; - case WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK: + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - WSP_GGML_ASSERT(false); // not supported + GGML_ASSERT(false); // not supported } break; - case WSP_GGML_OP_NONE: + case GGML_OP_NONE: { // nop } break; - case WSP_GGML_OP_COUNT: + case GGML_OP_COUNT: { - WSP_GGML_ASSERT(false); + GGML_ASSERT(false); } break; } } -static void wsp_ggml_visit_parents(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * node) { - if (node->grad == NULL) { - // this usually happens when we generate intermediate nodes from constants in the backward pass - // it can also happen during forward pass, if the user performs computations with constants - if (node->op != WSP_GGML_OP_NONE) { - //WSP_GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op); - } - } +static_assert(GGML_GRAPH_HASHTABLE_SIZE > GGML_MAX_NODES * 2, "GGML_GRAPH_HT_SIZE is too small"); - // check if already visited - for (int i = 0; i < cgraph->n_nodes; i++) { - if (cgraph->nodes[i] == node) { - return; +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; +} + +static bool hash_insert(void * hash_table[], void * p) { + size_t h = hash(p); + + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // hash table is full + GGML_ASSERT(false); } } - for (int i = 0; i < cgraph->n_leafs; i++) { - if (cgraph->leafs[i] == node) { - return; - } + if (hash_table[i] == p) { + return true; } - if (node->src0) { - wsp_ggml_visit_parents(cgraph, node->src0); + // insert + hash_table[i] = p; + return false; +} + +static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { + if (node->grad == NULL) { + // this usually happens when we generate intermediate nodes from constants in the backward pass + // it can also happen during forward pass, if the user performs computations with constants + if (node->op != GGML_OP_NONE) { + //GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op); + } } - if (node->src1) { - wsp_ggml_visit_parents(cgraph, node->src1); + // check if already visited + if (hash_insert(cgraph->visited_hash_table, node)) { + return; } - for (int i = 0; i < WSP_GGML_MAX_OPT; ++i) { - if (node->opt[i]) { - wsp_ggml_visit_parents(cgraph, node->opt[i]); + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (node->src[i]) { + ggml_visit_parents(cgraph, node->src[i]); } } - if (node->op == WSP_GGML_OP_NONE && node->grad == NULL) { + if 
(node->op == GGML_OP_NONE && node->grad == NULL) { // reached a leaf node, not part of the gradient graph (e.g. a constant) - WSP_GGML_ASSERT(cgraph->n_leafs < WSP_GGML_MAX_NODES); + GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); if (strlen(node->name) == 0) { - wsp_ggml_format_name(node, "leaf_%d", cgraph->n_leafs); + ggml_format_name(node, "leaf_%d", cgraph->n_leafs); } cgraph->leafs[cgraph->n_leafs] = node; cgraph->n_leafs++; } else { - WSP_GGML_ASSERT(cgraph->n_nodes < WSP_GGML_MAX_NODES); + GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); if (strlen(node->name) == 0) { - wsp_ggml_format_name(node, "node_%d", cgraph->n_nodes); + ggml_format_name(node, "node_%d", cgraph->n_nodes); } cgraph->nodes[cgraph->n_nodes] = node; @@ -16100,7 +16950,7 @@ static void wsp_ggml_visit_parents(struct wsp_ggml_cgraph * cgraph, struct wsp_g } } -static void wsp_ggml_build_forward_impl(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * tensor, bool expand) { +static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) { if (!expand) { cgraph->n_nodes = 0; cgraph->n_leafs = 0; @@ -16109,79 +16959,108 @@ static void wsp_ggml_build_forward_impl(struct wsp_ggml_cgraph * cgraph, struct const int n0 = cgraph->n_nodes; UNUSED(n0); - wsp_ggml_visit_parents(cgraph, tensor); + ggml_visit_parents(cgraph, tensor); const int n_new = cgraph->n_nodes - n0; - WSP_GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); + GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); if (n_new > 0) { // the last added node should always be starting point - WSP_GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); + GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); } } -void wsp_ggml_build_forward_expand(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * tensor) { - wsp_ggml_build_forward_impl(cgraph, tensor, true); +void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + ggml_build_forward_impl(cgraph, tensor, true); } -struct wsp_ggml_cgraph wsp_ggml_build_forward(struct wsp_ggml_tensor * tensor) { - struct wsp_ggml_cgraph result = { +struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { + struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ WSP_GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, + /*.hash_table =*/ { NULL }, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, }; - wsp_ggml_build_forward_impl(&result, tensor, false); + ggml_build_forward_impl(&result, tensor, false); return result; } -struct wsp_ggml_cgraph wsp_ggml_build_backward(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * gf, bool keep) { - struct wsp_ggml_cgraph result = *gf; - - WSP_GGML_ASSERT(gf->n_nodes > 0); +void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) { + GGML_ASSERT(gf->n_nodes > 0); // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph if (keep) { for (int i = 0; i < gf->n_nodes; i++) { - struct wsp_ggml_tensor * node = gf->nodes[i]; + struct ggml_tensor * node = gf->nodes[i]; if (node->grad) { - node->grad = wsp_ggml_dup_tensor(ctx, node); + node->grad = ggml_dup_tensor(ctx, node); gf->grads[i] = node->grad; } } } for (int i = gf->n_nodes - 1; i >= 0; i--) { - struct wsp_ggml_tensor * node = gf->nodes[i]; + struct ggml_tensor * node = gf->nodes[i]; 
// because we detached the grad nodes from the original graph, we can afford inplace operations if (node->grad) { - wsp_ggml_compute_backward(ctx, node, keep); + ggml_compute_backward(ctx, node, keep); } } - for (int i = gf->n_nodes - 1; i >= 0; i--) { - struct wsp_ggml_tensor * node = gf->nodes[i]; + for (int i = 0; i < gf->n_nodes; i++) { + struct ggml_tensor * node = gf->nodes[i]; if (node->is_param) { - WSP_GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - wsp_ggml_build_forward_impl(&result, node->grad, true); + GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); + ggml_build_forward_expand(gb, node->grad); } } +} +struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { + struct ggml_cgraph result = *gf; + ggml_build_backward_expand(ctx, gf, &result, keep); return result; } +struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) { + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE); + struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); + + *cgraph = (struct ggml_cgraph) { + /*.n_nodes =*/ 0, + /*.n_leafs =*/ 0, + /*.nodes =*/ { NULL }, + /*.grads =*/ { NULL }, + /*.leafs =*/ { NULL }, + /*.hash_table =*/ { NULL }, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, + }; + + return cgraph; +} + +struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) { + struct ggml_cgraph * cgraph = ggml_new_graph(ctx); + ggml_build_forward_impl(cgraph, tensor, false); + return cgraph; +} + +size_t ggml_graph_overhead(void) { + return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN); +} + // // thread data // @@ -16193,68 +17072,68 @@ struct wsp_ggml_cgraph wsp_ggml_build_backward(struct wsp_ggml_context * ctx, st //#include // -//typedef os_unfair_lock wsp_ggml_lock_t; +//typedef os_unfair_lock ggml_lock_t; // -//#define wsp_ggml_lock_init(x) UNUSED(x) -//#define wsp_ggml_lock_destroy(x) UNUSED(x) -//#define wsp_ggml_lock_lock os_unfair_lock_lock -//#define wsp_ggml_lock_unlock os_unfair_lock_unlock +//#define ggml_lock_init(x) UNUSED(x) +//#define ggml_lock_destroy(x) UNUSED(x) +//#define ggml_lock_lock os_unfair_lock_lock +//#define ggml_lock_unlock os_unfair_lock_unlock // -//#define WSP_GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT +//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT -typedef int wsp_ggml_lock_t; +typedef int ggml_lock_t; -#define wsp_ggml_lock_init(x) UNUSED(x) -#define wsp_ggml_lock_destroy(x) UNUSED(x) -#define wsp_ggml_lock_lock(x) UNUSED(x) -#define wsp_ggml_lock_unlock(x) UNUSED(x) +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) +#define ggml_lock_lock(x) UNUSED(x) +#define ggml_lock_unlock(x) UNUSED(x) -#define WSP_GGML_LOCK_INITIALIZER 0 +#define GGML_LOCK_INITIALIZER 0 -typedef pthread_t wsp_ggml_thread_t; +typedef pthread_t ggml_thread_t; -#define wsp_ggml_thread_create pthread_create -#define wsp_ggml_thread_join pthread_join +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join #else -//typedef pthread_spinlock_t wsp_ggml_lock_t; +//typedef pthread_spinlock_t ggml_lock_t; -//#define wsp_ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define wsp_ggml_lock_destroy pthread_spin_destroy -//#define wsp_ggml_lock_lock pthread_spin_lock -//#define wsp_ggml_lock_unlock pthread_spin_unlock +//#define ggml_lock_init(x) pthread_spin_init(x, 
PTHREAD_PROCESS_PRIVATE) +//#define ggml_lock_destroy pthread_spin_destroy +//#define ggml_lock_lock pthread_spin_lock +//#define ggml_lock_unlock pthread_spin_unlock -typedef int wsp_ggml_lock_t; +typedef int ggml_lock_t; -#define wsp_ggml_lock_init(x) UNUSED(x) -#define wsp_ggml_lock_destroy(x) UNUSED(x) +#define ggml_lock_init(x) UNUSED(x) +#define ggml_lock_destroy(x) UNUSED(x) #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define wsp_ggml_lock_lock(x) _mm_pause() +#define ggml_lock_lock(x) _mm_pause() #else -#define wsp_ggml_lock_lock(x) UNUSED(x) +#define ggml_lock_lock(x) UNUSED(x) #endif -#define wsp_ggml_lock_unlock(x) UNUSED(x) +#define ggml_lock_unlock(x) UNUSED(x) -#define WSP_GGML_LOCK_INITIALIZER 0 +#define GGML_LOCK_INITIALIZER 0 -typedef pthread_t wsp_ggml_thread_t; +typedef pthread_t ggml_thread_t; -#define wsp_ggml_thread_create pthread_create -#define wsp_ggml_thread_join pthread_join +#define ggml_thread_create pthread_create +#define ggml_thread_join pthread_join #endif // Android's libc implementation "bionic" does not support setting affinity #if defined(__linux__) && !defined(__BIONIC__) -void set_numa_thread_affinity(int thread_n, int n_threads) { - if (!wsp_ggml_is_numa()) { +static void set_numa_thread_affinity(int thread_n, int n_threads) { + if (!ggml_is_numa()) { return; } // run thread on node_num thread_n / (threads per node) const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); - struct wsp_ggml_numa_node * node = &g_state.numa.nodes[node_num]; + struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); @@ -16272,8 +17151,8 @@ void set_numa_thread_affinity(int thread_n, int n_threads) { CPU_FREE(cpus); } -void clear_numa_thread_affinity(void) { - if (!wsp_ggml_is_numa()) { +static void clear_numa_thread_affinity(void) { + if (!ggml_is_numa()) { return; } @@ -16296,100 +17175,118 @@ void clear_numa_thread_affinity(void) { #else // TODO: Windows etc. 
// (the linux implementation may also work on BSD, someone should test) -void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } -void clear_numa_thread_affinity(void) {} +static void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +static void clear_numa_thread_affinity(void) {} #endif -struct wsp_ggml_compute_state_shared { - struct wsp_ggml_cgraph * cgraph; +struct ggml_compute_state_shared { + const struct ggml_cgraph * cgraph; + const struct ggml_cplan * cplan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - int n_threads; + const int n_threads; // synchronization primitives atomic_int n_active; // num active threads atomic_int node_n; // active graph node + + bool (*abort_callback)(void * data); // abort ggml_graph_compute when true + void * abort_callback_data; }; -struct wsp_ggml_compute_state { - wsp_ggml_thread_t thrd; +struct ggml_compute_state { + ggml_thread_t thrd; int ith; - struct wsp_ggml_compute_state_shared * shared; + struct ggml_compute_state_shared * shared; }; -static void wsp_ggml_graph_compute_perf_stats_node(struct wsp_ggml_tensor * node, const struct wsp_ggml_compute_state_shared * st) { - int64_t cycles_cur = wsp_ggml_perf_cycles() - st->perf_node_start_cycles; - int64_t time_us_cur = wsp_ggml_perf_time_us() - st->perf_node_start_time_us; +static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { + int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; node->perf_runs++; node->perf_cycles += cycles_cur; node->perf_time_us += time_us_cur; } -static thread_ret_t wsp_ggml_graph_compute_thread(void * data) { - struct wsp_ggml_compute_state * state = (struct wsp_ggml_compute_state *) data; - struct wsp_ggml_cgraph * cgraph = state->shared->cgraph; +static thread_ret_t ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; - const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); int node_n = -1; while (true) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->shared->node_n += 1; + return (thread_ret_t) GGML_EXIT_ABORTED; + } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again - struct wsp_ggml_compute_params params = { - /*.type =*/ WSP_GGML_TASK_FINALIZE, + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ cgraph->work ? wsp_ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? 
cgraph->work->data : NULL, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (node_n != -1) { /* FINALIZE */ - struct wsp_ggml_tensor * node = state->shared->cgraph->nodes[node_n]; - if (WSP_GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = node->n_tasks; - wsp_ggml_compute_forward(¶ms, node); - wsp_ggml_graph_compute_perf_stats_node(node, state->shared); + struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.nth = n_tasks_arr[node_n]; + ggml_compute_forward(¶ms, node); } + ggml_graph_compute_perf_stats_node(node, state->shared); } // distribute new work or execute it direct if 1T while (++node_n < cgraph->n_nodes) { - WSP_GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); + GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); - struct wsp_ggml_tensor * node = cgraph->nodes[node_n]; + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; - state->shared->perf_node_start_cycles = wsp_ggml_perf_cycles(); - state->shared->perf_node_start_time_us = wsp_ggml_perf_time_us(); + state->shared->perf_node_start_cycles = ggml_perf_cycles(); + state->shared->perf_node_start_time_us = ggml_perf_time_us(); - params.nth = node->n_tasks; + params.nth = n_tasks; /* INIT */ - if (WSP_GGML_OP_HAS_INIT[node->op]) { - params.type = WSP_GGML_TASK_INIT; - wsp_ggml_compute_forward(¶ms, node); + if (GGML_OP_HAS_INIT[node->op]) { + params.type = GGML_TASK_INIT; + ggml_compute_forward(¶ms, node); } - if (node->n_tasks == 1) { + if (n_tasks == 1) { // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) - params.type = WSP_GGML_TASK_COMPUTE; - wsp_ggml_compute_forward(¶ms, node); + params.type = GGML_TASK_COMPUTE; + ggml_compute_forward(¶ms, node); - if (WSP_GGML_OP_HAS_FINALIZE[node->op]) { - params.type = WSP_GGML_TASK_FINALIZE; - wsp_ggml_compute_forward(¶ms, node); - wsp_ggml_graph_compute_perf_stats_node(node, state->shared); + if (GGML_OP_HAS_FINALIZE[node->op]) { + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(¶ms, node); } + + ggml_graph_compute_perf_stats_node(node, state->shared); } else { break; } + + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + break; + } } atomic_store(&state->shared->n_active, n_threads); @@ -16397,459 +17294,558 @@ static thread_ret_t wsp_ggml_graph_compute_thread(void * data) { } else { // wait for other threads to finish const int last = node_n; - do { + while (true) { + // TODO: this sched_yield can have significant impact on the performance - either positive or negative + // depending on the workload and the operating system. 
+ // since it is not clear what is the best approach, it should potentially become user-configurable + // ref: https://github.com/ggerganov/ggml/issues/291 +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) sched_yield(); +#endif + node_n = atomic_load(&state->shared->node_n); - } while (node_n == last); + if (node_n != last) break; + }; } // check if we should stop if (node_n >= cgraph->n_nodes) break; /* COMPUTE */ - struct wsp_ggml_tensor * node = cgraph->nodes[node_n]; + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; - struct wsp_ggml_compute_params params = { - /*.type =*/ WSP_GGML_TASK_COMPUTE, + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? wsp_ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.nth =*/ n_tasks, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; - if (state->ith < node->n_tasks) { - wsp_ggml_compute_forward(¶ms, node); + if (state->ith < n_tasks) { + ggml_compute_forward(¶ms, node); } } - return 0; + return GGML_EXIT_SUCCESS; } -void wsp_ggml_graph_compute(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; - - struct wsp_ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - }; - struct wsp_ggml_compute_state * workers = alloca(sizeof(struct wsp_ggml_compute_state)*n_threads); - - // initialize tasks + work buffer - { - size_t work_size = 0; +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct wsp_ggml_tensor * node = cgraph->nodes[i]; + size_t work_size = 0; - switch (node->op) { - case WSP_GGML_OP_CPY: - case WSP_GGML_OP_DUP: - { - node->n_tasks = n_threads; + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); - size_t cur = 0; - if (wsp_ggml_is_quantized(node->type)) { - cur = WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_F32] * node->ne[0] * n_threads; - } + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_ADD: - case WSP_GGML_OP_ADD1: - { - node->n_tasks = n_threads; + struct ggml_tensor * node = cgraph->nodes[i]; - size_t cur = 0; + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; - if (wsp_ggml_is_quantized(node->src0->type)) { - cur = WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_F32] * node->src0->ne[0] * n_threads; - } + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; + } - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_ACC: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - if (wsp_ggml_is_quantized(node->src0->type)) { - cur = WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_F32] * node->src1->ne[0] * n_threads; - } + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * 
n_tasks; + } - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_SUB: - case WSP_GGML_OP_DIV: - case WSP_GGML_OP_SQR: - case WSP_GGML_OP_SQRT: - case WSP_GGML_OP_LOG: - case WSP_GGML_OP_SUM: - case WSP_GGML_OP_SUM_ROWS: - case WSP_GGML_OP_MEAN: - case WSP_GGML_OP_ARGMAX: - case WSP_GGML_OP_REPEAT: - case WSP_GGML_OP_REPEAT_BACK: - case WSP_GGML_OP_ABS: - case WSP_GGML_OP_SGN: - case WSP_GGML_OP_NEG: - case WSP_GGML_OP_STEP: - case WSP_GGML_OP_TANH: - case WSP_GGML_OP_ELU: - case WSP_GGML_OP_RELU: - { - node->n_tasks = 1; - } break; - case WSP_GGML_OP_MUL: - case WSP_GGML_OP_GELU: - case WSP_GGML_OP_GELU_QUICK: - case WSP_GGML_OP_SILU: - case WSP_GGML_OP_SILU_BACK: - case WSP_GGML_OP_NORM: - case WSP_GGML_OP_RMS_NORM: - case WSP_GGML_OP_RMS_NORM_BACK: - { - node->n_tasks = n_threads; - } break; - case WSP_GGML_OP_MUL_MAT: - case WSP_GGML_OP_OUT_PROD: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; - // TODO: use different scheduling for different matrix sizes - //const int nr0 = wsp_ggml_nrows(node->src0); - //const int nr1 = wsp_ggml_nrows(node->src1); + size_t cur = 0; - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; + } - size_t cur = 0; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + { + n_tasks = 1; + } break; + + case GGML_OP_UNARY: + { + switch (ggml_get_unary_op(node)) { + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_NEG: + case GGML_UNARY_OP_STEP: + case GGML_UNARY_OP_TANH: + case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_RELU: + { + n_tasks = 1; + } break; -#if defined(WSP_GGML_USE_CUBLAS) - if (wsp_ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } - else -#elif defined(WSP_GGML_USE_CLBLAST) - if (wsp_ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = wsp_ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else -#endif - if (node->src0->type == WSP_GGML_TYPE_F16 && node->src1->type == WSP_GGML_TYPE_F32) { -#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) - if (wsp_ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - // here we need memory just for single 2D matrix from src0 - cur = WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - cur = WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_F16]*wsp_ggml_nelements(node->src1); - } -#else - cur = WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_F16]*wsp_ggml_nelements(node->src1); -#endif - } else if (node->src0->type == WSP_GGML_TYPE_F32 && node->src1->type == WSP_GGML_TYPE_F32) { - cur = 0; -#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) - if (wsp_ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - } + case GGML_UNARY_OP_GELU: + case 
GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_SILU: + { + n_tasks = n_threads; + } break; + } + } break; + case GGML_OP_SILU_BACK: + case GGML_OP_MUL: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + case GGML_OP_GROUP_NORM: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONCAT: + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_nrows(node->src[0]); + //const int nr1 = ggml_nrows(node->src[1]); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; + +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } else +#elif defined(GGML_USE_CLBLAST) + if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); + } else #endif - } else if (wsp_ggml_is_quantized(node->src0->type) && node->src1->type == WSP_GGML_TYPE_F32) { -#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) - if (wsp_ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src[0]->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); + } + } else #endif - { - const enum wsp_ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; - cur = WSP_GGML_TYPE_SIZE[type_q]*wsp_ggml_nelements(node->src1)/WSP_GGML_BLCK_SIZE[type_q]; - } - } else { - WSP_GGML_ASSERT(false); - } + if (node->src[1]->type != vec_dot_type) { + cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type); + } else { + cur = 0; + } - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_SCALE: - { - node->n_tasks = 1; - } break; - case WSP_GGML_OP_SET: - case WSP_GGML_OP_CONT: - case WSP_GGML_OP_RESHAPE: - case WSP_GGML_OP_VIEW: - case WSP_GGML_OP_PERMUTE: - case WSP_GGML_OP_TRANSPOSE: - case WSP_GGML_OP_GET_ROWS: - case WSP_GGML_OP_GET_ROWS_BACK: - case WSP_GGML_OP_DIAG: - case WSP_GGML_OP_DIAG_MASK_ZERO: - { - node->n_tasks = 1; - } break; - case WSP_GGML_OP_DIAG_MASK_INF: - case WSP_GGML_OP_SOFT_MAX: - case WSP_GGML_OP_SOFT_MAX_BACK: - case WSP_GGML_OP_ROPE: - case WSP_GGML_OP_ROPE_BACK: - { - node->n_tasks = n_threads; - } break; - case WSP_GGML_OP_ALIBI: - { - node->n_tasks = 1; //TODO - } break; - case WSP_GGML_OP_CLAMP: - { - node->n_tasks = 1; //TODO - } break; - case WSP_GGML_OP_CONV_1D: - { - node->n_tasks = n_threads; - - WSP_GGML_ASSERT(node->src0->ne[3] == 1); - WSP_GGML_ASSERT(node->src1->ne[2] == 1); - WSP_GGML_ASSERT(node->src1->ne[3] == 1); - - size_t cur = 0; - const int nk = node->src0->ne[0]; - - if (node->src0->type == WSP_GGML_TYPE_F16 && - node->src1->type == 
WSP_GGML_TYPE_F32) { - cur = sizeof(wsp_ggml_fp16_t)*( - nk*wsp_ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else if (node->src0->type == WSP_GGML_TYPE_F32 && - node->src1->type == WSP_GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*wsp_ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else { - WSP_GGML_ASSERT(false); - } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + case GGML_OP_ADD_REL_POS: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src[0]->ne[0]; + + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src[0]->ne[1])*node->src[0]->ne[2] + + ( 2*(nk/2) + node->src[1]->ne[0])*node->src[1]->ne[1] + ); + } else { + GGML_ASSERT(false); + } - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_CONV_2D: - { - node->n_tasks = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; + + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // C + const int64_t ne03 = node->src[0]->ne[3]; // N + + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // C + + const int64_t ne0 = node->ne[0]; + const int64_t ne1 = node->ne[1]; + const int64_t ne2 = node->ne[2]; + const int64_t nk = ne00*ne01; + const int64_t ew0 = nk * ne02; + + UNUSED(ne03); + UNUSED(ne2); + + size_t cur = 0; + + if (node->src[0]->type == GGML_TYPE_F16 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } - WSP_GGML_ASSERT(node->src1->ne[3] == 1); + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_TRANSPOSE_2D: + { + n_tasks = n_threads; - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // Channels Out + const int64_t ne03 = node->src[0]->ne[3]; // Channels In - const int64_t ne10 = node->src1->ne[0]; 
// W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // Channels In - const int64_t nk = ne00*ne01; + size_t cur = 0; + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; + cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; - UNUSED(ne02); - UNUSED(ne03); - UNUSED(nk); + work_size = MAX(work_size, cur); + } break; + case GGML_OP_POOL_1D: + case GGML_OP_POOL_2D: + { + n_tasks = 1; + } break; + case GGML_OP_UPSCALE: + { + n_tasks = n_threads; + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - if (node->src0->type == WSP_GGML_TYPE_F16 && - node->src1->type == WSP_GGML_TYPE_F32) { - cur = sizeof(wsp_ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == WSP_GGML_TYPE_F32 && - node->src1->type == WSP_GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - WSP_GGML_ASSERT(false); - } + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_FLASH_ATTN: - { - node->n_tasks = n_threads; + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - size_t cur = 0; + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - const int64_t ne11 = wsp_ggml_up(node->src1->ne[1], WSP_GGML_SOFT_MAX_UNROLL); + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; - if (node->src1->type == WSP_GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } + size_t cur = 0; - if (node->src1->type == WSP_GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_FLASH_FF: - { - node->n_tasks = n_threads; + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } - size_t cur = 0; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; - if (node->src1->type == WSP_GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } + size_t cur = 0; - if (node->src1->type == WSP_GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S 
and SM in ggml_compute_forward_flash_attn_back + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_FLASH_ATTN_BACK: - { - node->n_tasks = n_threads; + if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_GET_REL_POS: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1_F32: + case GGML_OP_MAP_CUSTOM2_F32: + case GGML_OP_MAP_CUSTOM3_F32: + { + n_tasks = 1; + } break; + case GGML_OP_MAP_CUSTOM1: + { + struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM2: + { + struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_MAP_CUSTOM3: + { + struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params; + if (p->n_tasks == GGML_N_TASKS_MAX) { + n_tasks = n_threads; + } else { + n_tasks = MIN(p->n_tasks, n_threads); + } + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = wsp_ggml_up(node->src1->ne[1], WSP_GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in wsp_ggml_compute_forward_flash_attn_back - if (node->src1->type == WSP_GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; + } - if (node->src1->type == WSP_GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 - } + cplan.n_tasks[i] = n_tasks; + } - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_WIN_PART: - case WSP_GGML_OP_WIN_UNPART: - case WSP_GGML_OP_MAP_UNARY: - case WSP_GGML_OP_MAP_BINARY: - case WSP_GGML_OP_MAP_CUSTOM1: - case WSP_GGML_OP_MAP_CUSTOM2: - case WSP_GGML_OP_MAP_CUSTOM3: - { - node->n_tasks = 1; - } break; - case WSP_GGML_OP_CROSS_ENTROPY_LOSS: - { - node->n_tasks = n_threads; + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } - size_t cur = wsp_ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - node->n_tasks = n_threads; + return cplan; +} - size_t cur = wsp_ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; +int 
ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { + { + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); - work_size = MAX(work_size, cur); - } break; - case WSP_GGML_OP_NONE: - { - node->n_tasks = 1; - } break; - case WSP_GGML_OP_COUNT: - { - WSP_GGML_ASSERT(false); - } break; - } + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); } - if (cgraph->work != NULL && work_size > cgraph->work_size) { - WSP_GGML_ASSERT(false); // TODO: better handling + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(cplan->n_tasks[i] > 0); + } } + } - if (work_size > 0 && cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); + const int n_threads = cplan->n_threads; - WSP_GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I8, cgraph->work_size); - } - } + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_plan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + /*.abort_callback =*/ NULL, + /*.abort_callback_data =*/ NULL, + }; + struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { - workers[j] = (struct wsp_ggml_compute_state) { + workers[j] = (struct ggml_compute_state) { .thrd = 0, .ith = j, .shared = &state_shared, }; - const int rc = wsp_ggml_thread_create(&workers[j].thrd, NULL, wsp_ggml_graph_compute_thread, &workers[j]); - WSP_GGML_ASSERT(rc == 0); + const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + UNUSED(rc); } } + workers[0].ith = 0; workers[0].shared = &state_shared; - const int64_t perf_start_cycles = wsp_ggml_perf_cycles(); - const int64_t perf_start_time_us = wsp_ggml_perf_time_us(); + const int64_t perf_start_cycles = ggml_perf_cycles(); + const int64_t perf_start_time_us = ggml_perf_time_us(); // this is a work thread too - wsp_ggml_graph_compute_thread(&workers[0]); + int compute_status = (size_t) ggml_graph_compute_thread(&workers[0]); // don't leave affinity set on the main thread clear_numa_thread_affinity(); - // join thread pool + // join or kill thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; j++) { - const int rc = wsp_ggml_thread_join(workers[j].thrd, NULL); - WSP_GGML_ASSERT(rc == 0); + const int rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == 0); } } // performance stats (graph) { - int64_t perf_cycles_cur = wsp_ggml_perf_cycles() - perf_start_cycles; - int64_t perf_time_us_cur = wsp_ggml_perf_time_us() - perf_start_time_us; + int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; + int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us; cgraph->perf_runs++; cgraph->perf_cycles += perf_cycles_cur; cgraph->perf_time_us += perf_time_us_cur; - WSP_GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", + GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", __func__, cgraph->perf_runs, - (double) perf_cycles_cur / (double) wsp_ggml_cycles_per_ms(), - (double) cgraph->perf_cycles / (double) wsp_ggml_cycles_per_ms() / (double) cgraph->perf_runs, + (double) perf_cycles_cur / (double) 
ggml_cycles_per_ms(), + (double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs, (double) perf_time_us_cur / 1000.0, (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs); } + + return compute_status; } -void wsp_ggml_graph_reset(struct wsp_ggml_cgraph * cgraph) { +void ggml_graph_reset(struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { - struct wsp_ggml_tensor * grad = cgraph->grads[i]; + struct ggml_tensor * grad = cgraph->grads[i]; if (grad) { - wsp_ggml_set_zero(grad); + ggml_set_zero(grad); } } } -struct wsp_ggml_tensor * wsp_ggml_graph_get_tensor(struct wsp_ggml_cgraph * cgraph, const char * name) { +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + + ggml_graph_compute(cgraph, &cplan); +} + +struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { - struct wsp_ggml_tensor * leaf = cgraph->leafs[i]; + struct ggml_tensor * leaf = cgraph->leafs[i]; if (strcmp(leaf->name, name) == 0) { return leaf; @@ -16857,7 +17853,7 @@ struct wsp_ggml_tensor * wsp_ggml_graph_get_tensor(struct wsp_ggml_cgraph * cgra } for (int i = 0; i < cgraph->n_nodes; i++) { - struct wsp_ggml_tensor * node = cgraph->nodes[i]; + struct ggml_tensor * node = cgraph->nodes[i]; if (strcmp(node->name, name) == 0) { return node; @@ -16867,13 +17863,13 @@ struct wsp_ggml_tensor * wsp_ggml_graph_get_tensor(struct wsp_ggml_cgraph * cgra return NULL; } -static void wsp_ggml_graph_export_leaf(const struct wsp_ggml_tensor * tensor, FILE * fout) { +static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", - wsp_ggml_type_name(tensor->type), - wsp_ggml_op_name (tensor->op), + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], @@ -16881,32 +17877,28 @@ static void wsp_ggml_graph_export_leaf(const struct wsp_ggml_tensor * tensor, FI tensor->name); } -static void wsp_ggml_graph_export_node(const struct wsp_ggml_tensor * tensor, const char * arg, FILE * fout) { +static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", arg, - wsp_ggml_type_name(tensor->type), - wsp_ggml_op_name (tensor->op), + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, tensor->data, tensor->name); } -void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * fname) { - //assert(cgraph->work == NULL); - //assert(cgraph->work_size == 0); - +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { uint64_t size_eval = 0; // compute size of intermediate results // TODO: 
does not take into account scratch buffers !!!! for (int i = 0; i < cgraph->n_nodes; ++i) { - size_eval += wsp_ggml_nbytes(cgraph->nodes[i]); + size_eval += ggml_nbytes_pad(cgraph->nodes[i]); } // print @@ -16914,8 +17906,8 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f FILE * fout = stdout; fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", WSP_GGML_FILE_MAGIC); - fprintf(fout, "%-16s %8d\n", "version", WSP_GGML_FILE_VERSION); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); fprintf(fout, "%-16s %" PRIu64 "\n", "eval", size_eval); @@ -16926,11 +17918,11 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); for (int i = 0; i < cgraph->n_leafs; ++i) { - wsp_ggml_graph_export_leaf(cgraph->leafs[i], fout); + ggml_graph_export_leaf(cgraph->leafs[i], fout); - WSP_GGML_ASSERT(cgraph->leafs[i]->op == WSP_GGML_OP_NONE); - WSP_GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); - WSP_GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src[0] == NULL); + GGML_ASSERT(cgraph->leafs[i]->src[1] == NULL); } // header @@ -16939,19 +17931,11 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); for (int i = 0; i < cgraph->n_nodes; ++i) { - wsp_ggml_graph_export_node(cgraph->nodes[i], "DST", fout); - - if (cgraph->nodes[i]->src0) { - wsp_ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); - } + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); - if (cgraph->nodes[i]->src1) { - wsp_ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); - } - - for (int j = 0; j < WSP_GGML_MAX_OPT; ++j) { - if (cgraph->nodes[i]->opt[j]) { - wsp_ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + for (int j = 0; j < GGML_MAX_SRC; ++j) { + if (cgraph->nodes[i]->src[j]) { + ggml_graph_export_node(cgraph->nodes[i]->src[j], "SRC", fout); } } @@ -16972,8 +17956,8 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f // header { - const uint32_t magic = WSP_GGML_FILE_MAGIC; - const uint32_t version = WSP_GGML_FILE_VERSION; + const uint32_t magic = GGML_FILE_MAGIC; + const uint32_t version = GGML_FILE_VERSION; const uint32_t n_leafs = cgraph->n_leafs; const uint32_t nodes = cgraph->n_nodes; @@ -16987,7 +17971,7 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f // leafs { for (int i = 0; i < cgraph->n_leafs; ++i) { - const struct wsp_ggml_tensor * tensor = cgraph->leafs[i]; + const struct ggml_tensor * tensor = cgraph->leafs[i]; const uint32_t type = tensor->type; const uint32_t op = tensor->op; @@ -16997,7 +17981,7 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f fwrite(&op, sizeof(uint32_t), 1, fout); fwrite(&n_dims, sizeof(uint32_t), 1, fout); - for (int j = 0; j < WSP_GGML_MAX_DIMS; ++j) { + for (int j = 0; j < GGML_MAX_DIMS; ++j) { const uint64_t ne = tensor->ne[j]; const uint64_t nb = tensor->nb[j]; @@ -17005,12 +17989,13 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f fwrite(&nb, sizeof(uint64_t), 1, 
fout); } - fwrite(tensor->name, sizeof(char), WSP_GGML_MAX_NAME, fout); + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout); // dump the data // TODO: pad this to 32 byte boundary { - const size_t size = wsp_ggml_nbytes(tensor); + const size_t size = ggml_nbytes(tensor); fwrite(tensor->data, sizeof(char), size, fout); } @@ -17020,7 +18005,7 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f // nodes { for (int i = 0; i < cgraph->n_nodes; ++i) { - const struct wsp_ggml_tensor * tensor = cgraph->nodes[i]; + const struct ggml_tensor * tensor = cgraph->nodes[i]; const uint32_t type = tensor->type; const uint32_t op = tensor->op; @@ -17030,7 +18015,7 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f fwrite(&op, sizeof(uint32_t), 1, fout); fwrite(&n_dims, sizeof(uint32_t), 1, fout); - for (int j = 0; j < WSP_GGML_MAX_DIMS; ++j) { + for (int j = 0; j < GGML_MAX_DIMS; ++j) { const uint64_t ne = tensor->ne[j]; const uint64_t nb = tensor->nb[j]; @@ -17038,20 +18023,18 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f fwrite(&nb, sizeof(uint64_t), 1, fout); } - fwrite(tensor->name, sizeof(char), WSP_GGML_MAX_NAME, fout); + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + fwrite(tensor->op_params, sizeof(char), GGML_MAX_OP_PARAMS, fout); // output the op arguments { - struct wsp_ggml_tensor * args[2 + WSP_GGML_MAX_OPT] = { NULL }; + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; - args[0] = tensor->src0; - args[1] = tensor->src1; - - for (int j = 0; j < WSP_GGML_MAX_OPT; ++j) { - args[2 + j] = tensor->opt[j]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + args[j] = tensor->src[j]; } - for (int j = 0; j < 2 + WSP_GGML_MAX_OPT; ++j) { + for (int j = 0; j < GGML_MAX_SRC; ++j) { if (args[j]) { int32_t idx = -1; @@ -17069,7 +18052,7 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f if (idx == -1) { for (int k = 0; k < cgraph->n_nodes; ++k) { if (args[j] == cgraph->nodes[k]) { - idx = WSP_GGML_MAX_NODES + k; + idx = GGML_MAX_NODES + k; break; } } @@ -17095,13 +18078,13 @@ void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * f } } -struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml_context ** ctx_data, struct wsp_ggml_context ** ctx_eval) { +struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { assert(*ctx_data == NULL); assert(*ctx_eval == NULL); - struct wsp_ggml_cgraph result = { 0 }; + struct ggml_cgraph result = { 0 }; - struct wsp_ggml_tensor * data = NULL; + struct ggml_tensor * data = NULL; // read file into data { @@ -17119,15 +18102,15 @@ struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml // create the data context { - const size_t overhead = 1*wsp_ggml_tensor_overhead(); + const size_t overhead = 1*ggml_tensor_overhead(); - struct wsp_ggml_init_params params = { + struct ggml_init_params params = { .mem_size = fsize + overhead, .mem_buffer = NULL, .no_alloc = false, }; - *ctx_data = wsp_ggml_init(params); + *ctx_data = ggml_init(params); if (!*ctx_data) { fprintf(stderr, "%s: failed to create ggml context\n", __func__); @@ -17136,7 +18119,7 @@ struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml } } - data = wsp_ggml_new_tensor_1d(*ctx_data, WSP_GGML_TYPE_I8, fsize); + data = 
ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); { const size_t ret = fread(data->data, sizeof(char), fsize, fin); @@ -17156,14 +18139,14 @@ struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); - if (magic != WSP_GGML_FILE_MAGIC) { + if (magic != GGML_FILE_MAGIC) { fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); return result; } const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); - if (version != WSP_GGML_FILE_VERSION) { + if (version != GGML_FILE_VERSION) { fprintf(stderr, "%s: invalid version number\n", __func__); return result; } @@ -17177,15 +18160,15 @@ struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml // create the data context { - const size_t overhead = (n_leafs + n_nodes)*wsp_ggml_tensor_overhead(); + const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead(); - struct wsp_ggml_init_params params = { + struct ggml_init_params params = { .mem_size = size_eval + overhead, .mem_buffer = NULL, .no_alloc = true, }; - *ctx_eval = wsp_ggml_init(params); + *ctx_eval = ggml_init(params); if (!*ctx_eval) { fprintf(stderr, "%s: failed to create ggml context\n", __func__); @@ -17204,10 +18187,10 @@ struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml op = *(const uint32_t *) ptr; ptr += sizeof(op); n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); - int64_t ne[WSP_GGML_MAX_DIMS]; - size_t nb[WSP_GGML_MAX_DIMS]; + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; - for (int j = 0; j < WSP_GGML_MAX_DIMS; ++j) { + for (int j = 0; j < GGML_MAX_DIMS; ++j) { uint64_t ne_cur; uint64_t nb_cur; @@ -17218,27 +18201,28 @@ struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml nb[j] = nb_cur; } - struct wsp_ggml_tensor * tensor = wsp_ggml_new_tensor(*ctx_eval, (enum wsp_ggml_type) type, n_dims, ne); + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); - tensor->op = (enum wsp_ggml_op) op; + tensor->op = (enum ggml_op) op; - memcpy(tensor->name, ptr, WSP_GGML_MAX_NAME); ptr += WSP_GGML_MAX_NAME; + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + memcpy(tensor->op_params, ptr, GGML_MAX_OP_PARAMS); ptr += GGML_MAX_OP_PARAMS; tensor->data = (void *) ptr; - for (int j = 0; j < WSP_GGML_MAX_DIMS; ++j) { + for (int j = 0; j < GGML_MAX_DIMS; ++j) { tensor->nb[j] = nb[j]; } result.leafs[i] = tensor; - ptr += wsp_ggml_nbytes(tensor); + ptr += ggml_nbytes(tensor); - fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, wsp_ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); } } - wsp_ggml_set_no_alloc(*ctx_eval, false); + ggml_set_no_alloc(*ctx_eval, false); // nodes { @@ -17251,12 +18235,12 @@ struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml op = *(const uint32_t *) ptr; ptr += sizeof(op); n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); - enum wsp_ggml_op eop = (enum wsp_ggml_op) op; + enum ggml_op eop = (enum ggml_op) op; - int64_t ne[WSP_GGML_MAX_DIMS]; - size_t nb[WSP_GGML_MAX_DIMS]; + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; - for (int j = 0; j < WSP_GGML_MAX_DIMS; ++j) { + for (int j = 0; j < GGML_MAX_DIMS; ++j) { uint64_t ne_cur; uint64_t nb_cur; @@ -17267,24 +18251,25 @@ struct wsp_ggml_cgraph 
wsp_ggml_graph_import(const char * fname, struct wsp_ggml nb[j] = nb_cur; } - const char * ptr_name = ptr; ptr += WSP_GGML_MAX_NAME; + const char * ptr_name = ptr; ptr += GGML_MAX_NAME; + const char * ptr_op_params = ptr; ptr += GGML_MAX_OP_PARAMS; - const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += (2 + WSP_GGML_MAX_OPT)*sizeof(int32_t); + const int32_t * ptr_arg_idx = (const int32_t *) ptr; ptr += GGML_MAX_SRC*sizeof(int32_t); - struct wsp_ggml_tensor * args[2 + WSP_GGML_MAX_OPT] = { NULL }; + struct ggml_tensor * args[GGML_MAX_SRC] = { NULL }; // parse args - for (int j = 0; j < 2 + WSP_GGML_MAX_OPT; ++j) { + for (int j = 0; j < GGML_MAX_SRC; ++j) { const int32_t arg_idx = ptr_arg_idx[j]; if (arg_idx == -1) { continue; } - if (arg_idx < WSP_GGML_MAX_NODES) { + if (arg_idx < GGML_MAX_NODES) { args[j] = result.leafs[arg_idx]; } else { - args[j] = result.nodes[arg_idx - WSP_GGML_MAX_NODES]; + args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; } } @@ -17292,55 +18277,53 @@ struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml // "view" operations are handled differently // TODO: handle inplace ops - currently a copy is always made - struct wsp_ggml_tensor * tensor = NULL; + struct ggml_tensor * tensor = NULL; switch (eop) { // TODO: implement other view ops - case WSP_GGML_OP_RESHAPE: + case GGML_OP_RESHAPE: { - tensor = wsp_ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]); + tensor = ggml_reshape_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3]); } break; - case WSP_GGML_OP_VIEW: + case GGML_OP_VIEW: { - tensor = wsp_ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); - uint64_t offs; - memcpy(&offs, args[2]->data, sizeof(offs)); + size_t offs; + memcpy(&offs, ptr_op_params, sizeof(offs)); tensor->data = ((char *) tensor->data) + offs; } break; - case WSP_GGML_OP_TRANSPOSE: + case GGML_OP_TRANSPOSE: { - tensor = wsp_ggml_transpose(*ctx_eval, args[0]); + tensor = ggml_transpose(*ctx_eval, args[0]); } break; - case WSP_GGML_OP_PERMUTE: + case GGML_OP_PERMUTE: { - tensor = wsp_ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); + tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); } break; default: { - tensor = wsp_ggml_new_tensor(*ctx_eval, (enum wsp_ggml_type) type, n_dims, ne); + tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); tensor->op = eop; } break; } - memcpy(tensor->name, ptr_name, WSP_GGML_MAX_NAME); + memcpy(tensor->name, ptr_name, GGML_MAX_NAME); + memcpy(tensor->op_params, ptr_op_params, GGML_MAX_OP_PARAMS); - for (int j = 0; j < WSP_GGML_MAX_DIMS; ++j) { + for (int j = 0; j < GGML_MAX_DIMS; ++j) { tensor->nb[j] = nb[j]; } - tensor->src0 = args[0]; - tensor->src1 = args[1]; - - for (int j = 0; j < WSP_GGML_MAX_OPT; ++j) { - tensor->opt[j] = args[2 + j]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + tensor->src[j] = args[j]; } result.nodes[i] = tensor; - fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, wsp_ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); } } } @@ -17348,53 +18331,51 @@ struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml return result; } -void wsp_ggml_graph_print(const struct wsp_ggml_cgraph * cgraph) { - int64_t 
perf_total_per_op_us[WSP_GGML_OP_COUNT] = {0}; +void ggml_graph_print(const struct ggml_cgraph * cgraph) { + int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0}; - WSP_GGML_PRINT("=== GRAPH ===\n"); + GGML_PRINT("=== GRAPH ===\n"); - WSP_GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); - WSP_GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); - - WSP_GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); + GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { - struct wsp_ggml_tensor * node = cgraph->nodes[i]; + struct ggml_tensor * node = cgraph->nodes[i]; perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us); - WSP_GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], - WSP_GGML_OP_NAME[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, - (double) node->perf_cycles / (double) wsp_ggml_cycles_per_ms(), - (double) node->perf_cycles / (double) wsp_ggml_cycles_per_ms() / (double) node->perf_runs, + ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + (double) node->perf_cycles / (double) ggml_cycles_per_ms(), + (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, (double) node->perf_time_us / 1000.0, (double) node->perf_time_us / 1000.0 / node->perf_runs); } - WSP_GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); + GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); for (int i = 0; i < cgraph->n_leafs; i++) { - struct wsp_ggml_tensor * node = cgraph->leafs[i]; + struct ggml_tensor * node = cgraph->leafs[i]; - WSP_GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n", + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n", i, node->ne[0], node->ne[1], - WSP_GGML_OP_NAME[node->op]); + ggml_op_name(node->op), + ggml_get_name(node)); } - for (int i = 0; i < WSP_GGML_OP_COUNT; i++) { + for (int i = 0; i < GGML_OP_COUNT; i++) { if (perf_total_per_op_us[i] == 0) { continue; } - WSP_GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", WSP_GGML_OP_NAME[i], (double) perf_total_per_op_us[i] / 1000.0); + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0); } - WSP_GGML_PRINT("========================================\n"); + GGML_PRINT("========================================\n"); } // check if node is part of the graph -static bool wsp_ggml_graph_find(const struct wsp_ggml_cgraph * cgraph, const struct wsp_ggml_tensor * node) { +static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { if (cgraph == NULL) { return true; } @@ -17408,9 +18389,9 @@ static bool wsp_ggml_graph_find(const struct wsp_ggml_cgraph * cgraph, const str return false; } -static struct wsp_ggml_tensor * wsp_ggml_graph_get_parent(const struct wsp_ggml_cgraph * cgraph, const struct wsp_ggml_tensor * node) { +static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { for (int i = 0; i < cgraph->n_nodes; i++) { - struct wsp_ggml_tensor * parent = cgraph->nodes[i]; + struct ggml_tensor * parent = cgraph->nodes[i]; if (parent->grad == node) { return parent; @@ -17420,9 +18401,9 @@ static struct wsp_ggml_tensor * wsp_ggml_graph_get_parent(const struct wsp_ggml_ return NULL; } 
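The graph-import hunk above and the DOT-dump hunks below both follow the same upstream ggml change: the fixed src0/src1/opt[] operand fields on ggml_tensor are replaced by a single src[GGML_MAX_SRC] array, so code that used to special-case the first two operands now runs one loop over all operand slots. A minimal sketch of that access pattern, assuming the un-prefixed ggml.h that the added lines in this patch compile against; the visit_edges helper is illustrative only and is not part of the patch:

#include <stdio.h>
#include "ggml.h"

// Sketch (not from this patch): walk every operand of a node through the
// unified src[] array. Before this sync the same walk needed separate checks
// of node->src0, node->src1 and node->opt[j], which is exactly what the
// import and DOT-dump hunks in this file stop doing.
static void visit_edges(const struct ggml_tensor * node) {
    for (int j = 0; j < GGML_MAX_SRC; ++j) {
        if (node->src[j] == NULL) {
            continue; // unused operand slot
        }
        printf("  src %d -> %s\n", j, node->src[j]->name);
    }
}
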
-static void wsp_ggml_graph_dump_dot_node_edge(FILE * fp, const struct wsp_ggml_cgraph * gb, struct wsp_ggml_tensor * node, struct wsp_ggml_tensor * parent, const char * label) { - struct wsp_ggml_tensor * gparent = wsp_ggml_graph_get_parent(gb, node); - struct wsp_ggml_tensor * gparent0 = wsp_ggml_graph_get_parent(gb, parent); +static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node); + struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent); fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n", gparent0 ? (void *) gparent0 : (void *) parent, gparent0 ? "g" : "x", @@ -17433,34 +18414,34 @@ static void wsp_ggml_graph_dump_dot_node_edge(FILE * fp, const struct wsp_ggml_c label); } -static void wsp_ggml_graph_dump_dot_leaf_edge(FILE * fp, struct wsp_ggml_tensor * node, struct wsp_ggml_tensor * parent, const char * label) { +static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n", (void *) parent, "x", (void *) node, "x", label); } -void wsp_ggml_graph_dump_dot(const struct wsp_ggml_cgraph * gb, const struct wsp_ggml_cgraph * gf, const char * filename) { +void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { char color[16]; FILE * fp = fopen(filename, "w"); - WSP_GGML_ASSERT(fp); + GGML_ASSERT(fp); fprintf(fp, "digraph G {\n"); fprintf(fp, " newrank = true;\n"); fprintf(fp, " rankdir = LR;\n"); for (int i = 0; i < gb->n_nodes; i++) { - struct wsp_ggml_tensor * node = gb->nodes[i]; + struct ggml_tensor * node = gb->nodes[i]; - if (wsp_ggml_graph_get_parent(gb, node) != NULL) { + if (ggml_graph_get_parent(gb, node) != NULL) { continue; } if (node->is_param) { snprintf(color, sizeof(color), "yellow"); } else if (node->grad) { - if (wsp_ggml_graph_find(gf, node)) { + if (ggml_graph_find(gf, node)) { snprintf(color, sizeof(color), "green"); } else { snprintf(color, sizeof(color), "lightblue"); @@ -17475,26 +18456,26 @@ void wsp_ggml_graph_dump_dot(const struct wsp_ggml_cgraph * gb, const struct wsp (void *) node, color); if (strlen(node->name) > 0) { - fprintf(fp, "%s (%s)|", node->name, wsp_ggml_type_name(node->type)); + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); } else { - fprintf(fp, "(%s)|", wsp_ggml_type_name(node->type)); + fprintf(fp, "(%s)|", ggml_type_name(node->type)); } if (node->n_dims == 2) { - fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], WSP_GGML_OP_SYMBOL[node->op]); + fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op)); } else { - fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], WSP_GGML_OP_SYMBOL[node->op]); + fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op)); } if (node->grad) { - fprintf(fp, " | %s\"; ]\n", WSP_GGML_OP_SYMBOL[node->grad->op]); + fprintf(fp, " | %s\"; ]\n", ggml_op_symbol(node->grad->op)); } else { fprintf(fp, "\"; ]\n"); } } for (int i = 0; i < gb->n_leafs; i++) { - struct wsp_ggml_tensor * node = gb->leafs[i]; + struct ggml_tensor * node = gb->leafs[i]; snprintf(color, sizeof(color), "pink"); @@ -17504,25 +18485,25 @@ 
void wsp_ggml_graph_dump_dot(const struct wsp_ggml_cgraph * gb, const struct wsp (void *) node, color); if (strlen(node->name) > 0) { - fprintf(fp, "%s (%s)|", node->name, wsp_ggml_type_name(node->type)); + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); } else { - fprintf(fp, "(%s)|", wsp_ggml_type_name(node->type)); + fprintf(fp, "(%s)|", ggml_type_name(node->type)); } fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); - if (wsp_ggml_nelements(node) < 5) { + if (ggml_nelements(node) < 5) { fprintf(fp, " | ("); - for (int j = 0; j < wsp_ggml_nelements(node); j++) { - if (node->type == WSP_GGML_TYPE_I8 || node->type == WSP_GGML_TYPE_I16 || node->type == WSP_GGML_TYPE_I32) { - fprintf(fp, "%d", wsp_ggml_get_i32_1d(node, j)); + for (int j = 0; j < ggml_nelements(node); j++) { + if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { + fprintf(fp, "%d", ggml_get_i32_1d(node, j)); } - else if (node->type == WSP_GGML_TYPE_F32 || node->type == WSP_GGML_TYPE_F16) { - fprintf(fp, "%.1e", (double)wsp_ggml_get_f32_1d(node, j)); + else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) { + fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j)); } else { fprintf(fp, "#"); } - if (j < wsp_ggml_nelements(node) - 1) { + if (j < ggml_nelements(node) - 1) { fprintf(fp, ", "); } } @@ -17532,41 +18513,25 @@ void wsp_ggml_graph_dump_dot(const struct wsp_ggml_cgraph * gb, const struct wsp } for (int i = 0; i < gb->n_nodes; i++) { - struct wsp_ggml_tensor * node = gb->nodes[i]; - - if (node->src0) { - wsp_ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x"); - } - - if (node->src1) { - wsp_ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y"); - } + struct ggml_tensor * node = gb->nodes[i]; - for (int j = 0; j < WSP_GGML_MAX_OPT; j++) { - if (node->opt[j]) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { char label[16]; - snprintf(label, sizeof(label), "opt %d", j); - wsp_ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label); + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label); } } } for (int i = 0; i < gb->n_leafs; i++) { - struct wsp_ggml_tensor * node = gb->leafs[i]; - - if (node->src0) { - wsp_ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x"); - } - - if (node->src1) { - wsp_ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y"); - } + struct ggml_tensor * node = gb->leafs[i]; - for (int j = 0; j < WSP_GGML_MAX_OPT; j++) { - if (node->opt[j]) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j]) { char label[16]; - snprintf(label, sizeof(label), "opt %d", j); - wsp_ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label); + snprintf(label, sizeof(label), "src %d", j); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label); } } } @@ -17575,40 +18540,40 @@ void wsp_ggml_graph_dump_dot(const struct wsp_ggml_cgraph * gb, const struct wsp fclose(fp); - WSP_GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); + GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); } //////////////////////////////////////////////////////////////////////////////// -static void wsp_ggml_opt_set_params(int np, struct wsp_ggml_tensor * const ps[], const float * x) { +static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) { int i = 0; for (int p = 0; p < np; ++p) { - const 
int64_t ne = wsp_ggml_nelements(ps[p]) ; + const int64_t ne = ggml_nelements(ps[p]) ; // TODO: add function to set tensor from array for (int64_t j = 0; j < ne; ++j) { - wsp_ggml_set_f32_1d(ps[p], j, x[i++]); + ggml_set_f32_1d(ps[p], j, x[i++]); } } } -static void wsp_ggml_opt_get_params(int np, struct wsp_ggml_tensor * const ps[], float * x) { +static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) { int i = 0; for (int p = 0; p < np; ++p) { - const int64_t ne = wsp_ggml_nelements(ps[p]) ; + const int64_t ne = ggml_nelements(ps[p]) ; // TODO: add function to get all elements at once for (int64_t j = 0; j < ne; ++j) { - x[i++] = wsp_ggml_get_f32_1d(ps[p], j); + x[i++] = ggml_get_f32_1d(ps[p], j); } } } -static void wsp_ggml_opt_get_grad(int np, struct wsp_ggml_tensor * const ps[], float * g) { +static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) { int i = 0; for (int p = 0; p < np; ++p) { - const int64_t ne = wsp_ggml_nelements(ps[p]) ; + const int64_t ne = ggml_nelements(ps[p]) ; // TODO: add function to get all elements at once for (int64_t j = 0; j < ne; ++j) { - g[i++] = wsp_ggml_get_f32_1d(ps[p]->grad, j); + g[i++] = ggml_get_f32_1d(ps[p]->grad, j); } } } @@ -17619,72 +18584,76 @@ static void wsp_ggml_opt_get_grad(int np, struct wsp_ggml_tensor * const ps[], f // ref: https://arxiv.org/pdf/1412.6980.pdf // -static enum wsp_ggml_opt_result wsp_ggml_opt_adam( - struct wsp_ggml_context * ctx, - struct wsp_ggml_opt_context * opt, - struct wsp_ggml_opt_params params, - struct wsp_ggml_tensor * f, - struct wsp_ggml_cgraph * gf, - struct wsp_ggml_cgraph * gb) { - WSP_GGML_ASSERT(wsp_ggml_is_scalar(f)); - - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; +static enum ggml_opt_result ggml_opt_adam( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { + GGML_ASSERT(ggml_is_scalar(f)); // these will store the parameters we want to optimize - struct wsp_ggml_tensor * ps[WSP_GGML_MAX_PARAMS]; + struct ggml_tensor * ps[GGML_MAX_PARAMS]; int np = 0; - int nx = 0; + int64_t nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { if (gf->nodes[i]->is_param) { - WSP_GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); - WSP_GGML_ASSERT(np < WSP_GGML_MAX_PARAMS); + GGML_ASSERT(np < GGML_MAX_PARAMS); ps[np++] = gf->nodes[i]; - nx += wsp_ggml_nelements(gf->nodes[i]); + nx += ggml_nelements(gf->nodes[i]); } } if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past)) { int iter = opt->iter; - wsp_ggml_opt_init(opt->ctx, opt, params, nx); + ggml_opt_init(opt->ctx, opt, params, nx); opt->iter = iter; } // constants - const float sched = params.adam.sched; - const float decay = params.adam.decay * sched; - const float alpha = params.adam.alpha * sched; + float sched = params.adam.sched; + const float alpha = params.adam.alpha; + const float decay = params.adam.decay * alpha; const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; const float eps = params.adam.eps; + const float gclip = params.adam.gclip; + const int decay_min_ndim = params.adam.decay_min_ndim; - float * x = opt->adam.x->data; // view of the parameters - float * g1 = opt->adam.g1->data; // gradient - float * g2 = 
opt->adam.g2->data; // gradient squared float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment - float * mh = opt->adam.mh->data; // first moment hat - float * vh = opt->adam.vh->data; // second moment hat float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values - // update view - wsp_ggml_opt_get_params(np, ps, x); + if (callback) { + callback(callback_data, &sched); + } // compute the function value - wsp_ggml_graph_reset (gf); - wsp_ggml_set_f32 (f->grad, 1.0f); - wsp_ggml_graph_compute(ctx, gb); + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); - opt->adam.fx_prev = wsp_ggml_get_f32_1d(f, 0); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + ggml_graph_compute(gb, &cplan); + + opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; if (pf) { pf[opt->iter % params.past] = opt->adam.fx_prev; } + opt->loss_before = opt->adam.fx_prev; + opt->loss_after = opt->adam.fx_prev; + // initialize if (opt->just_initialized) { opt->adam.n_no_improvement = 0; @@ -17700,72 +18669,78 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_adam( // run the optimizer for (int t = 0; t < params.adam.n_iter; ++t) { opt->iter = iter0 + t + 1; - WSP_GGML_PRINT_DEBUG ("=== iter %d ===\n", t); + GGML_PRINT_DEBUG ("=== iter %d ===\n", t); - WSP_GGML_PRINT_DEBUG ("f = %10.6f\n", wsp_ggml_get_f32_1d(f, 0)); - WSP_GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", wsp_ggml_get_f32_1d(ps[0]->grad, 0)); - WSP_GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", wsp_ggml_get_f32_1d(ps[1]->grad, 0)); + GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0)); + GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0)); + GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0)); for (int i = 0; i < np; ++i) { - WSP_GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, - wsp_ggml_get_f32_1d(ps[i], 0), wsp_ggml_get_f32_1d(ps[i]->grad, 0)); + GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, + ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0)); } - const int64_t t_start_wall = wsp_ggml_time_us(); - const int64_t t_start_cpu = wsp_ggml_cycles(); + const int64_t t_start_wall = ggml_time_us(); + const int64_t t_start_cpu = ggml_cycles(); UNUSED(t_start_wall); UNUSED(t_start_cpu); { - // update the gradient - wsp_ggml_opt_get_grad(np, ps, g1); - - // m_t = beta1*m_t-1 + (1 - beta1)*g_t - wsp_ggml_vec_scale_f32(nx, m, beta1); - wsp_ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); - - // g2 = g1^2 - wsp_ggml_vec_sqr_f32 (nx, g2, g1); - - // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 - wsp_ggml_vec_scale_f32(nx, v, beta2); - wsp_ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); - - // m^hat = m_t / (1 - beta1^t) - // v^hat = v_t / (1 - beta2^t) - // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) - // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 - // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) - // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) - // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) - wsp_ggml_vec_cpy_f32 (nx, mh, m); - wsp_ggml_vec_cpy_f32 (nx, vh, v); - - wsp_ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); - wsp_ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); - - wsp_ggml_vec_sqrt_f32 
(nx, vh, vh); - wsp_ggml_vec_acc1_f32 (nx, vh, eps); - - wsp_ggml_vec_div_f32 (nx, mh, mh, vh); - wsp_ggml_vec_scale_f32(nx, x, 1.0f - decay); - wsp_ggml_vec_sub_f32 (nx, x, x, mh); - - // update the parameters - wsp_ggml_opt_set_params(np, ps, x); + float gnorm = 1.0f; + if (gclip > 0.0f) { + // gradient clipping + ggml_float sum = 0.0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + for (int64_t j = 0; j < ne; ++j) { + float g = ggml_get_f32_1d(ps[p]->grad, j); + sum += (ggml_float)(g*g); + } + } + ggml_float norm = sqrt(sum); + if (norm > (ggml_float) gclip) { + gnorm = (float) ((ggml_float) gclip / norm); + } + } + const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched; + for (int64_t j = 0; j < ne; ++j) { + float x = ggml_get_f32_1d(ps[p], j); + float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; + m[i] = m[i]*beta1 + g*(1.0f - beta1); + v[i] = v[i]*beta2 + g*g*(1.0f - beta2); + float mh = m[i]*beta1h; + float vh = v[i]*beta2h; + vh = sqrtf(vh) + eps; + x = x*(1.0f - p_decay) - mh/vh; + ggml_set_f32_1d(ps[p], j, x); + ++i; + } + } } - wsp_ggml_graph_reset (gf); - wsp_ggml_set_f32 (f->grad, 1.0f); - wsp_ggml_graph_compute(ctx, gb); + if (callback) { + callback(callback_data, &sched); + } + + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); + + ggml_graph_compute(gb, &cplan); + + const float fx = ggml_get_f32_1d(f, 0); + opt->loss_after = fx; - const float fx = wsp_ggml_get_f32_1d(f, 0); // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { - WSP_GGML_PRINT_DEBUG("converged\n"); + GGML_PRINT_DEBUG("converged\n"); - return WSP_GGML_OPT_OK; + return GGML_OPT_OK; } // delta-based convergence test @@ -17775,7 +18750,7 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_adam( const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { - return WSP_GGML_OPT_OK; + return GGML_OPT_OK; } } @@ -17791,7 +18766,7 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_adam( ++n_no_improvement[0]; if (n_no_improvement[0] >= params.max_no_improvement) { - return WSP_GGML_OPT_OK; + return GGML_OPT_OK; } } } @@ -17799,17 +18774,17 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_adam( fx_prev[0] = fx; { - const int64_t t_end_cpu = wsp_ggml_cycles(); - WSP_GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); + const int64_t t_end_cpu = ggml_cycles(); + GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); UNUSED(t_end_cpu); - const int64_t t_end_wall = wsp_ggml_time_us(); - WSP_GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); + const int64_t t_end_wall = ggml_time_us(); + GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); UNUSED(t_end_wall); } } - return WSP_GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_DID_NOT_CONVERGE; } // @@ -17820,16 +18795,15 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_adam( // https://github.com/chokkan/liblbfgs // -struct wsp_ggml_lbfgs_iteration_data { +struct ggml_lbfgs_iteration_data { float alpha; float ys; float * s; float * y; }; -static enum wsp_ggml_opt_result linesearch_backtracking( - struct wsp_ggml_context * ctx, - const struct wsp_ggml_opt_params * params, +static enum ggml_opt_result linesearch_backtracking( + 
const struct ggml_opt_params * params, int nx, float * x, float * fx, @@ -17837,11 +18811,14 @@ static enum wsp_ggml_opt_result linesearch_backtracking( float * d, float * step, const float * xp, - struct wsp_ggml_tensor * f, - struct wsp_ggml_cgraph * gf, - struct wsp_ggml_cgraph * gb, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cplan * cplan, const int np, - struct wsp_ggml_tensor * ps[]) { + struct ggml_tensor * ps[], + ggml_opt_callback callback, + void * callback_data) { int count = 0; float width = 0.0f; @@ -17854,15 +18831,15 @@ static enum wsp_ggml_opt_result linesearch_backtracking( const float inc = 2.1f; if (*step <= 0.f) { - return WSP_GGML_LINESEARCH_INVALID_PARAMETERS; + return GGML_LINESEARCH_INVALID_PARAMETERS; } // compute the initial gradient in the search direction - wsp_ggml_vec_dot_f32(nx, &dginit, g, d); + ggml_vec_dot_f32(nx, &dginit, g, d); // make sure that d points to a descent direction if (0 < dginit) { - return WSP_GGML_LINESEARCH_FAIL; + return GGML_LINESEARCH_FAIL; } // initialize local variables @@ -17870,20 +18847,27 @@ static enum wsp_ggml_opt_result linesearch_backtracking( dgtest = params->lbfgs.ftol*dginit; while (true) { - wsp_ggml_vec_cpy_f32(nx, x, xp); - wsp_ggml_vec_mad_f32(nx, x, d, *step); + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + + ggml_vec_cpy_f32(nx, x, xp); + ggml_vec_mad_f32(nx, x, d, *step); // evaluate the function and gradient values { - wsp_ggml_opt_set_params(np, ps, x); + ggml_opt_set_params(np, ps, x); - wsp_ggml_graph_reset (gf); - wsp_ggml_set_f32 (f->grad, 1.0f); - wsp_ggml_graph_compute(ctx, gb); + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); - wsp_ggml_opt_get_grad(np, ps, g); + ggml_graph_compute(gb, cplan); - *fx = wsp_ggml_get_f32_1d(f, 0); + ggml_opt_get_grad(np, ps, g); + + *fx = ggml_get_f32_1d(f, 0); } ++count; @@ -17892,17 +18876,17 @@ static enum wsp_ggml_opt_result linesearch_backtracking( width = dec; } else { // Armijo condition is satisfied - if (params->lbfgs.linesearch == WSP_GGML_LINESEARCH_BACKTRACKING_ARMIJO) { + if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { return count; } - wsp_ggml_vec_dot_f32(nx, &dg, g, d); + ggml_vec_dot_f32(nx, &dg, g, d); // check the Wolfe condition if (dg < params->lbfgs.wolfe * dginit) { width = inc; } else { - if(params->lbfgs.linesearch == WSP_GGML_LINESEARCH_BACKTRACKING_WOLFE) { + if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { // regular Wolfe conditions return count; } @@ -17910,70 +18894,72 @@ static enum wsp_ggml_opt_result linesearch_backtracking( if(dg > -params->lbfgs.wolfe*dginit) { width = dec; } else { - // strong Wolfe condition (WSP_GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) + // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) return count; } - return count; } } if (*step < params->lbfgs.min_step) { - return WSP_GGML_LINESEARCH_MINIMUM_STEP; + return GGML_LINESEARCH_MINIMUM_STEP; } if (*step > params->lbfgs.max_step) { - return WSP_GGML_LINESEARCH_MAXIMUM_STEP; + return GGML_LINESEARCH_MAXIMUM_STEP; } if (params->lbfgs.max_linesearch <= count) { - return WSP_GGML_LINESEARCH_MAXIMUM_ITERATIONS; + return GGML_LINESEARCH_MAXIMUM_ITERATIONS; } (*step) *= width; } - return WSP_GGML_LINESEARCH_FAIL; + return GGML_LINESEARCH_FAIL; } -static enum wsp_ggml_opt_result wsp_ggml_opt_lbfgs( - struct wsp_ggml_context * ctx, - struct wsp_ggml_opt_context * 
opt, - struct wsp_ggml_opt_params params, - struct wsp_ggml_tensor * f, - struct wsp_ggml_cgraph * gf, - struct wsp_ggml_cgraph * gb) { - if (params.lbfgs.linesearch == WSP_GGML_LINESEARCH_BACKTRACKING_WOLFE || - params.lbfgs.linesearch == WSP_GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { +static enum ggml_opt_result ggml_opt_lbfgs( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { + if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || + params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { - return WSP_GGML_OPT_INVALID_WOLFE; + return GGML_OPT_INVALID_WOLFE; } } - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - const int m = params.lbfgs.m; // these will store the parameters we want to optimize - struct wsp_ggml_tensor * ps[WSP_GGML_MAX_PARAMS]; + struct ggml_tensor * ps[GGML_MAX_PARAMS]; int np = 0; int nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { if (gf->nodes[i]->is_param) { - WSP_GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); + GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); - WSP_GGML_ASSERT(np < WSP_GGML_MAX_PARAMS); + GGML_ASSERT(np < GGML_MAX_PARAMS); ps[np++] = gf->nodes[i]; - nx += wsp_ggml_nelements(gf->nodes[i]); + nx += ggml_nelements(gf->nodes[i]); } } if ((opt->params.type != params.type) || (opt->nx != nx) || (opt->params.past != params.past) || (opt->params.lbfgs.m != params.lbfgs.m)) { int iter = opt->iter; - wsp_ggml_opt_init(ctx, opt, params, nx); + ggml_opt_init(ctx, opt, params, nx); opt->iter = iter; } + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + float * x = opt->lbfgs.x->data; // current parameters float * xp = opt->lbfgs.xp->data; // previous parameters float * g = opt->lbfgs.g->data; // current gradient @@ -17987,7 +18973,7 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_lbfgs( float gnorm = 0.0f; // ||g|| // initialize x from the graph nodes - wsp_ggml_opt_get_params(np, ps, x); + ggml_opt_get_params(np, ps, x); // the L-BFGS memory float * lm_alpha = opt->lbfgs.lmal->data; @@ -17995,25 +18981,35 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_lbfgs( float * lm_s = opt->lbfgs.lms->data; float * lm_y = opt->lbfgs.lmy->data; + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + // evaluate the function value and its gradient { - wsp_ggml_opt_set_params(np, ps, x); + ggml_opt_set_params(np, ps, x); - wsp_ggml_graph_reset (gf); - wsp_ggml_set_f32 (f->grad, 1.0f); - wsp_ggml_graph_compute(ctx, gb); + ggml_graph_reset (gf); + ggml_set_f32 (f->grad, 1.0f); - wsp_ggml_opt_get_grad(np, ps, g); + ggml_graph_compute(gb, &cplan); - fx = wsp_ggml_get_f32_1d(f, 0); + ggml_opt_get_grad(np, ps, g); + + fx = ggml_get_f32_1d(f, 0); + + opt->loss_before = fx; + opt->loss_after = fx; } // search direction = -gradient - wsp_ggml_vec_neg_f32(nx, d, g); + ggml_vec_neg_f32(nx, d, g); // ||x||, ||g|| - wsp_ggml_vec_norm_f32(nx, &xnorm, x); - wsp_ggml_vec_norm_f32(nx, &gnorm, g); + ggml_vec_norm_f32(nx, &xnorm, x); + ggml_vec_norm_f32(nx, 
&gnorm, g); if (xnorm < 1.0f) { xnorm = 1.0f; @@ -18021,7 +19017,7 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_lbfgs( // already optimized if (gnorm/xnorm <= params.lbfgs.eps) { - return WSP_GGML_OPT_OK; + return GGML_OPT_OK; } if (opt->just_initialized) { @@ -18031,7 +19027,7 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_lbfgs( opt->lbfgs.fx_best = fx; // initial step - wsp_ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d); + ggml_vec_norm_inv_f32(nx, &opt->lbfgs.step, d); opt->lbfgs.j = 0; opt->lbfgs.k = 1; opt->lbfgs.end = 0; @@ -18057,30 +19053,32 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_lbfgs( while (true) { // store the current position and gradient vectors - wsp_ggml_vec_cpy_f32(nx, xp, x); - wsp_ggml_vec_cpy_f32(nx, gp, g); + ggml_vec_cpy_f32(nx, xp, x); + ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data); if (ls < 0) { // linesearch failed - go back to the previous point and return - wsp_ggml_vec_cpy_f32(nx, x, xp); - wsp_ggml_vec_cpy_f32(nx, g, gp); + ggml_vec_cpy_f32(nx, x, xp); + ggml_vec_cpy_f32(nx, g, gp); return ls; } - wsp_ggml_vec_norm_f32(nx, &xnorm, x); - wsp_ggml_vec_norm_f32(nx, &gnorm, g); + opt->loss_after = fx; - WSP_GGML_PRINT_DEBUG("f = %10.6f\n", wsp_ggml_get_f32_1d(f, 0)); + ggml_vec_norm_f32(nx, &xnorm, x); + ggml_vec_norm_f32(nx, &gnorm, g); + + GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0)); if (xnorm < 1.0f) { xnorm = 1.0f; } if (gnorm/xnorm <= params.lbfgs.eps) { // converged - return WSP_GGML_OPT_OK; + return GGML_OPT_OK; } // delta-based convergence test @@ -18090,7 +19088,7 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_lbfgs( const float rate = (pf[k[0]%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { - return WSP_GGML_OPT_OK; + return GGML_OPT_OK; } } @@ -18106,29 +19104,29 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_lbfgs( n_no_improvement[0]++; if (n_no_improvement[0] >= params.max_no_improvement) { - return WSP_GGML_OPT_OK; + return GGML_OPT_OK; } } } if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { // reached the maximum number of iterations - return WSP_GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_DID_NOT_CONVERGE; } // update vectors s and y: // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. // y_{k+1} = g_{k+1} - g_{k}. // - wsp_ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp); - wsp_ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp); + ggml_vec_sub_f32(nx, &lm_s[end[0]*nx], x, xp); + ggml_vec_sub_f32(nx, &lm_y[end[0]*nx], g, gp); // compute scalars ys and yy: // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. 
// - wsp_ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]); - wsp_ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); + ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); + ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); lm_ys[end[0]] = ys; @@ -18141,43 +19139,43 @@ static enum wsp_ggml_opt_result wsp_ggml_opt_lbfgs( end[0] = (end[0] + 1)%m; // initialize search direction with -g - wsp_ggml_vec_neg_f32(nx, d, g); + ggml_vec_neg_f32(nx, d, g); j[0] = end[0]; for (int i = 0; i < bound; ++i) { j[0] = (j[0] + m - 1) % m; // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} - wsp_ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); + ggml_vec_dot_f32(nx, &lm_alpha[j[0]], &lm_s[j[0]*nx], d); lm_alpha[j[0]] /= lm_ys[j[0]]; // q_{i} = q_{i+1} - \alpha_{i} y_{i} - wsp_ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); + ggml_vec_mad_f32(nx, d, &lm_y[j[0]*nx], -lm_alpha[j[0]]); } - wsp_ggml_vec_scale_f32(nx, d, ys/yy); + ggml_vec_scale_f32(nx, d, ys/yy); for (int i = 0; i < bound; ++i) { // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} - wsp_ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); + ggml_vec_dot_f32(nx, &beta, &lm_y[j[0]*nx], d); beta /= lm_ys[j[0]]; // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} - wsp_ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); + ggml_vec_mad_f32(nx, d, &lm_s[j[0]*nx], lm_alpha[j[0]] - beta); j[0] = (j[0] + 1)%m; } step[0] = 1.0; } - return WSP_GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_DID_NOT_CONVERGE; } -struct wsp_ggml_opt_params wsp_ggml_opt_default_params(enum wsp_ggml_opt_type type) { - struct wsp_ggml_opt_params result; +struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { + struct ggml_opt_params result; switch (type) { - case WSP_GGML_OPT_ADAM: + case GGML_OPT_ADAM: { - result = (struct wsp_ggml_opt_params) { - .type = WSP_GGML_OPT_ADAM, + result = (struct ggml_opt_params) { + .type = GGML_OPT_ADAM, .n_threads = 1, .past = 0, .delta = 1e-5f, @@ -18190,20 +19188,22 @@ struct wsp_ggml_opt_params wsp_ggml_opt_default_params(enum wsp_ggml_opt_type ty .adam = { .n_iter = 10000, .sched = 1.000f, - .decay = 0.001f, + .decay = 0.0f, + .decay_min_ndim = 2, .alpha = 0.001f, .beta1 = 0.9f, .beta2 = 0.999f, .eps = 1e-8f, .eps_f = 1e-5f, .eps_g = 1e-3f, + .gclip = 0.0f, }, }; } break; - case WSP_GGML_OPT_LBFGS: + case GGML_OPT_LBFGS: { - result = (struct wsp_ggml_opt_params) { - .type = WSP_GGML_OPT_LBFGS, + result = (struct ggml_opt_params) { + .type = GGML_OPT_LBFGS, .n_threads = 1, .past = 0, .delta = 1e-5f, @@ -18224,7 +19224,7 @@ struct wsp_ggml_opt_params wsp_ggml_opt_default_params(enum wsp_ggml_opt_type ty .min_step = 1e-20f, .max_step = 1e+20f, - .linesearch = WSP_GGML_LINESEARCH_DEFAULT, + .linesearch = GGML_LINESEARCH_DEFAULT, }, }; } break; @@ -18233,10 +19233,10 @@ struct wsp_ggml_opt_params wsp_ggml_opt_default_params(enum wsp_ggml_opt_type ty return result; } -WSP_GGML_API void wsp_ggml_opt_init( - struct wsp_ggml_context * ctx, - struct wsp_ggml_opt_context * opt, - struct wsp_ggml_opt_params params, +GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, int64_t nx) { opt->ctx = ctx; opt->params = params; @@ -18244,140 +19244,132 @@ WSP_GGML_API void wsp_ggml_opt_init( opt->nx = nx; opt->just_initialized = true; switch (opt->params.type) { - case WSP_GGML_OPT_ADAM: - { - opt->adam.x = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); - opt->adam.g1 = 
wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); - opt->adam.g2 = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); - opt->adam.m = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); - opt->adam.v = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); - opt->adam.mh = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); - opt->adam.vh = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); + case GGML_OPT_ADAM: + { + opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.pf = params.past > 0 - ? wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, params.past) + ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) : NULL; - wsp_ggml_set_zero(opt->adam.x); - wsp_ggml_set_zero(opt->adam.g1); - wsp_ggml_set_zero(opt->adam.g2); - wsp_ggml_set_zero(opt->adam.m); - wsp_ggml_set_zero(opt->adam.v); - wsp_ggml_set_zero(opt->adam.mh); - wsp_ggml_set_zero(opt->adam.vh); + ggml_set_zero(opt->adam.m); + ggml_set_zero(opt->adam.v); if (opt->adam.pf) { - wsp_ggml_set_zero(opt->adam.pf); + ggml_set_zero(opt->adam.pf); } } break; - case WSP_GGML_OPT_LBFGS: + case GGML_OPT_LBFGS: { - opt->lbfgs.x = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); - opt->lbfgs.xp = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); - opt->lbfgs.g = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); - opt->lbfgs.gp = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); - opt->lbfgs.d = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, nx); + opt->lbfgs.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); + opt->lbfgs.d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->lbfgs.pf = params.past > 0 - ? wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, params.past) + ? 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) : NULL; - opt->lbfgs.lmal = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, params.lbfgs.m); - opt->lbfgs.lmys = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, params.lbfgs.m); - opt->lbfgs.lms = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, nx, params.lbfgs.m); - opt->lbfgs.lmy = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, nx, params.lbfgs.m); - wsp_ggml_set_zero(opt->lbfgs.x); - wsp_ggml_set_zero(opt->lbfgs.xp); - wsp_ggml_set_zero(opt->lbfgs.g); - wsp_ggml_set_zero(opt->lbfgs.gp); - wsp_ggml_set_zero(opt->lbfgs.d); + opt->lbfgs.lmal = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lmys = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.lbfgs.m); + opt->lbfgs.lms = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + opt->lbfgs.lmy = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, params.lbfgs.m); + ggml_set_zero(opt->lbfgs.x); + ggml_set_zero(opt->lbfgs.xp); + ggml_set_zero(opt->lbfgs.g); + ggml_set_zero(opt->lbfgs.gp); + ggml_set_zero(opt->lbfgs.d); if (opt->lbfgs.pf) { - wsp_ggml_set_zero(opt->lbfgs.pf); + ggml_set_zero(opt->lbfgs.pf); } - wsp_ggml_set_zero(opt->lbfgs.lmal); - wsp_ggml_set_zero(opt->lbfgs.lmys); - wsp_ggml_set_zero(opt->lbfgs.lms); - wsp_ggml_set_zero(opt->lbfgs.lmy); + ggml_set_zero(opt->lbfgs.lmal); + ggml_set_zero(opt->lbfgs.lmys); + ggml_set_zero(opt->lbfgs.lms); + ggml_set_zero(opt->lbfgs.lmy); } break; } } -enum wsp_ggml_opt_result wsp_ggml_opt( - struct wsp_ggml_context * ctx, - struct wsp_ggml_opt_params params, - struct wsp_ggml_tensor * f) { +enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f) { bool free_ctx = false; if (ctx == NULL) { - struct wsp_ggml_init_params params_ctx = { + struct ggml_init_params params_ctx = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false, }; - ctx = wsp_ggml_init(params_ctx); + ctx = ggml_init(params_ctx); if (ctx == NULL) { - return WSP_GGML_OPT_NO_CONTEXT; + return GGML_OPT_NO_CONTEXT; } free_ctx = true; } - enum wsp_ggml_opt_result result = WSP_GGML_OPT_OK; + enum ggml_opt_result result = GGML_OPT_OK; - struct wsp_ggml_opt_context * opt = (struct wsp_ggml_opt_context *) alloca(sizeof(struct wsp_ggml_opt_context)); + struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); - wsp_ggml_opt_init(ctx, opt, params, 0); - result = wsp_ggml_opt_resume(ctx, opt, f); + ggml_opt_init(ctx, opt, params, 0); + result = ggml_opt_resume(ctx, opt, f); if (free_ctx) { - wsp_ggml_free(ctx); + ggml_free(ctx); } return result; } -enum wsp_ggml_opt_result wsp_ggml_opt_resume( - struct wsp_ggml_context * ctx, - struct wsp_ggml_opt_context * opt, - struct wsp_ggml_tensor * f) { +enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f) { // build forward + backward compute graphs - struct wsp_ggml_tensor * gfbuf = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, sizeof(struct wsp_ggml_cgraph) / WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_I32]+ (sizeof(struct wsp_ggml_cgraph) % WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_I32] ? 1 : 0)); - struct wsp_ggml_tensor * gbbuf = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, sizeof(struct wsp_ggml_cgraph) / WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_I32]+ (sizeof(struct wsp_ggml_cgraph) % WSP_GGML_TYPE_SIZE[WSP_GGML_TYPE_I32] ? 
1 : 0)); + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); - struct wsp_ggml_cgraph * gf = (struct wsp_ggml_cgraph *) gfbuf->data; - struct wsp_ggml_cgraph * gb = (struct wsp_ggml_cgraph *) gbbuf->data; + struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; + struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - *gf = wsp_ggml_build_forward (f); - *gb = wsp_ggml_build_backward(ctx, gf, true); + *gf = ggml_build_forward (f); + *gb = ggml_build_backward(ctx, gf, true); - return wsp_ggml_opt_resume_g(ctx, opt, f, gf, gb); + return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); } -enum wsp_ggml_opt_result wsp_ggml_opt_resume_g( - struct wsp_ggml_context * ctx, - struct wsp_ggml_opt_context * opt, - struct wsp_ggml_tensor * f, - struct wsp_ggml_cgraph * gf, - struct wsp_ggml_cgraph * gb) { +enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { // build forward + backward compute graphs - enum wsp_ggml_opt_result result = WSP_GGML_OPT_OK; + enum ggml_opt_result result = GGML_OPT_OK; switch (opt->params.type) { - case WSP_GGML_OPT_ADAM: + case GGML_OPT_ADAM: { - result = wsp_ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; - case WSP_GGML_OPT_LBFGS: + case GGML_OPT_LBFGS: { - result = wsp_ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; } if (opt->params.print_forward_graph) { - wsp_ggml_graph_print (gf); - wsp_ggml_graph_dump_dot(gf, NULL, "opt-forward.dot"); + ggml_graph_print (gf); + ggml_graph_dump_dot(gf, NULL, "opt-forward.dot"); } if (opt->params.print_backward_graph) { - wsp_ggml_graph_print (gb); - wsp_ggml_graph_dump_dot(gb, gf, "opt-backward.dot"); + ggml_graph_print (gb); + ggml_graph_dump_dot(gb, gf, "opt-backward.dot"); } return result; @@ -18385,7 +19377,7 @@ enum wsp_ggml_opt_result wsp_ggml_opt_resume_g( //////////////////////////////////////////////////////////////////////////////// -size_t wsp_ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK4_0 == 0); const int nb = k / QK4_0; @@ -18408,7 +19400,7 @@ size_t wsp_ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64 return (n/QK4_0*sizeof(block_q4_0)); } -size_t wsp_ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK4_1 == 0); const int nb = k / QK4_1; @@ -18431,7 +19423,7 @@ size_t wsp_ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64 return (n/QK4_1*sizeof(block_q4_1)); } -size_t wsp_ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK5_0 == 0); const int nb = 
k / QK5_0; @@ -18461,7 +19453,7 @@ size_t wsp_ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64 return (n/QK5_0*sizeof(block_q5_0)); } -size_t wsp_ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK5_1 == 0); const int nb = k / QK5_1; @@ -18491,7 +19483,7 @@ size_t wsp_ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64 return (n/QK5_1*sizeof(block_q5_1)); } -size_t wsp_ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -18512,78 +19504,78 @@ size_t wsp_ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64 return (n/QK8_0*sizeof(block_q8_0)); } -size_t wsp_ggml_quantize_chunk(enum wsp_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { +size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { size_t result = 0; switch (type) { - case WSP_GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_0: { - WSP_GGML_ASSERT(start % QK4_0 == 0); + GGML_ASSERT(start % QK4_0 == 0); block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; - result = wsp_ggml_quantize_q4_0(src + start, block, n, n, hist); + result = ggml_quantize_q4_0(src + start, block, n, n, hist); } break; - case WSP_GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_1: { - WSP_GGML_ASSERT(start % QK4_1 == 0); + GGML_ASSERT(start % QK4_1 == 0); block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; - result = wsp_ggml_quantize_q4_1(src + start, block, n, n, hist); + result = ggml_quantize_q4_1(src + start, block, n, n, hist); } break; - case WSP_GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_0: { - WSP_GGML_ASSERT(start % QK5_0 == 0); + GGML_ASSERT(start % QK5_0 == 0); block_q5_0 * block = (block_q5_0*)dst + start / QK5_0; - result = wsp_ggml_quantize_q5_0(src + start, block, n, n, hist); + result = ggml_quantize_q5_0(src + start, block, n, n, hist); } break; - case WSP_GGML_TYPE_Q5_1: + case GGML_TYPE_Q5_1: { - WSP_GGML_ASSERT(start % QK5_1 == 0); + GGML_ASSERT(start % QK5_1 == 0); block_q5_1 * block = (block_q5_1*)dst + start / QK5_1; - result = wsp_ggml_quantize_q5_1(src + start, block, n, n, hist); + result = ggml_quantize_q5_1(src + start, block, n, n, hist); } break; - case WSP_GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_0: { - WSP_GGML_ASSERT(start % QK8_0 == 0); + GGML_ASSERT(start % QK8_0 == 0); block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; - result = wsp_ggml_quantize_q8_0(src + start, block, n, n, hist); + result = ggml_quantize_q8_0(src + start, block, n, n, hist); } break; -#ifdef WSP_GGML_USE_K_QUANTS - case WSP_GGML_TYPE_Q2_K: +#ifdef GGML_USE_K_QUANTS + case GGML_TYPE_Q2_K: { - WSP_GGML_ASSERT(start % QK_K == 0); + GGML_ASSERT(start % QK_K == 0); block_q2_K * block = (block_q2_K*)dst + start / QK_K; - result = wsp_ggml_quantize_q2_K(src + start, block, n, n, hist); + result = ggml_quantize_q2_K(src + start, block, n, n, hist); } break; - case WSP_GGML_TYPE_Q3_K: + case GGML_TYPE_Q3_K: { - WSP_GGML_ASSERT(start % QK_K == 0); + GGML_ASSERT(start % QK_K == 0); block_q3_K * block = (block_q3_K*)dst + start / QK_K; - result = wsp_ggml_quantize_q3_K(src + start, block, n, n, hist); + result = ggml_quantize_q3_K(src + start, block, n, n, hist); } break; - case WSP_GGML_TYPE_Q4_K: + case GGML_TYPE_Q4_K: { - WSP_GGML_ASSERT(start 
% QK_K == 0); + GGML_ASSERT(start % QK_K == 0); block_q4_K * block = (block_q4_K*)dst + start / QK_K; - result = wsp_ggml_quantize_q4_K(src + start, block, n, n, hist); + result = ggml_quantize_q4_K(src + start, block, n, n, hist); } break; - case WSP_GGML_TYPE_Q5_K: + case GGML_TYPE_Q5_K: { - WSP_GGML_ASSERT(start % QK_K == 0); + GGML_ASSERT(start % QK_K == 0); block_q5_K * block = (block_q5_K*)dst + start / QK_K; - result = wsp_ggml_quantize_q5_K(src + start, block, n, n, hist); + result = ggml_quantize_q5_K(src + start, block, n, n, hist); } break; - case WSP_GGML_TYPE_Q6_K: + case GGML_TYPE_Q6_K: { - WSP_GGML_ASSERT(start % QK_K == 0); + GGML_ASSERT(start % QK_K == 0); block_q6_K * block = (block_q6_K*)dst + start / QK_K; - result = wsp_ggml_quantize_q6_K(src + start, block, n, n, hist); + result = ggml_quantize_q6_K(src + start, block, n, n, hist); } break; #endif - case WSP_GGML_TYPE_F16: + case GGML_TYPE_F16: { - int elemsize = sizeof(wsp_ggml_fp16_t); - wsp_ggml_fp32_to_fp16_row(src + start, (wsp_ggml_fp16_t *)dst + start, n); + int elemsize = sizeof(ggml_fp16_t); + ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n); result = n * elemsize; } break; - case WSP_GGML_TYPE_F32: + case GGML_TYPE_F32: { int elemsize = sizeof(float); result = n * elemsize; @@ -18597,71 +19589,1179 @@ size_t wsp_ggml_quantize_chunk(enum wsp_ggml_type type, const float * src, void //////////////////////////////////////////////////////////////////////////////// -int wsp_ggml_cpu_has_avx(void) { -#if defined(__AVX__) - return 1; -#else - return 0; -#endif -} +struct gguf_str { + uint64_t n; // GGUFv2 + char * data; +}; -int wsp_ggml_cpu_has_avx2(void) { -#if defined(__AVX2__) - return 1; -#else - return 0; -#endif -} +static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = sizeof(uint8_t), + [GGUF_TYPE_INT8] = sizeof(int8_t), + [GGUF_TYPE_UINT16] = sizeof(uint16_t), + [GGUF_TYPE_INT16] = sizeof(int16_t), + [GGUF_TYPE_UINT32] = sizeof(uint32_t), + [GGUF_TYPE_INT32] = sizeof(int32_t), + [GGUF_TYPE_FLOAT32] = sizeof(float), + [GGUF_TYPE_BOOL] = sizeof(bool), + [GGUF_TYPE_STRING] = sizeof(struct gguf_str), + [GGUF_TYPE_UINT64] = sizeof(uint64_t), + [GGUF_TYPE_INT64] = sizeof(int64_t), + [GGUF_TYPE_FLOAT64] = sizeof(double), + [GGUF_TYPE_ARRAY] = 0, // undefined +}; +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); + +static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = "u8", + [GGUF_TYPE_INT8] = "i8", + [GGUF_TYPE_UINT16] = "u16", + [GGUF_TYPE_INT16] = "i16", + [GGUF_TYPE_UINT32] = "u32", + [GGUF_TYPE_INT32] = "i32", + [GGUF_TYPE_FLOAT32] = "f32", + [GGUF_TYPE_BOOL] = "bool", + [GGUF_TYPE_STRING] = "str", + [GGUF_TYPE_ARRAY] = "arr", + [GGUF_TYPE_UINT64] = "u64", + [GGUF_TYPE_INT64] = "i64", + [GGUF_TYPE_FLOAT64] = "f64", +}; +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); + +union gguf_value { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + uint64_t uint64; + int64_t int64; + double float64; + bool bool_; + + struct gguf_str str; + + struct { + enum gguf_type type; + + uint64_t n; // GGUFv2 + void * data; + } arr; +}; -int wsp_ggml_cpu_has_avx512(void) { -#if defined(__AVX512F__) - return 1; -#else - return 0; -#endif -} +struct gguf_kv { + struct gguf_str key; -int wsp_ggml_cpu_has_avx512_vbmi(void) { -#if defined(__AVX512VBMI__) - return 1; -#else - return 0; -#endif + enum gguf_type type; + union gguf_value value; +}; + +struct gguf_header { + 
uint32_t magic; + uint32_t version; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 +}; + +struct gguf_tensor_info { + struct gguf_str name; + + uint32_t n_dims; + uint64_t ne[GGML_MAX_DIMS]; + + enum ggml_type type; + + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` + + // for writing API + const void * data; + size_t size; +}; + +struct gguf_context { + struct gguf_header header; + + struct gguf_kv * kv; + struct gguf_tensor_info * infos; + + size_t alignment; + size_t offset; // offset of `data` from beginning of file + size_t size; // size of `data` in bytes + + //uint8_t * padding; + void * data; +}; + +static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { + const size_t n = fread(dst, 1, size, file); + *offset += n; + return n == size; } -int wsp_ggml_cpu_has_avx512_vnni(void) { -#if defined(__AVX512VNNI__) - return 1; -#else - return 0; -#endif +// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 +static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; } -int wsp_ggml_cpu_has_fma(void) { -#if defined(__FMA__) - return 1; -#else - return 0; -#endif +static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; } -int wsp_ggml_cpu_has_neon(void) { -#if defined(__ARM_NEON) - return 1; -#else - return 0; -#endif +struct gguf_context * gguf_init_empty(void) { + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + ctx->header.magic = GGUF_MAGIC; + ctx->header.version = GGUF_VERSION; + ctx->header.n_tensors = 0; + ctx->header.n_kv = 0; + + ctx->kv = NULL; + ctx->infos = NULL; + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + ctx->offset = 0; + ctx->size = 0; + + ctx->data = NULL; + + return ctx; } -int wsp_ggml_cpu_has_arm_fma(void) { -#if defined(__ARM_FEATURE_FMA) +struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { + FILE * file = fopen(fname, "rb"); + if (!file) { + return NULL; + } + + // offset from start of file + size_t offset = 0; + + uint32_t magic = 0; + + // check the magic before making allocations + { + gguf_fread_el(file, &magic, sizeof(magic), &offset); + + if (magic != GGUF_MAGIC) { + fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); + fclose(file); + return NULL; + } + } + + bool ok = true; + + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + // read the header + { + ctx->header.magic = magic; + + ctx->kv = NULL; + ctx->infos = NULL; + ctx->data = NULL; + + ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n_tensors = 0; + uint32_t n_kv = 0; + + ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); + ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset); + + ctx->header.n_tensors = n_tensors; + ctx->header.n_kv = n_kv; + } else { + ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, 
sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + } + + if (!ok) { + fprintf(stderr, "%s: failed to read header\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur; + if (ctx->header.version == 1) { + gguf_fread_str = gguf_fread_str_v1; + } + + // read the kv pairs + { + ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); + + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + //fprintf(stderr, "%s: reading kv %d\n", __func__, i); + + ok = ok && gguf_fread_str(file, &kv->key, &offset); + ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); + + //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); + + switch (kv->type) { + case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break; + case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break; + case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break; + case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break; + case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; + case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break; + case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break; + case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break; + case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break; + case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break; + case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; + case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break; + case GGUF_TYPE_ARRAY: + { + ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset); + kv->value.arr.n = n; + } else { + ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + } + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: + case GGUF_TYPE_BOOL: + { + kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset); + } break; + case GGUF_TYPE_STRING: + { + kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str)); + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); + } + } break; + 
case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + }; + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + }; + + if (!ok) { + break; + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + + // read the tensor infos + { + ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + info->ne[j] = 1; + } + + ok = ok && gguf_fread_str(file, &info->name, &offset); + ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); + for (uint32_t j = 0; j < info->n_dims; ++j) { + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t t = 0; + ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset); + info->ne[j] = t; + } else { + ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + } + } + ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); + ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor info\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + } + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + + int alignment_idx = gguf_find_key(ctx, "general.alignment"); + if (alignment_idx != -1) { + ctx->alignment = gguf_get_val_u32(ctx, alignment_idx); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset_pad = offset % ctx->alignment; + + if (offset_pad != 0) { + offset += ctx->alignment - offset_pad; + fseek(file, offset, SEEK_SET); + } + } + + // store the current file offset - this is where the data section starts + ctx->offset = offset; + + // compute the total size of the data section, taking into account the alignment + { + ctx->size = 0; + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const int64_t ne = + (int64_t) info->ne[0] * + (int64_t) info->ne[1] * + (int64_t) info->ne[2] * + (int64_t) info->ne[3]; + + if (ne % ggml_blck_size(info->type) != 0) { + fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", + __func__, info->name.data, ne, ggml_blck_size(info->type)); + fclose(file); + gguf_free(ctx); + return NULL; + } + + const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + + ctx->size += GGML_PAD(size_cur, ctx->alignment); + } + } + + // load the tensor data only if requested + if (params.ctx != NULL) { + // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob + // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of + // the ggml_tensor structs to the appropriate locations in the binary blob + + // compute the exact size needed for the new ggml_context + const size_t mem_size = + params.no_alloc ? 
+ (ctx->header.n_tensors )*ggml_tensor_overhead() : + (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size; + + struct ggml_init_params pdata = { + .mem_size = mem_size, + .mem_buffer = NULL, + .no_alloc = params.no_alloc, + }; + + *params.ctx = ggml_init(pdata); + + struct ggml_context * ctx_data = *params.ctx; + + struct ggml_tensor * data = NULL; + + if (!params.no_alloc) { + data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size); + + ok = ok && data != NULL; + + // read the binary blob with the tensor data + ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor data\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ctx->data = data->data; + } + + ggml_set_no_alloc(ctx_data, true); + + // create the tensors + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + const int64_t ne[GGML_MAX_DIMS] = { + ctx->infos[i].ne[0], + ctx->infos[i].ne[1], + ctx->infos[i].ne[2], + ctx->infos[i].ne[3], + }; + + struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); + + ok = ok && cur != NULL; + + ggml_set_name(cur, ctx->infos[i].name.data); + + if (!ok) { + break; + } + + // point the data member to the appropriate location in the binary blob using the tensor infos + if (!params.no_alloc) { + //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file + cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read the tensor data\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ggml_set_no_alloc(ctx_data, params.no_alloc); + } + + fclose(file); + + return ctx; +} + +void gguf_free(struct gguf_context * ctx) { + if (ctx == NULL) { + return; + } + + if (ctx->kv) { + // free string memory - not great.. 
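// Usage sketch for the reader path implemented above (an assumption based on this
// API, not code from the patch): loading a GGUF file together with its tensor data
// into a fresh ggml_context. With no_alloc == false, gguf_init_from_file() reads
// the binary blob into the context and points each tensor's data member into it:
//
//   struct ggml_context * ctx_data = NULL;
//
//   struct gguf_init_params params = {
//       /*.no_alloc =*/ false,
//       /*.ctx      =*/ &ctx_data,
//   };
//
//   struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
//   if (ctx != NULL) {
//       for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
//           const char * name = gguf_get_tensor_name(ctx, i);
//           struct ggml_tensor * t = ggml_get_tensor(ctx_data, name);
//           printf("%s: %zu bytes\n", name, ggml_nbytes(t));
//       }
//       gguf_free(ctx);
//       ggml_free(ctx_data);
//   }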
+ for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + if (kv->key.data) { + free(kv->key.data); + } + + if (kv->type == GGUF_TYPE_STRING) { + if (kv->value.str.data) { + free(kv->value.str.data); + } + } + + if (kv->type == GGUF_TYPE_ARRAY) { + if (kv->value.arr.data) { + if (kv->value.arr.type == GGUF_TYPE_STRING) { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; + if (str->data) { + free(str->data); + } + } + } + free(kv->value.arr.data); + } + } + } + + free(ctx->kv); + } + + if (ctx->infos) { + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + if (info->name.data) { + free(info->name.data); + } + } + + free(ctx->infos); + } + + GGML_ALIGNED_FREE(ctx); +} + +const char * gguf_type_name(enum gguf_type type) { + return GGUF_TYPE_NAME[type]; +} + +int gguf_get_version(const struct gguf_context * ctx) { + return ctx->header.version; +} + +size_t gguf_get_alignment(const struct gguf_context * ctx) { + return ctx->alignment; +} + +size_t gguf_get_data_offset(const struct gguf_context * ctx) { + return ctx->offset; +} + +void * gguf_get_data(const struct gguf_context * ctx) { + return ctx->data; +} + +int gguf_get_n_kv(const struct gguf_context * ctx) { + return ctx->header.n_kv; +} + +int gguf_find_key(const struct gguf_context * ctx, const char * key) { + // return -1 if key not found + int keyfound = -1; + + const int n_kv = gguf_get_n_kv(ctx); + + for (int i = 0; i < n_kv; ++i) { + if (strcmp(key, gguf_get_key(ctx, i)) == 0) { + keyfound = i; + break; + } + } + + return keyfound; +} + +const char * gguf_get_key(const struct gguf_context * ctx, int i) { + return ctx->kv[i].key.data; +} + +enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int i) { + return ctx->kv[i].type; +} + +enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.type; +} + +const void * gguf_get_arr_data(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.data; +} + +const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) { + struct gguf_kv * kv = &ctx->kv[key_id]; + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; + return str->data; +} + +int gguf_get_arr_n(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.n; +} + +uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint8; +} + +int8_t gguf_get_val_i8(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int8; +} + +uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint16; +} + +int16_t gguf_get_val_i16(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int16; +} + +uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint32; +} + +int32_t gguf_get_val_i32(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int32; +} + +float gguf_get_val_f32(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.float32; +} + +uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint64; +} + +int64_t gguf_get_val_i64(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int64; +} + +double gguf_get_val_f64(const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.float64; +} + +bool gguf_get_val_bool(const struct 
gguf_context * ctx, int i) { + return ctx->kv[i].value.bool_; +} + +const char * gguf_get_val_str (const struct gguf_context * ctx, int i) { + return ctx->kv[i].value.str.data; +} + +int gguf_get_n_tensors(const struct gguf_context * ctx) { + return ctx->header.n_tensors; +} + +int gguf_find_tensor(const struct gguf_context * ctx, const char * name) { + // return -1 if tensor not found + int tensorfound = -1; + + const int n_tensors = gguf_get_n_tensors(ctx); + + for (int i = 0; i < n_tensors; ++i) { + if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) { + tensorfound = i; + break; + } + } + + return tensorfound; +} + +size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) { + return ctx->infos[i].offset; +} + +char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) { + return ctx->infos[i].name.data; +} + +// returns the index +static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { + const int idx = gguf_find_key(ctx, key); + if (idx >= 0) { + return idx; + } + + const int n_kv = gguf_get_n_kv(ctx); + + ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv[n_kv].key.n = strlen(key); + ctx->kv[n_kv].key.data = strdup(key); + ctx->header.n_kv++; + + return n_kv; +} + +void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT8; + ctx->kv[idx].value.uint8 = val; +} + +void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT8; + ctx->kv[idx].value.int8 = val; +} + +void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT16; + ctx->kv[idx].value.uint16 = val; +} + +void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT16; + ctx->kv[idx].value.int16 = val; +} + +void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT32; + ctx->kv[idx].value.uint32 = val; +} + +void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT32; + ctx->kv[idx].value.int32 = val; +} + +void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT32; + ctx->kv[idx].value.float32 = val; +} + +void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT64; + ctx->kv[idx].value.uint64 = val; +} + +void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT64; + ctx->kv[idx].value.int64 = val; +} + +void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT64; + ctx->kv[idx].value.float64 = val; +} + +void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = 
GGUF_TYPE_BOOL; + ctx->kv[idx].value.bool_ = val; +} + +void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_STRING; + ctx->kv[idx].value.str.n = strlen(val); + ctx->kv[idx].value.str.data = strdup(val); +} + +void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = type; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]); + memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]); +} + +void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str)); + for (int i = 0; i < n; i++) { + struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; + str->n = strlen(data[i]); + str->data = strdup(data[i]); + } +} + +// set or add KV pairs from another context +void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { + for (uint32_t i = 0; i < src->header.n_kv; i++) { + switch (src->kv[i].type) { + case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break; + case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break; + case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break; + case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break; + case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break; + case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break; + case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break; + case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break; + case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break; + case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break; + case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break; + case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break; + case GGUF_TYPE_ARRAY: + { + if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { + const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { + data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; + } + gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); + free(data); + } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { + GGML_ASSERT(false && "nested arrays not supported"); + } else { + gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); + } + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + } + } +} + +void gguf_add_tensor( + struct gguf_context * ctx, + const struct ggml_tensor * tensor) { + const int idx = ctx->header.n_tensors; + ctx->infos = 
realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + + ctx->infos[idx].name.n = strlen(tensor->name); + ctx->infos[idx].name.data = strdup(tensor->name); + + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + ctx->infos[idx].ne[i] = 1; + } + + ctx->infos[idx].n_dims = tensor->n_dims; + for (int i = 0; i < tensor->n_dims; i++) { + ctx->infos[idx].ne[i] = tensor->ne[i]; + } + + ctx->infos[idx].type = tensor->type; + ctx->infos[idx].offset = 0; + ctx->infos[idx].data = tensor->data; + ctx->infos[idx].size = ggml_nbytes(tensor); + + if (ctx->header.n_tensors > 0) { + ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment); + } + + ctx->header.n_tensors++; +} + +void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) { + const int idx = gguf_find_tensor(ctx, name); + if (idx < 0) { + GGML_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].type = type; +} + +void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) { + const int idx = gguf_find_tensor(ctx, name); + if (idx < 0) { + GGML_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].data = data; + ctx->infos[idx].size = size; + + // update offsets + for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) { + ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment); + } +} + +//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) { +// fwrite(&val->n, sizeof(val->n), 1, file); +// fwrite(val->data, sizeof(char), val->n, file); +//} +// +//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) { +// fwrite(val, sizeof(char), size, file); +//} + +struct gguf_buf { + void * data; + size_t size; + size_t offset; +}; + +static struct gguf_buf gguf_buf_init(size_t size) { + struct gguf_buf buf = { + /*buf.data =*/ size == 0 ? 
NULL : malloc(size), + /*buf.size =*/ size, + /*buf.offset =*/ 0, + }; + + return buf; +} + +static void gguf_buf_free(struct gguf_buf buf) { + if (buf.data) { + free(buf.data); + } +} + +static void gguf_buf_grow(struct gguf_buf * buf, size_t size) { + if (buf->offset + size > buf->size) { + buf->size = 1.5*(buf->offset + size); + if (buf->data) { + buf->data = realloc(buf->data, buf->size); + } + } +} + +static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) { + gguf_buf_grow(buf, sizeof(val->n) + val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n)); + } + buf->offset += sizeof(val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val->data, val->n); + } + buf->offset += val->n; +} + +static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) { + gguf_buf_grow(buf, el_size); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val, el_size); + } + buf->offset += el_size; +} + +static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) { + // write header + gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic)); + gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version)); + gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors)); + gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv)); + + // write key-value pairs + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + gguf_bwrite_str(buf, &kv->key); + gguf_bwrite_el (buf, &kv->type, sizeof(kv->type)); + + switch (kv->type) { + case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break; + case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break; + case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break; + case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break; + case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break; + case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break; + case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break; + case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break; + case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break; + case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break; + case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break; + case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break; + case GGUF_TYPE_ARRAY: + { + gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type)); + gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) ); + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: + case GGUF_TYPE_BOOL: + { + gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + } break; + case GGUF_TYPE_STRING: + { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]); + } + } 
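// Worked example (illustrative): gguf_bwrite_str() emits the length prefix followed
// by the raw bytes, so with the 64-bit length field used by GGUFv2, serializing the
// 4-character string "ggml" appends sizeof(val->n) + 4 = 12 bytes to the buffer and
// advances buf->offset by the same amount; no NUL terminator is written.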
break; + case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + }; + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + }; + } + + // write tensor infos + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + gguf_bwrite_str(buf, &info->name); + gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims)); + for (uint32_t j = 0; j < info->n_dims; ++j) { + gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j])); + } + gguf_bwrite_el(buf, &info->type, sizeof(info->type)); + gguf_bwrite_el(buf, &info->offset, sizeof(info->offset)); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset = buf->offset; + const size_t offset_pad = GGML_PAD(offset, ctx->alignment); + + if (offset_pad != offset) { + uint8_t pad = 0; + for (size_t i = 0; i < offset_pad - offset; ++i) { + gguf_bwrite_el(buf, &pad, sizeof(pad)); + } + } + } + + if (only_meta) { + return; + } + + size_t offset = 0; + + // write tensor data + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const size_t size = info->size; + const size_t size_pad = GGML_PAD(size, ctx->alignment); + + gguf_bwrite_el(buf, info->data, size); + + if (size_pad != size) { + uint8_t pad = 0; + for (size_t j = 0; j < size_pad - size; ++j) { + gguf_bwrite_el(buf, &pad, sizeof(pad)); + } + } + + GGML_ASSERT(offset == info->offset); + + offset += size_pad; + } +} + +void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { + FILE * file = fopen(fname, "wb"); + if (!file) { + GGML_ASSERT(false && "failed to open file for writing"); + } + + struct gguf_buf buf = gguf_buf_init(16*1024); + + gguf_write_to_buf(ctx, &buf, only_meta); + + fwrite(buf.data, 1, buf.offset, file); + + gguf_buf_free(buf); + + fclose(file); +} + +size_t gguf_get_meta_size(const struct gguf_context * ctx) { + // no allocs - only compute size + struct gguf_buf buf = gguf_buf_init(0); + + gguf_write_to_buf(ctx, &buf, true); + + return buf.offset; +} + +void gguf_get_meta_data(const struct gguf_context * ctx, void * data) { + struct gguf_buf buf = gguf_buf_init(16*1024); + + gguf_write_to_buf(ctx, &buf, true); + + memcpy(data, buf.data, buf.offset); + + gguf_buf_free(buf); +} + +//////////////////////////////////////////////////////////////////////////////// + +int ggml_cpu_has_avx(void) { +#if defined(__AVX__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx2(void) { +#if defined(__AVX2__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512(void) { +#if defined(__AVX512F__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vbmi(void) { +#if defined(__AVX512VBMI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_avx512_vnni(void) { +#if defined(__AVX512VNNI__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_fma(void) { +#if defined(__FMA__) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_neon(void) { +#if defined(__ARM_NEON) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_arm_fma(void) { +#if defined(__ARM_FEATURE_FMA) + return 1; +#else + return 0; +#endif +} + +int ggml_cpu_has_metal(void) { +#if defined(GGML_USE_METAL) return 1; #else return 0; #endif } -int wsp_ggml_cpu_has_f16c(void) { +int ggml_cpu_has_f16c(void) { #if defined(__F16C__) return 1; #else @@ -18669,7 +20769,7 @@ int wsp_ggml_cpu_has_f16c(void) { #endif } 
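// Illustrative writer sketch for the GGUF API above (assumed usage; the key names,
// tensor name and file name are made up for the example). It creates an empty GGUF
// context, attaches two KV pairs and one tensor from a ggml context, then
// serializes metadata and tensor data to disk with gguf_write_to_file().
static void example_write_gguf(void) {
    struct ggml_init_params ip = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx_data = ggml_init(ip);

    struct ggml_tensor * w = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, 64);
    ggml_set_name(w, "example.weight");
    ggml_set_f32(w, 1.0f); // fill the tensor with ones

    struct gguf_context * gguf = gguf_init_empty();
    gguf_set_val_str(gguf, "general.name", "example-model");
    gguf_set_val_u32(gguf, "general.alignment", 32);
    gguf_add_tensor(gguf, w); // records name, shape, type, data pointer and offset

    gguf_write_to_file(gguf, "example.gguf", /*only_meta =*/ false);

    gguf_free(gguf);
    ggml_free(ctx_data);
}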
-int wsp_ggml_cpu_has_fp16_va(void) { +int ggml_cpu_has_fp16_va(void) { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) return 1; #else @@ -18677,7 +20777,7 @@ int wsp_ggml_cpu_has_fp16_va(void) { #endif } -int wsp_ggml_cpu_has_wasm_simd(void) { +int ggml_cpu_has_wasm_simd(void) { #if defined(__wasm_simd128__) return 1; #else @@ -18685,35 +20785,35 @@ int wsp_ggml_cpu_has_wasm_simd(void) { #endif } -int wsp_ggml_cpu_has_blas(void) { -#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) || defined(WSP_GGML_USE_CUBLAS) || defined(WSP_GGML_USE_CLBLAST) +int ggml_cpu_has_blas(void) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) return 1; #else return 0; #endif } -int wsp_ggml_cpu_has_cublas(void) { -#if defined(WSP_GGML_USE_CUBLAS) +int ggml_cpu_has_cublas(void) { +#if defined(GGML_USE_CUBLAS) return 1; #else return 0; #endif } -int wsp_ggml_cpu_has_clblast(void) { -#if defined(WSP_GGML_USE_CLBLAST) +int ggml_cpu_has_clblast(void) { +#if defined(GGML_USE_CLBLAST) return 1; #else return 0; #endif } -int wsp_ggml_cpu_has_gpublas(void) { - return wsp_ggml_cpu_has_cublas() || wsp_ggml_cpu_has_clblast(); +int ggml_cpu_has_gpublas(void) { + return ggml_cpu_has_cublas() || ggml_cpu_has_clblast(); } -int wsp_ggml_cpu_has_sse3(void) { +int ggml_cpu_has_sse3(void) { #if defined(__SSE3__) return 1; #else @@ -18721,7 +20821,7 @@ int wsp_ggml_cpu_has_sse3(void) { #endif } -int wsp_ggml_cpu_has_ssse3(void) { +int ggml_cpu_has_ssse3(void) { #if defined(__SSSE3__) return 1; #else @@ -18729,7 +20829,7 @@ int wsp_ggml_cpu_has_ssse3(void) { #endif } -int wsp_ggml_cpu_has_vsx(void) { +int ggml_cpu_has_vsx(void) { #if defined(__POWER9_VECTOR__) return 1; #else diff --git a/cpp/ggml.h b/cpp/ggml.h index 2d59f71..3220c32 100644 --- a/cpp/ggml.h +++ b/cpp/ggml.h @@ -32,22 +32,22 @@ // For example, here we define the function: f(x) = a*x^2 + b // // { -// struct wsp_ggml_init_params params = { +// struct ggml_init_params params = { // .mem_size = 16*1024*1024, // .mem_buffer = NULL, // }; // // // memory allocation happens here -// struct wsp_ggml_context * ctx = wsp_ggml_init(params); +// struct ggml_context * ctx = ggml_init(params); // -// struct wsp_ggml_tensor * x = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 1); +// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // -// wsp_ggml_set_param(ctx, x); // x is an input variable +// ggml_set_param(ctx, x); // x is an input variable // -// struct wsp_ggml_tensor * a = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 1); -// struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 1); -// struct wsp_ggml_tensor * x2 = wsp_ggml_mul(ctx, x, x); -// struct wsp_ggml_tensor * f = wsp_ggml_add(ctx, wsp_ggml_mul(ctx, a, x2), b); +// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// struct ggml_tensor * x2 = ggml_mul(ctx, x, x); +// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); // // ... // } @@ -58,33 +58,33 @@ // { // ... 
// -// struct wsp_ggml_cgraph gf = wsp_ggml_build_forward(f); +// struct ggml_cgraph gf = ggml_build_forward(f); // // // set the input variable and parameter values -// wsp_ggml_set_f32(x, 2.0f); -// wsp_ggml_set_f32(a, 3.0f); -// wsp_ggml_set_f32(b, 4.0f); +// ggml_set_f32(x, 2.0f); +// ggml_set_f32(a, 3.0f); +// ggml_set_f32(b, 4.0f); // -// wsp_ggml_graph_compute(ctx0, &gf); +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); // -// printf("f = %f\n", wsp_ggml_get_f32_1d(f, 0)); +// printf("f = %f\n", ggml_get_f32_1d(f, 0)); // // ... // } // -// The actual computation is performed in the wsp_ggml_graph_compute() function. +// The actual computation is performed in the ggml_graph_compute() function. // -// The wsp_ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the -// wsp_ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know +// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the +// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory -// and after defining the computation graph, call the wsp_ggml_used_mem() function to find out how much memory was +// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was // actually needed. // -// The wsp_ggml_set_param() function marks a tensor as an input variable. This is used by the automatic +// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic // differentiation and optimization algorithms. // // The described approach allows to define the function graph once and then compute its forward or backward graphs -// multiple times. All computations will use the same memory buffer allocated in the wsp_ggml_init() function. This way +// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way // the user can avoid the memory allocation overhead at runtime. // // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class @@ -95,9 +95,9 @@ // clear that the library needs to support more complex operations. The way to support these operations is not clear // yet, but a few examples are demonstrated in the following operations: // -// - wsp_ggml_permute() -// - wsp_ggml_conv_1d_1s() -// - wsp_ggml_conv_1d_2s() +// - ggml_permute() +// - ggml_conv_1d_1s() +// - ggml_conv_1d_2s() // // For each tensor operator, the library implements a forward and backward computation function. The forward function // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the @@ -108,20 +108,20 @@ // https://www.youtube.com/watch?v=wG_nF1awSSY // // -// ## Tensor data (struct wsp_ggml_tensor) +// ## Tensor data (struct ggml_tensor) // -// The tensors are stored in memory via the wsp_ggml_tensor struct. The structure provides information about the size of +// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains // pointers to the "source" tensors - i.e. 
the tensors that were used to compute the current tensor. For example: // // { -// struct wsp_ggml_tensor * c = wsp_ggml_add(ctx, a, b); +// struct ggml_tensor * c = ggml_add(ctx, a, b); // // assert(c->src[0] == a); // assert(c->src[1] == b); // } // -// The multi-dimensional tensors are stored in row-major order. The wsp_ggml_tensor struct contains fields for the +// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and // permutation. All tensor operations have to take the stride into account and not assume that the tensor is @@ -130,20 +130,23 @@ // The data of the tensor is accessed via the "data" pointer. For example: // // { -// struct wsp_ggml_tensor * a = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 2, 3); +// const int nx = 2; +// const int ny = 3; // -// // a[1, 2] = 1.0f; -// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; +// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); // -// // a[2, 0] = 2.0f; -// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y; +// } +// } // // ... // } // -// Alternatively, there are helper functions, such as wsp_ggml_get_f32_1d() and wsp_ggml_set_f32_1d() that can be used. +// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. // -// ## The matrix multiplication operator (wsp_ggml_mul_mat) +// ## The matrix multiplication operator (ggml_mul_mat) // // TODO // @@ -169,44 +172,78 @@ // // -#ifdef WSP_GGML_SHARED +#ifdef GGML_SHARED # if defined(_WIN32) && !defined(__MINGW32__) -# ifdef WSP_GGML_BUILD -# define WSP_GGML_API __declspec(dllexport) +# ifdef GGML_BUILD +# define GGML_API __declspec(dllexport) # else -# define WSP_GGML_API __declspec(dllimport) +# define GGML_API __declspec(dllimport) # endif # else -# define WSP_GGML_API __attribute__ ((visibility ("default"))) +# define GGML_API __attribute__ ((visibility ("default"))) # endif #else -# define WSP_GGML_API +# define GGML_API +#endif + +// TODO: support for clang +#ifdef __GNUC__ +# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define GGML_DEPRECATED(func, hint) func +#endif + +#ifndef __GNUC__ +# define GGML_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) +# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +# define GGML_ATTRIBUTE_FORMAT(...) 
__attribute__((format(printf, __VA_ARGS__))) #endif #include #include #include -#define WSP_GGML_FILE_MAGIC 0x67676d6c // "ggml" -#define WSP_GGML_FILE_VERSION 1 +#define GGML_FILE_MAGIC 0x67676d6c // "ggml" +#define GGML_FILE_VERSION 1 -#define WSP_GGML_QNT_VERSION 2 // bump this on quantization format changes -#define WSP_GGML_QNT_VERSION_FACTOR 1000 // do not change this +#define GGML_QNT_VERSION 2 // bump this on quantization format changes +#define GGML_QNT_VERSION_FACTOR 1000 // do not change this -#define WSP_GGML_MAX_DIMS 4 -#define WSP_GGML_MAX_NODES 4096 -#define WSP_GGML_MAX_PARAMS 256 -#define WSP_GGML_MAX_CONTEXTS 64 -#define WSP_GGML_MAX_OPT 4 -#define WSP_GGML_MAX_NAME 48 -#define WSP_GGML_DEFAULT_N_THREADS 4 +#define GGML_MAX_DIMS 4 +#define GGML_MAX_NODES 4096 +#define GGML_MAX_PARAMS 256 +#define GGML_MAX_CONTEXTS 64 +#define GGML_MAX_SRC 6 +#define GGML_MAX_NAME 64 +#define GGML_MAX_OP_PARAMS 32 +#define GGML_DEFAULT_N_THREADS 4 -#define WSP_GGML_UNUSED(x) (void)(x) +#if UINTPTR_MAX == 0xFFFFFFFF + #define GGML_MEM_ALIGN 4 +#else + #define GGML_MEM_ALIGN 16 +#endif -#define WSP_GGML_ASSERT(x) \ +#define GGML_EXIT_SUCCESS 0 +#define GGML_EXIT_ABORTED 1 + +#define GGUF_MAGIC 0x46554747 // "GGUF" +#define GGUF_VERSION 2 + +#define GGUF_DEFAULT_ALIGNMENT 32 + +#define GGML_UNUSED(x) (void)(x) + +#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) + +#define GGML_ASSERT(x) \ do { \ if (!(x)) { \ - fprintf(stderr, "WSP_GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ abort(); \ } \ } while (0) @@ -216,239 +253,285 @@ // // example: // -// WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); -// WSP_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); +// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); +// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); // -#define WSP_GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ const type prefix##0 = (pointer)->array[0]; \ - WSP_GGML_UNUSED(prefix##0); -#define WSP_GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ - WSP_GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ const type prefix##1 = (pointer)->array[1]; \ - WSP_GGML_UNUSED(prefix##1); -#define WSP_GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ - WSP_GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ const type prefix##2 = (pointer)->array[2]; \ - WSP_GGML_UNUSED(prefix##2); -#define WSP_GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ - WSP_GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ const type prefix##3 = (pointer)->array[3]; \ - WSP_GGML_UNUSED(prefix##3); + GGML_UNUSED(prefix##3); #ifdef __cplusplus extern "C" { #endif -#ifdef __ARM_NEON - // we use the built-in 16-bit float type - typedef __fp16 wsp_ggml_fp16_t; +#if defined(__ARM_NEON) && defined(__CUDACC__) + typedef half ggml_fp16_t; +#elif defined(__ARM_NEON) + typedef __fp16 ggml_fp16_t; #else - typedef uint16_t wsp_ggml_fp16_t; + typedef uint16_t ggml_fp16_t; #endif // convert FP16 <-> FP32 - WSP_GGML_API float wsp_ggml_fp16_to_fp32(wsp_ggml_fp16_t 
x); - WSP_GGML_API wsp_ggml_fp16_t wsp_ggml_fp32_to_fp16(float x); - - WSP_GGML_API void wsp_ggml_fp16_to_fp32_row(const wsp_ggml_fp16_t * x, float * y, size_t n); - WSP_GGML_API void wsp_ggml_fp32_to_fp16_row(const float * x, wsp_ggml_fp16_t * y, size_t n); - - struct wsp_ggml_object; - struct wsp_ggml_context; - - enum wsp_ggml_type { - WSP_GGML_TYPE_F32 = 0, - WSP_GGML_TYPE_F16 = 1, - WSP_GGML_TYPE_Q4_0 = 2, - WSP_GGML_TYPE_Q4_1 = 3, - // WSP_GGML_TYPE_Q4_2 = 4, support has been removed - // WSP_GGML_TYPE_Q4_3 (5) support has been removed - WSP_GGML_TYPE_Q5_0 = 6, - WSP_GGML_TYPE_Q5_1 = 7, - WSP_GGML_TYPE_Q8_0 = 8, - WSP_GGML_TYPE_Q8_1 = 9, + GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); + GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); + + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n); + GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n); + + struct ggml_object; + struct ggml_context; + + enum ggml_type { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support has been removed + // GGML_TYPE_Q4_3 (5) support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, // k-quantizations - WSP_GGML_TYPE_Q2_K = 10, - WSP_GGML_TYPE_Q3_K = 11, - WSP_GGML_TYPE_Q4_K = 12, - WSP_GGML_TYPE_Q5_K = 13, - WSP_GGML_TYPE_Q6_K = 14, - WSP_GGML_TYPE_Q8_K = 15, - WSP_GGML_TYPE_I8, - WSP_GGML_TYPE_I16, - WSP_GGML_TYPE_I32, - WSP_GGML_TYPE_COUNT, + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_COUNT, }; - enum wsp_ggml_backend { - WSP_GGML_BACKEND_CPU = 0, - WSP_GGML_BACKEND_GPU = 10, - WSP_GGML_BACKEND_GPU_SPLIT = 20, + enum ggml_backend { + GGML_BACKEND_CPU = 0, + GGML_BACKEND_GPU = 10, + GGML_BACKEND_GPU_SPLIT = 20, }; // model file types - enum wsp_ggml_ftype { - WSP_GGML_FTYPE_UNKNOWN = -1, - WSP_GGML_FTYPE_ALL_F32 = 0, - WSP_GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - WSP_GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - WSP_GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - WSP_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - WSP_GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - WSP_GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - WSP_GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors - WSP_GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors - WSP_GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors - WSP_GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors - WSP_GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors - WSP_GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors + enum ggml_ftype { + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors + GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors }; // available tensor operations: - enum wsp_ggml_op { - 
WSP_GGML_OP_NONE = 0, - - WSP_GGML_OP_DUP, - WSP_GGML_OP_ADD, - WSP_GGML_OP_ADD1, - WSP_GGML_OP_ACC, - WSP_GGML_OP_SUB, - WSP_GGML_OP_MUL, - WSP_GGML_OP_DIV, - WSP_GGML_OP_SQR, - WSP_GGML_OP_SQRT, - WSP_GGML_OP_LOG, - WSP_GGML_OP_SUM, - WSP_GGML_OP_SUM_ROWS, - WSP_GGML_OP_MEAN, - WSP_GGML_OP_ARGMAX, - WSP_GGML_OP_REPEAT, - WSP_GGML_OP_REPEAT_BACK, - WSP_GGML_OP_ABS, - WSP_GGML_OP_SGN, - WSP_GGML_OP_NEG, - WSP_GGML_OP_STEP, - WSP_GGML_OP_TANH, - WSP_GGML_OP_ELU, - WSP_GGML_OP_RELU, - WSP_GGML_OP_GELU, - WSP_GGML_OP_GELU_QUICK, - WSP_GGML_OP_SILU, - WSP_GGML_OP_SILU_BACK, - WSP_GGML_OP_NORM, // normalize - WSP_GGML_OP_RMS_NORM, - WSP_GGML_OP_RMS_NORM_BACK, - - WSP_GGML_OP_MUL_MAT, - WSP_GGML_OP_OUT_PROD, - - WSP_GGML_OP_SCALE, - WSP_GGML_OP_SET, - WSP_GGML_OP_CPY, - WSP_GGML_OP_CONT, - WSP_GGML_OP_RESHAPE, - WSP_GGML_OP_VIEW, - WSP_GGML_OP_PERMUTE, - WSP_GGML_OP_TRANSPOSE, - WSP_GGML_OP_GET_ROWS, - WSP_GGML_OP_GET_ROWS_BACK, - WSP_GGML_OP_DIAG, - WSP_GGML_OP_DIAG_MASK_INF, - WSP_GGML_OP_DIAG_MASK_ZERO, - WSP_GGML_OP_SOFT_MAX, - WSP_GGML_OP_SOFT_MAX_BACK, - WSP_GGML_OP_ROPE, - WSP_GGML_OP_ROPE_BACK, - WSP_GGML_OP_ALIBI, - WSP_GGML_OP_CLAMP, - WSP_GGML_OP_CONV_1D, - WSP_GGML_OP_CONV_2D, - - WSP_GGML_OP_FLASH_ATTN, - WSP_GGML_OP_FLASH_FF, - WSP_GGML_OP_FLASH_ATTN_BACK, - WSP_GGML_OP_WIN_PART, - WSP_GGML_OP_WIN_UNPART, - - WSP_GGML_OP_MAP_UNARY, - WSP_GGML_OP_MAP_BINARY, - - WSP_GGML_OP_MAP_CUSTOM1, - WSP_GGML_OP_MAP_CUSTOM2, - WSP_GGML_OP_MAP_CUSTOM3, - - WSP_GGML_OP_CROSS_ENTROPY_LOSS, - WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK, - - WSP_GGML_OP_COUNT, + enum ggml_op { + GGML_OP_NONE = 0, + + GGML_OP_DUP, + GGML_OP_ADD, + GGML_OP_ADD1, + GGML_OP_ACC, + GGML_OP_SUB, + GGML_OP_MUL, + GGML_OP_DIV, + GGML_OP_SQR, + GGML_OP_SQRT, + GGML_OP_LOG, + GGML_OP_SUM, + GGML_OP_SUM_ROWS, + GGML_OP_MEAN, + GGML_OP_ARGMAX, + GGML_OP_REPEAT, + GGML_OP_REPEAT_BACK, + GGML_OP_CONCAT, + GGML_OP_SILU_BACK, + GGML_OP_NORM, // normalize + GGML_OP_RMS_NORM, + GGML_OP_RMS_NORM_BACK, + GGML_OP_GROUP_NORM, + + GGML_OP_MUL_MAT, + GGML_OP_OUT_PROD, + + GGML_OP_SCALE, + GGML_OP_SET, + GGML_OP_CPY, + GGML_OP_CONT, + GGML_OP_RESHAPE, + GGML_OP_VIEW, + GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, + GGML_OP_GET_ROWS_BACK, + GGML_OP_DIAG, + GGML_OP_DIAG_MASK_INF, + GGML_OP_DIAG_MASK_ZERO, + GGML_OP_SOFT_MAX, + GGML_OP_SOFT_MAX_BACK, + GGML_OP_ROPE, + GGML_OP_ROPE_BACK, + GGML_OP_ALIBI, + GGML_OP_CLAMP, + GGML_OP_CONV_1D, + GGML_OP_CONV_2D, + GGML_OP_CONV_TRANSPOSE_2D, + GGML_OP_POOL_1D, + GGML_OP_POOL_2D, + + GGML_OP_UPSCALE, // nearest interpolate + + GGML_OP_FLASH_ATTN, + GGML_OP_FLASH_FF, + GGML_OP_FLASH_ATTN_BACK, + GGML_OP_WIN_PART, + GGML_OP_WIN_UNPART, + GGML_OP_GET_REL_POS, + GGML_OP_ADD_REL_POS, + + GGML_OP_UNARY, + + GGML_OP_MAP_UNARY, + GGML_OP_MAP_BINARY, + + GGML_OP_MAP_CUSTOM1_F32, + GGML_OP_MAP_CUSTOM2_F32, + GGML_OP_MAP_CUSTOM3_F32, + + GGML_OP_MAP_CUSTOM1, + GGML_OP_MAP_CUSTOM2, + GGML_OP_MAP_CUSTOM3, + + GGML_OP_CROSS_ENTROPY_LOSS, + GGML_OP_CROSS_ENTROPY_LOSS_BACK, + + GGML_OP_COUNT, }; + enum ggml_unary_op { + GGML_UNARY_OP_ABS, + GGML_UNARY_OP_SGN, + GGML_UNARY_OP_NEG, + GGML_UNARY_OP_STEP, + GGML_UNARY_OP_TANH, + GGML_UNARY_OP_ELU, + GGML_UNARY_OP_RELU, + GGML_UNARY_OP_GELU, + GGML_UNARY_OP_GELU_QUICK, + GGML_UNARY_OP_SILU, + }; + + enum ggml_object_type { + GGML_OBJECT_TENSOR, + GGML_OBJECT_GRAPH, + GGML_OBJECT_WORK_BUFFER + }; // ggml object - struct wsp_ggml_object { + struct ggml_object { size_t offs; size_t size; - struct wsp_ggml_object * next; + struct ggml_object * next; + + enum 
ggml_object_type type; - char padding[8]; + char padding[4]; }; - static const size_t WSP_GGML_OBJECT_SIZE = sizeof(struct wsp_ggml_object); + static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); // n-dimensional tensor - struct wsp_ggml_tensor { - enum wsp_ggml_type type; - enum wsp_ggml_backend backend; + struct ggml_tensor { + enum ggml_type type; + enum ggml_backend backend; int n_dims; - int64_t ne[WSP_GGML_MAX_DIMS]; // number of elements - size_t nb[WSP_GGML_MAX_DIMS]; // stride in bytes: + int64_t ne[GGML_MAX_DIMS]; // number of elements + size_t nb[GGML_MAX_DIMS]; // stride in bytes: // nb[0] = sizeof(type) // nb[1] = nb[0] * ne[0] + padding // nb[i] = nb[i-1] * ne[i-1] // compute data - enum wsp_ggml_op op; + enum ggml_op op; - bool is_param; + // op params - allocated as int32_t for alignment + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; - struct wsp_ggml_tensor * grad; - struct wsp_ggml_tensor * src0; - struct wsp_ggml_tensor * src1; - struct wsp_ggml_tensor * opt[WSP_GGML_MAX_OPT]; + bool is_param; - // thread scheduling - int n_tasks; + struct ggml_tensor * grad; + struct ggml_tensor * src[GGML_MAX_SRC]; // performance int perf_runs; int64_t perf_cycles; int64_t perf_time_us; + struct ggml_tensor * view_src; + size_t view_offs; + void * data; - char name[WSP_GGML_MAX_NAME]; + char name[GGML_MAX_NAME]; void * extra; // extra things e.g. for ggml-cuda.cu char padding[4]; }; - static const size_t WSP_GGML_TENSOR_SIZE = sizeof(struct wsp_ggml_tensor); + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + + // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes + int n_tasks[GGML_MAX_NODES]; + + // abort ggml_graph_compute when true + bool (*abort_callback)(void * data); + void * abort_callback_data; + }; + + // next prime after GGML_MAX_NODES + // #define GGML_GRAPH_HASHTABLE_SIZE 4099 + // next prime after GGML_MAX_NODES * 2 (nodes + leafs) + #define GGML_GRAPH_HASHTABLE_SIZE 8273 // computation graph - struct wsp_ggml_cgraph { + struct ggml_cgraph { int n_nodes; int n_leafs; - int n_threads; - size_t work_size; - struct wsp_ggml_tensor * work; + struct ggml_tensor * nodes[GGML_MAX_NODES]; + struct ggml_tensor * grads[GGML_MAX_NODES]; + struct ggml_tensor * leafs[GGML_MAX_NODES]; - struct wsp_ggml_tensor * nodes[WSP_GGML_MAX_NODES]; - struct wsp_ggml_tensor * grads[WSP_GGML_MAX_NODES]; - struct wsp_ggml_tensor * leafs[WSP_GGML_MAX_NODES]; + void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE]; // performance int perf_runs; @@ -456,14 +539,16 @@ extern "C" { int64_t perf_time_us; }; + static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph); + // scratch buffer - struct wsp_ggml_scratch { + struct ggml_scratch { size_t offs; size_t size; void * data; }; - struct wsp_ggml_init_params { + struct ggml_init_params { // memory pool size_t mem_size; // bytes void * mem_buffer; // if NULL, memory will be allocated internally @@ -475,14 +560,14 @@ extern "C" { // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. 
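// Minimal sketch of the compute-plan workflow described above. The exact signatures
// of ggml_graph_plan() and ggml_graph_compute() are assumed from the comments on
// struct ggml_cplan (they are not declared in this excerpt):
//
//   struct ggml_cgraph gf = ggml_build_forward(f);
//
//   struct ggml_cplan plan = ggml_graph_plan(&gf, /*n_threads =*/ 4);
//   if (plan.work_size > 0) {
//       plan.work_data = malloc(plan.work_size); // caller owns the work buffer
//   }
//
//   ggml_graph_compute(&gf, &plan);
//
//   free(plan.work_data);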
- enum wsp_ggml_task_type { - WSP_GGML_TASK_INIT = 0, - WSP_GGML_TASK_COMPUTE, - WSP_GGML_TASK_FINALIZE, + enum ggml_task_type { + GGML_TASK_INIT = 0, + GGML_TASK_COMPUTE, + GGML_TASK_FINALIZE, }; - struct wsp_ggml_compute_params { - enum wsp_ggml_task_type type; + struct ggml_compute_params { + enum ggml_task_type type; // ith = thread index, nth = number of threads int ith, nth; @@ -494,506 +579,554 @@ extern "C" { // misc - WSP_GGML_API void wsp_ggml_time_init(void); // call this once at the beginning of the program - WSP_GGML_API int64_t wsp_ggml_time_ms(void); - WSP_GGML_API int64_t wsp_ggml_time_us(void); - WSP_GGML_API int64_t wsp_ggml_cycles(void); - WSP_GGML_API int64_t wsp_ggml_cycles_per_ms(void); + GGML_API void ggml_time_init(void); // call this once at the beginning of the program + GGML_API int64_t ggml_time_ms(void); + GGML_API int64_t ggml_time_us(void); + GGML_API int64_t ggml_cycles(void); + GGML_API int64_t ggml_cycles_per_ms(void); - WSP_GGML_API void wsp_ggml_numa_init(void); // call once for better performance on NUMA systems - WSP_GGML_API bool wsp_ggml_is_numa(void); // true if init detected that system has >1 NUMA node + GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems + GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node - WSP_GGML_API void wsp_ggml_print_object (const struct wsp_ggml_object * obj); - WSP_GGML_API void wsp_ggml_print_objects(const struct wsp_ggml_context * ctx); + GGML_API void ggml_print_object (const struct ggml_object * obj); + GGML_API void ggml_print_objects(const struct ggml_context * ctx); - WSP_GGML_API int64_t wsp_ggml_nelements (const struct wsp_ggml_tensor * tensor); - WSP_GGML_API int64_t wsp_ggml_nrows (const struct wsp_ggml_tensor * tensor); - WSP_GGML_API size_t wsp_ggml_nbytes (const struct wsp_ggml_tensor * tensor); - WSP_GGML_API size_t wsp_ggml_nbytes_split(const struct wsp_ggml_tensor * tensor, int nrows_split); + GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); + GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN + GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); - WSP_GGML_API int wsp_ggml_blck_size (enum wsp_ggml_type type); - WSP_GGML_API size_t wsp_ggml_type_size (enum wsp_ggml_type type); // size in bytes for all elements in a block - WSP_GGML_API float wsp_ggml_type_sizef(enum wsp_ggml_type type); // wsp_ggml_type_size()/wsp_ggml_blck_size() as float + GGML_API int ggml_blck_size (enum ggml_type type); + GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block + GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float - WSP_GGML_API const char * wsp_ggml_type_name(enum wsp_ggml_type type); - WSP_GGML_API const char * wsp_ggml_op_name (enum wsp_ggml_op op); + GGML_API const char * ggml_type_name(enum ggml_type type); + GGML_API const char * ggml_op_name (enum ggml_op op); + GGML_API const char * ggml_op_symbol(enum ggml_op op); - WSP_GGML_API size_t wsp_ggml_element_size(const struct wsp_ggml_tensor * tensor); + GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); - WSP_GGML_API bool wsp_ggml_is_quantized(enum wsp_ggml_type type); + GGML_API bool ggml_is_quantized(enum ggml_type type); // 
TODO: temporary until model loading of ggml examples is refactored - WSP_GGML_API enum wsp_ggml_type wsp_ggml_ftype_to_wsp_ggml_type(enum wsp_ggml_ftype ftype); + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + + GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); - WSP_GGML_API bool wsp_ggml_is_transposed(const struct wsp_ggml_tensor * tensor); - WSP_GGML_API bool wsp_ggml_is_contiguous(const struct wsp_ggml_tensor * tensor); - WSP_GGML_API bool wsp_ggml_is_permuted (const struct wsp_ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); // use this to compute the memory overhead of a tensor - WSP_GGML_API size_t wsp_ggml_tensor_overhead(void); + GGML_API size_t ggml_tensor_overhead(void); // main - WSP_GGML_API struct wsp_ggml_context * wsp_ggml_init(struct wsp_ggml_init_params params); - WSP_GGML_API void wsp_ggml_free(struct wsp_ggml_context * ctx); + GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); + GGML_API void ggml_free(struct ggml_context * ctx); - WSP_GGML_API size_t wsp_ggml_used_mem(const struct wsp_ggml_context * ctx); + GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); - WSP_GGML_API size_t wsp_ggml_set_scratch (struct wsp_ggml_context * ctx, struct wsp_ggml_scratch scratch); - WSP_GGML_API void wsp_ggml_set_no_alloc(struct wsp_ggml_context * ctx, bool no_alloc); + GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx); + GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); - WSP_GGML_API void * wsp_ggml_get_mem_buffer (const struct wsp_ggml_context * ctx); - WSP_GGML_API size_t wsp_ggml_get_mem_size (const struct wsp_ggml_context * ctx); - WSP_GGML_API size_t wsp_ggml_get_max_tensor_size(const struct wsp_ggml_context * ctx); + GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx); + GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_tensor( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, + GGML_API struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, int n_dims, const int64_t *ne); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_tensor_1d( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, + GGML_API struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, int64_t ne0); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_tensor_2d( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, + GGML_API struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, int64_t ne0, int64_t ne1); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_tensor_3d( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, + GGML_API struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_tensor_4d( - struct wsp_ggml_context * ctx, - enum wsp_ggml_type type, + GGML_API struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, int64_t 
ne0, int64_t ne1, int64_t ne2, int64_t ne3); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_i32(struct wsp_ggml_context * ctx, int32_t value); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_f32(struct wsp_ggml_context * ctx, float value); + GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); + GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_dup_tensor (struct wsp_ggml_context * ctx, const struct wsp_ggml_tensor * src); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_view_tensor(struct wsp_ggml_context * ctx, const struct wsp_ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_tensor(struct wsp_ggml_context * ctx, const char * name); + GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_zero(struct wsp_ggml_tensor * tensor); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_i32 (struct wsp_ggml_tensor * tensor, int32_t value); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_f32 (struct wsp_ggml_tensor * tensor, float value); + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); + GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); - WSP_GGML_API int32_t wsp_ggml_get_i32_1d(const struct wsp_ggml_tensor * tensor, int i); - WSP_GGML_API void wsp_ggml_set_i32_1d(const struct wsp_ggml_tensor * tensor, int i, int32_t value); + GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); - WSP_GGML_API float wsp_ggml_get_f32_1d(const struct wsp_ggml_tensor * tensor, int i); - WSP_GGML_API void wsp_ggml_set_f32_1d(const struct wsp_ggml_tensor * tensor, int i, float value); + GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); + GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); - WSP_GGML_API void * wsp_ggml_get_data (const struct wsp_ggml_tensor * tensor); - WSP_GGML_API float * wsp_ggml_get_data_f32(const struct wsp_ggml_tensor * tensor); + GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); + GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); - WSP_GGML_API const char * wsp_ggml_get_name(const struct wsp_ggml_tensor * tensor); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_name(struct wsp_ggml_tensor * tensor, const char * name); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_format_name(struct wsp_ggml_tensor * tensor, const char * fmt, ...); + GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + + GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); + GGML_ATTRIBUTE_FORMAT(2, 3) + GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); // // operations on tensors with backpropagation // - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_dup( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * 
a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add1( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add1_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_acc( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, + GGML_API struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_acc_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, + GGML_API struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sub( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sub_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_mul( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_mul_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_div( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_div_inplace( - struct wsp_ggml_context * ctx, - struct 
wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sqr( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sqr_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sqrt( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sqrt_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_log( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_log_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); // return scalar - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sum( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a); // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sum_rows( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a); // mean along rows - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_mean( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a); // argmax along rows - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_argmax( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a); // if a is the same shape as b, and a is not parameter, return a // otherwise, return a new tensor: repeat(a) to fit in b - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_repeat( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_repeat_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_abs( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_abs_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sgn( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sgn_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_neg( - struct wsp_ggml_context * 
ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_neg_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_step( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_step_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_tanh( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_tanh_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_elu( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_elu_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_relu( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_relu_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // concat a and b on dim 2 + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_concat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); // TODO: double-check this computation is correct - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gelu( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gelu_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + 
struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gelu_quick( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gelu_quick_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_silu( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_silu( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_silu_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); // a - x // b - dy - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_silu_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); // normalize along rows - // TODO: eps is hardcoded to 1e-5 for now - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_norm( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_norm_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rms_norm( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rms_norm_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + // group normalize along ne0*ne1*n_groups + // used in stable-diffusion + // TODO: eps is hardcoded to 1e-6 for now + GGML_API struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); + + GGML_API struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups); // a - x // b - dy - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rms_norm_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float eps); // A: n columns, m rows // B: n columns, p rows (i.e. 
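Note the call-site impact of the normalization changes above: ggml_norm, ggml_rms_norm and ggml_rms_norm_back now take eps explicitly instead of the previously hardcoded 1e-5, and ggml_group_norm is new. A hedged sketch follows; the eps values and group count are illustrative.

    // sketch of the call-site change (eps values and group count are illustrative)
    static struct ggml_tensor * norm_example(struct ggml_context * ctx, struct ggml_tensor * x) {
        // previously: ggml_norm(ctx, x) with eps hardcoded to 1e-5
        struct ggml_tensor * cur = ggml_norm(ctx, x, 1e-5f);

        // rms_norm also takes eps now
        cur = ggml_rms_norm(ctx, cur, 1e-6f);

        // new op: group normalization over ne0*ne1*n_groups (eps still hardcoded per the comment)
        return ggml_group_norm(ctx, cur, 32);
    }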
we transpose it internally) // result is m columns, p rows - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_mul_mat( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); // A: m columns, n rows, // B: p columns, n rows, // result is m columns, p rows - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_out_prod( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); // // operations on tensors without backpropagation // - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_scale( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); // in-place, returns view(a) - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_scale_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); // b -> view(a,offset,nb1,nb2,3), return modified a - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, + GGML_API struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset); // b -> view(a,offset,nb1,nb2,3), return view(a) - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, + GGML_API struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_1d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, + GGML_API struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t offset); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_1d_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, + GGML_API struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t offset); // b -> view(a,offset,nb1,nb2,3), return modified a - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_2d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, + GGML_API struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t offset); // b -> view(a,offset,nb1,nb2,3), return view(a) - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_2d_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, + GGML_API struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, size_t nb1, size_t offset); // a -> b, return view(b) - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cpy( - struct wsp_ggml_context * ctx, - 
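The shape convention in the comment above is easy to misread, so here is a small sketch of ggml_mul_mat with concrete, purely illustrative sizes.

    // ne[0] is the inner (column) dimension; ggml_mul_mat transposes b internally
    static struct ggml_tensor * mul_mat_example(struct ggml_context * ctx) {
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 32); // n = 64 cols, m = 32 rows
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 16); // n = 64 cols, p = 16 rows
        return ggml_mul_mat(ctx, a, b); // result ne = [32, 16], i.e. m columns, p rows
    }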
struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // a -> b, in-place, return view(b) + GGML_API struct ggml_tensor * ggml_cpy_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); // make contiguous - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // make contiguous, in-place + GGML_API struct ggml_tensor * ggml_cont_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); // return view(a), b specifies the new shape // TODO: when we start computing gradient, make a copy instead of view - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_reshape( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); // return view(a) // TODO: when we start computing gradient, make a copy instead of view - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_reshape_1d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_reshape_2d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1); // return view(a) // TODO: when we start computing gradient, make a copy instead of view - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_reshape_3d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_reshape_4d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3); // offset in bytes - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_view_1d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, size_t offset); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_view_2d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1, size_t nb1, // row stride in bytes size_t offset); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_view_3d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2, @@ -1001,9 +1134,9 @@ extern "C" { size_t nb2, // slice stride in bytes size_t offset); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_view_4d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, int64_t ne0, int64_t ne1, int64_t ne2, @@ -1013,138 +1146,183 
@@ extern "C" { size_t nb3, size_t offset); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_permute( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, int axis0, int axis1, int axis2, int axis3); - // alias for wsp_ggml_permute(ctx, a, 1, 0, 2, 3) - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_transpose( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + // alias for ggml_permute(ctx, a, 1, 0, 2, 3) + GGML_API struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_rows( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_rows_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * c); + GGML_API struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_diag( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a); // set elements above the diagonal to -INF - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_diag_mask_inf( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past); // in-place, returns view(a) - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_diag_mask_inf_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past); // set elements above the diagonal to 0 - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_diag_mask_zero( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past); // in-place, returns view(a) - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_diag_mask_zero_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a); // in-place, returns view(a) - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_soft_max_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); // in-place, returns view(a) - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_back_inplace( - struct 
wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); // rotary position embedding // if mode & 1 == 1, skip n_past elements // if mode & 2 == 1, GPT-NeoX style // if mode & 4 == 1, ChatGLM style // TODO: avoid creating a new tensor every time - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, int n_dims, int mode, int n_ctx); // in-place, returns view(a) - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_inplace( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, int n_dims, int mode, int n_ctx); + // custom RoPE + GGML_API struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + int mode, + int n_ctx, + float freq_base, + float freq_scale); + + // xPos RoPE, in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_xpos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_dims, + float base, + bool down); + // rotary position embedding backward, i.e compute dx from dy // a - dy - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_rope_back( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, int n_dims, - int mode); + int mode, + int n_ctx, + float freq_base, + float freq_scale, + float xpos_base, + bool xpos_down); // alibi position embedding // in-place, returns view(a) - struct wsp_ggml_tensor * wsp_ggml_alibi( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, int n_past, int n_head, float bias_max); // clamp // in-place, returns view(a) - struct wsp_ggml_tensor * wsp_ggml_clamp( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, float min, float max); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, + GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, int s0, // stride int p0, // padding int d0); // dilation - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_2d( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, + // conv_1d with padding = half + // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_API struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d); + + GGML_API struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, int s0, int s1, int p0, @@ -1152,37 +1330,93 @@ extern "C" { int 
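The new ggml_rope_custom* variants extend the standard RoPE call with freq_base and freq_scale. A hedged sketch with illustrative values; 10000.0 and 1.0 are the commonly used defaults and mode 0 selects the plain GPT-style rotation.

    // sketch of the custom RoPE variant (values illustrative)
    static struct ggml_tensor * rope_example(struct ggml_context * ctx, struct ggml_tensor * q,
                                             int n_past, int n_rot, int n_ctx) {
        return ggml_rope_custom_inplace(ctx, q, n_past, n_rot, /*mode =*/ 0, n_ctx,
                                        /*freq_base  =*/ 10000.0f,   // commonly used default
                                        /*freq_scale =*/ 1.0f);      // no context scaling
    }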
d0, int d1); - // conv_1d with padding = half - // alias for wsp_ggml_conv_1d(a, b, s, a->ne[0]/2, d) - WSP_GGML_API struct wsp_ggml_tensor* wsp_ggml_conv_1d_ph( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - int s, - int d); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * q, - struct wsp_ggml_tensor * k, - struct wsp_ggml_tensor * v, + // kernel size is a->ne[0] x a->ne[1] + // stride is equal to kernel size + // padding is zero + // example: + // a: 16 16 3 768 + // b: 1024 1024 3 1 + // res: 64 64 768 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // kernel size is a->ne[0] x a->ne[1] + // stride is 1 + // padding is half + // example: + // a: 3 3 256 256 + // b: 64 64 256 1 + // res: 64 64 256 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride); + + enum ggml_op_pool { + GGML_OP_POOL_MAX, + GGML_OP_POOL_AVG, + GGML_OP_POOL_COUNT, + }; + + GGML_API struct ggml_tensor * ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + + GGML_API struct ggml_tensor * ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + int p0, + int p1); + + // nearest interpolate + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor); + + GGML_API struct ggml_tensor * ggml_flash_attn( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, bool masked); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * q, - struct wsp_ggml_tensor * k, - struct wsp_ggml_tensor * v, - struct wsp_ggml_tensor * d, + GGML_API struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, bool masked); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_ff( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b0, - struct wsp_ggml_tensor * b1, - struct wsp_ggml_tensor * c0, - struct wsp_ggml_tensor * c1); + GGML_API struct ggml_tensor * ggml_flash_ff( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b0, + struct ggml_tensor * b1, + struct ggml_tensor * c0, + struct ggml_tensor * c1); // partition into non-overlapping windows with padding if needed // example: @@ -1190,167 +1424,280 @@ extern "C" { // w: 14 // res: 768 14 14 25 // used in sam - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_win_part( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + GGML_API struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, int w); - // reverse of wsp_ggml_win_part + // reverse of ggml_win_part // used in sam - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_win_unpart( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, + 
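A brief sketch of the new pooling and upscale ops declared above; the kernel, stride and padding values are illustrative only.

    // sketch of the new pooling/upscale ops
    static struct ggml_tensor * pool_example(struct ggml_context * ctx, struct ggml_tensor * x) {
        // 2x2 average pooling with stride 2 and no padding
        struct ggml_tensor * p = ggml_pool_2d(ctx, x, GGML_OP_POOL_AVG,
                                              /*k0 =*/ 2, /*k1 =*/ 2,
                                              /*s0 =*/ 2, /*s1 =*/ 2,
                                              /*p0 =*/ 0, /*p1 =*/ 0);
        // nearest-neighbour upscale by a factor of 2
        return ggml_upscale(ctx, p, 2);
    }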
GGML_API struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, int w0, int h0, int w); + GGML_API struct ggml_tensor * ggml_unary( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + + GGML_API struct ggml_tensor * ggml_unary_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + + // used in sam + GGML_API struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh); + + // used in sam + + GGML_API struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + // custom operators - typedef void (*wsp_ggml_unary_op_f32_t) (const int, float *, const float *); - typedef void (*wsp_ggml_binary_op_f32_t)(const int, float *, const float *, const float *); - - typedef void (*wsp_ggml_custom1_op_f32_t)(struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *); - typedef void (*wsp_ggml_custom2_op_f32_t)(struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *); - typedef void (*wsp_ggml_custom3_op_f32_t)(struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_unary_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - wsp_ggml_unary_op_f32_t fun); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_unary_inplace_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - wsp_ggml_unary_op_f32_t fun); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_binary_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - wsp_ggml_binary_op_f32_t fun); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_binary_inplace_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - wsp_ggml_binary_op_f32_t fun); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom1_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - wsp_ggml_custom1_op_f32_t fun); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom1_inplace_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - wsp_ggml_custom1_op_f32_t fun); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom2_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - wsp_ggml_custom2_op_f32_t fun); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom2_inplace_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - wsp_ggml_custom2_op_f32_t fun); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom3_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * c, - wsp_ggml_custom3_op_f32_t fun); - - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom3_inplace_f32( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * c, - wsp_ggml_custom3_op_f32_t fun); + typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); + typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, 
const float *); + + typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_unary_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_binary_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_f32_t fun), + "use ggml_map_custom1_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_f32_t fun), + "use ggml_map_custom2_inplace instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3 instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_f32_t fun), + "use ggml_map_custom3_inplace instead"); + + // custom operators v2 + + typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); + typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); + typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); + + #define GGML_N_TASKS_MAX -1 + + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * 
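The deprecated ggml_map_*_f32 operators are superseded by the v2 custom operators above, whose callbacks receive a thread index and an opaque userdata pointer. A hedged, single-threaded sketch; square_op is a made-up kernel name.

    // the callback signature comes from ggml_custom1_op_t above
    static void square_op(struct ggml_tensor * dst, const struct ggml_tensor * a,
                          int ith, int nth, void * userdata) {
        (void) nth; (void) userdata;
        if (ith != 0) return;   // single-threaded sketch; a real kernel splits work across nth threads
        float       * d = (float *)       dst->data;
        const float * s = (const float *) a->data;
        for (int64_t i = 0; i < ggml_nelements(dst); i++) {
            d[i] = s[i]*s[i];
        }
    }

    // usage sketch: GGML_N_TASKS_MAX lets ggml pick the thread count
    //   struct ggml_tensor * y = ggml_map_custom1(ctx, x, square_op, GGML_N_TASKS_MAX, NULL);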
a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); // loss function - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cross_entropy_loss( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cross_entropy_loss_back( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * a, - struct wsp_ggml_tensor * b, - struct wsp_ggml_tensor * c); + GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c); // // automatic differentiation // - WSP_GGML_API void wsp_ggml_set_param( - struct wsp_ggml_context * ctx, - struct wsp_ggml_tensor * tensor); + GGML_API void ggml_set_param( + struct ggml_context * ctx, + struct ggml_tensor * tensor); + + + GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); + + GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); + GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - WSP_GGML_API void wsp_ggml_build_forward_expand(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * tensor); + // graph allocation in a context + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); + GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API size_t ggml_graph_overhead(void); - WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_build_forward (struct wsp_ggml_tensor * tensor); - WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_build_backward(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * gf, bool keep); + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); - WSP_GGML_API void wsp_ggml_graph_compute(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph); - WSP_GGML_API void wsp_ggml_graph_reset (struct wsp_ggml_cgraph * cgraph); + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void 
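The graph API change above is the one most likely to affect callers: ggml_graph_compute no longer takes a context and instead needs a plan whose work buffer the caller allocates, which is exactly what the new ggml_graph_compute_helper in whisper.cpp does further down in this diff. A minimal sketch, assuming <stdint.h> and <stdlib.h>:

    // mirrors the ggml_graph_compute_helper added to whisper.cpp below
    static void compute_example(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

        uint8_t * work = NULL;
        if (plan.work_size > 0) {
            work = (uint8_t *) malloc(plan.work_size);  // caller owns the work buffer
            plan.work_data = work;
        }

        ggml_graph_compute(gf, &plan);
        free(work);
    }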
ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); - WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_get_tensor(struct wsp_ggml_cgraph * cgraph, const char * name); + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); - WSP_GGML_API void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * fname); - WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml_context ** ctx_data, struct wsp_ggml_context ** ctx_eval); + GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); + GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); // print info and performance information for the graph - WSP_GGML_API void wsp_ggml_graph_print(const struct wsp_ggml_cgraph * cgraph); + GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); // dump the graph into a file using the dot format - WSP_GGML_API void wsp_ggml_graph_dump_dot(const struct wsp_ggml_cgraph * gb, const struct wsp_ggml_cgraph * gf, const char * filename); + GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); // // optimization // // optimization methods - enum wsp_ggml_opt_type { - WSP_GGML_OPT_ADAM, - WSP_GGML_OPT_LBFGS, + enum ggml_opt_type { + GGML_OPT_ADAM, + GGML_OPT_LBFGS, }; // linesearch methods - enum wsp_ggml_linesearch { - WSP_GGML_LINESEARCH_DEFAULT = 1, + enum ggml_linesearch { + GGML_LINESEARCH_DEFAULT = 1, - WSP_GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, - WSP_GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, - WSP_GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, + GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, + GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, + GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, }; // optimization return values - enum wsp_ggml_opt_result { - WSP_GGML_OPT_OK = 0, - WSP_GGML_OPT_DID_NOT_CONVERGE, - WSP_GGML_OPT_NO_CONTEXT, - WSP_GGML_OPT_INVALID_WOLFE, - WSP_GGML_OPT_FAIL, - - WSP_GGML_LINESEARCH_FAIL = -128, - WSP_GGML_LINESEARCH_MINIMUM_STEP, - WSP_GGML_LINESEARCH_MAXIMUM_STEP, - WSP_GGML_LINESEARCH_MAXIMUM_ITERATIONS, - WSP_GGML_LINESEARCH_INVALID_PARAMETERS, + enum ggml_opt_result { + GGML_OPT_OK = 0, + GGML_OPT_DID_NOT_CONVERGE, + GGML_OPT_NO_CONTEXT, + GGML_OPT_INVALID_WOLFE, + GGML_OPT_FAIL, + + GGML_LINESEARCH_FAIL = -128, + GGML_LINESEARCH_MINIMUM_STEP, + GGML_LINESEARCH_MAXIMUM_STEP, + GGML_LINESEARCH_MAXIMUM_ITERATIONS, + GGML_LINESEARCH_INVALID_PARAMETERS, }; + typedef void (*ggml_opt_callback)(void * data, float * sched); + // optimization parameters // - // see ggml.c (wsp_ggml_opt_default_params) for default values + // see ggml.c (ggml_opt_default_params) for default values // - struct wsp_ggml_opt_params { - enum wsp_ggml_opt_type type; + struct ggml_opt_params { + enum ggml_opt_type type; int n_threads; @@ -1380,12 +1727,14 @@ extern "C" { float sched; // schedule multiplier (fixed, decay or warmup) float decay; // weight decay for AdamW, use 0.0f to disable + int decay_min_ndim; // minimum number of tensor dimension to apply weight decay float alpha; // learning rate float beta1; float beta2; float eps; // epsilon for numerical stability float eps_f; // epsilon for convergence test float eps_g; // epsilon for convergence test + float gclip; // gradient clipping } adam; // LBFGS parameters @@ -1400,44 +1749,42 @@ extern "C" { float min_step; float max_step; - enum 
wsp_ggml_linesearch linesearch; + enum ggml_linesearch linesearch; } lbfgs; }; - struct wsp_ggml_opt_context { - struct wsp_ggml_context * ctx; - struct wsp_ggml_opt_params params; + struct ggml_opt_context { + struct ggml_context * ctx; + struct ggml_opt_params params; int iter; int64_t nx; // number of parameter elements bool just_initialized; + float loss_before; + float loss_after; + struct { - struct wsp_ggml_tensor * x; // view of the parameters - struct wsp_ggml_tensor * g1; // gradient - struct wsp_ggml_tensor * g2; // gradient squared - struct wsp_ggml_tensor * m; // first moment - struct wsp_ggml_tensor * v; // second moment - struct wsp_ggml_tensor * mh; // first moment hat - struct wsp_ggml_tensor * vh; // second moment hat - struct wsp_ggml_tensor * pf; // past function values + struct ggml_tensor * m; // first moment + struct ggml_tensor * v; // second moment + struct ggml_tensor * pf; // past function values float fx_best; float fx_prev; int n_no_improvement; } adam; struct { - struct wsp_ggml_tensor * x; // current parameters - struct wsp_ggml_tensor * xp; // previous parameters - struct wsp_ggml_tensor * g; // current gradient - struct wsp_ggml_tensor * gp; // previous gradient - struct wsp_ggml_tensor * d; // search direction - struct wsp_ggml_tensor * pf; // past function values - struct wsp_ggml_tensor * lmal; // the L-BFGS memory alpha - struct wsp_ggml_tensor * lmys; // the L-BFGS memory ys - struct wsp_ggml_tensor * lms; // the L-BFGS memory s - struct wsp_ggml_tensor * lmy; // the L-BFGS memory y + struct ggml_tensor * x; // current parameters + struct ggml_tensor * xp; // previous parameters + struct ggml_tensor * g; // current gradient + struct ggml_tensor * gp; // previous gradient + struct ggml_tensor * d; // search direction + struct ggml_tensor * pf; // past function values + struct ggml_tensor * lmal; // the L-BFGS memory alpha + struct ggml_tensor * lmys; // the L-BFGS memory ys + struct ggml_tensor * lms; // the L-BFGS memory s + struct ggml_tensor * lmy; // the L-BFGS memory y float fx_best; float step; int j; @@ -1447,95 +1794,222 @@ extern "C" { } lbfgs; }; - WSP_GGML_API struct wsp_ggml_opt_params wsp_ggml_opt_default_params(enum wsp_ggml_opt_type type); + GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); // optimize the function defined by the tensor f - WSP_GGML_API enum wsp_ggml_opt_result wsp_ggml_opt( - struct wsp_ggml_context * ctx, - struct wsp_ggml_opt_params params, - struct wsp_ggml_tensor * f); + GGML_API enum ggml_opt_result ggml_opt( + struct ggml_context * ctx, + struct ggml_opt_params params, + struct ggml_tensor * f); // initialize optimizer context - WSP_GGML_API void wsp_ggml_opt_init( - struct wsp_ggml_context * ctx, - struct wsp_ggml_opt_context * opt, - struct wsp_ggml_opt_params params, - int64_t nx); + GGML_API void ggml_opt_init( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_opt_params params, + int64_t nx); // continue optimizing the function defined by the tensor f - WSP_GGML_API enum wsp_ggml_opt_result wsp_ggml_opt_resume( - struct wsp_ggml_context * ctx, - struct wsp_ggml_opt_context * opt, - struct wsp_ggml_tensor * f); + GGML_API enum ggml_opt_result ggml_opt_resume( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f); // continue optimizing the function defined by the tensor f - WSP_GGML_API enum wsp_ggml_opt_result wsp_ggml_opt_resume_g( - struct wsp_ggml_context * ctx, - struct wsp_ggml_opt_context * opt, - struct 
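A hedged sketch of driving the optimizer through the renamed entry points; only n_threads is set here and its value is illustrative, f stands for a scalar loss tensor built from ggml ops.

    static void opt_example(struct ggml_context * ctx, struct ggml_tensor * f) {
        struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
        params.n_threads = 4;   // illustrative

        enum ggml_opt_result res = ggml_opt(ctx, params, f);
        if (res != GGML_OPT_OK) {
            // e.g. GGML_OPT_DID_NOT_CONVERGE or one of the GGML_LINESEARCH_* error codes
        }
    }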
wsp_ggml_tensor * f, - struct wsp_ggml_cgraph * gf, - struct wsp_ggml_cgraph * gb); + GGML_API enum ggml_opt_result ggml_opt_resume_g( + struct ggml_context * ctx, + struct ggml_opt_context * opt, + struct ggml_tensor * f, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data); // // quantization // - WSP_GGML_API size_t wsp_ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); - WSP_GGML_API size_t wsp_ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); - WSP_GGML_API size_t wsp_ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); - WSP_GGML_API size_t wsp_ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); - WSP_GGML_API size_t wsp_ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); + + GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + + // + // gguf + // + + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_UINT64 = 10, + GGUF_TYPE_INT64 = 11, + GGUF_TYPE_FLOAT64 = 12, + GGUF_TYPE_COUNT, // marks the end of the enum + }; + + struct gguf_context; + + struct gguf_init_params { + bool no_alloc; + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; + }; + + GGML_API struct gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + //GGML_API struct gguf_context * gguf_init_from_buffer(..); + + GGML_API void gguf_free(struct gguf_context * ctx); + + GGML_API const char * gguf_type_name(enum gguf_type type); + + GGML_API int gguf_get_version (const struct gguf_context * ctx); + GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); + GGML_API void * gguf_get_data (const struct gguf_context * ctx); + + GGML_API int gguf_get_n_kv(const struct gguf_context * ctx); + GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key); + GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i); + + GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i); + GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i); + + // results are undefined if the wrong type is used for the key + GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i); + GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i); + GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i); + GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i); + GGML_API uint32_t gguf_get_val_u32 (const struct 
gguf_context * ctx, int i); + GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i); + GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i); + GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i); + GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i); + GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i); + GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i); + GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i); + GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i); + + GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name); + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); + + // overrides existing values or adds a new one + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); + GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); + GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); + GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); + GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); + GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); + GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); + GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); + GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); + GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); + GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); + GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n); + + // set or add KV pairs from another context + GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); + + // manage tensor info + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); + GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size); + + // writing gguf files can be done in 2 ways: + // + // - write the entire gguf_context to a binary file in a single pass: + // + // gguf_write_to_file(ctx, fname); + // + // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: + // + // FILE * f = fopen(fname, "wb"); + // fseek(f, gguf_get_meta_size(ctx), SEEK_SET); + // fwrite(f, ...); + // void * data = gguf_meta_get_meta_data(ctx); + // fseek(f, 0, SEEK_SET); + // fwrite(f, data, gguf_get_meta_size(ctx)); + // 
free(data); + // fclose(f); + // - WSP_GGML_API size_t wsp_ggml_quantize_chunk(enum wsp_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + // write the entire context to a binary file + GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); + + // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); + GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); // // system info // - WSP_GGML_API int wsp_ggml_cpu_has_avx (void); - WSP_GGML_API int wsp_ggml_cpu_has_avx2 (void); - WSP_GGML_API int wsp_ggml_cpu_has_avx512 (void); - WSP_GGML_API int wsp_ggml_cpu_has_avx512_vbmi(void); - WSP_GGML_API int wsp_ggml_cpu_has_avx512_vnni(void); - WSP_GGML_API int wsp_ggml_cpu_has_fma (void); - WSP_GGML_API int wsp_ggml_cpu_has_neon (void); - WSP_GGML_API int wsp_ggml_cpu_has_arm_fma (void); - WSP_GGML_API int wsp_ggml_cpu_has_f16c (void); - WSP_GGML_API int wsp_ggml_cpu_has_fp16_va (void); - WSP_GGML_API int wsp_ggml_cpu_has_wasm_simd (void); - WSP_GGML_API int wsp_ggml_cpu_has_blas (void); - WSP_GGML_API int wsp_ggml_cpu_has_cublas (void); - WSP_GGML_API int wsp_ggml_cpu_has_clblast (void); - WSP_GGML_API int wsp_ggml_cpu_has_gpublas (void); - WSP_GGML_API int wsp_ggml_cpu_has_sse3 (void); - WSP_GGML_API int wsp_ggml_cpu_has_ssse3 (void); - WSP_GGML_API int wsp_ggml_cpu_has_vsx (void); + GGML_API int ggml_cpu_has_avx (void); + GGML_API int ggml_cpu_has_avx2 (void); + GGML_API int ggml_cpu_has_avx512 (void); + GGML_API int ggml_cpu_has_avx512_vbmi(void); + GGML_API int ggml_cpu_has_avx512_vnni(void); + GGML_API int ggml_cpu_has_fma (void); + GGML_API int ggml_cpu_has_neon (void); + GGML_API int ggml_cpu_has_arm_fma (void); + GGML_API int ggml_cpu_has_metal (void); + GGML_API int ggml_cpu_has_f16c (void); + GGML_API int ggml_cpu_has_fp16_va (void); + GGML_API int ggml_cpu_has_wasm_simd (void); + GGML_API int ggml_cpu_has_blas (void); + GGML_API int ggml_cpu_has_cublas (void); + GGML_API int ggml_cpu_has_clblast (void); + GGML_API int ggml_cpu_has_gpublas (void); + GGML_API int ggml_cpu_has_sse3 (void); + GGML_API int ggml_cpu_has_ssse3 (void); + GGML_API int ggml_cpu_has_vsx (void); // // Internal types and functions exposed for tests and benchmarks // #ifdef __cplusplus - // restrict not standard in C++ -#define WSP_GGML_RESTRICT +// restrict not standard in C++ +#define GGML_RESTRICT #else -#define WSP_GGML_RESTRICT restrict +#define GGML_RESTRICT restrict #endif - typedef void (*dequantize_row_q_t)(const void * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int k); - typedef void (*quantize_row_q_t) (const float * WSP_GGML_RESTRICT x, void * WSP_GGML_RESTRICT y, int k); - typedef void (*vec_dot_q_t) (const int n, float * WSP_GGML_RESTRICT s, const void * WSP_GGML_RESTRICT x, const void * WSP_GGML_RESTRICT y); + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef struct { - dequantize_row_q_t dequantize_row_q; - quantize_row_q_t quantize_row_q; - quantize_row_q_t quantize_row_q_reference; - quantize_row_q_t quantize_row_q_dot; - vec_dot_q_t vec_dot_q; - enum wsp_ggml_type vec_dot_type; - } quantize_fns_t; - - 
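A hedged sketch of reading a GGUF file with the new gguf_* API declared above; "model.gguf" is a placeholder path, no_alloc=true keeps it to metadata only, and <stdio.h> is assumed for the prints.

    static void gguf_example(void) {
        struct ggml_context * meta = NULL;

        struct gguf_init_params params = {
            /*.no_alloc =*/ true,   // read metadata only, do not allocate tensor data
            /*.ctx      =*/ &meta,
        };

        struct gguf_context * g = gguf_init_from_file("model.gguf", params);
        if (g == NULL) {
            return;
        }

        const int n_kv = gguf_get_n_kv(g);
        for (int i = 0; i < n_kv; i++) {
            printf("kv: %s (%s)\n", gguf_get_key(g, i), gguf_type_name(gguf_get_kv_type(g, i)));
        }

        const int n_tensors = gguf_get_n_tensors(g);
        for (int i = 0; i < n_tensors; i++) {
            printf("tensor: %s @ %zu\n", gguf_get_tensor_name(g, i), gguf_get_tensor_offset(g, i));
        }

        gguf_free(g);
        if (meta != NULL) {
            ggml_free(meta);
        }
    }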
quantize_fns_t wsp_ggml_internal_get_quantize_fn(size_t i); + const char * type_name; + int blck_size; + size_t type_size; + bool is_quantized; + ggml_to_float_t to_float; + ggml_from_float_t from_float; + ggml_from_float_t from_float_reference; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + } ggml_type_traits_t; + + ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); #ifdef __cplusplus } -#endif +#endif \ No newline at end of file diff --git a/cpp/whisper.cpp b/cpp/whisper.cpp index be83206..2f0f58f 100644 --- a/cpp/whisper.cpp +++ b/cpp/whisper.cpp @@ -3,11 +3,16 @@ #include "coreml/whisper-encoder.h" #endif -#if WHISPER_USE_OPENVINO +#ifdef GGML_USE_METAL +# include "ggml-metal.h" +#endif + +#ifdef WHISPER_USE_OPENVINO #include "openvino/whisper-openvino-encoder.h" #endif #include "ggml.h" +#include "ggml-alloc.h" #include #include @@ -18,17 +23,19 @@ #include #include #include +#include #include #include #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if defined(WSP_GGML_BIG_ENDIAN) +#if defined(GGML_BIG_ENDIAN) #include template @@ -42,28 +49,28 @@ float byteswap(float value) { } template -static void byteswap_tensor_data(wsp_ggml_tensor * tensor) { +static void byteswap_tensor_data(ggml_tensor * tensor) { T * datum = reinterpret_cast(tensor->data); - for (int i = 0; i < wsp_ggml_nelements(tensor); i++) { + for (int i = 0; i < ggml_nelements(tensor); i++) { datum[i] = byteswap(datum[i]); } } -static void byteswap_tensor(wsp_ggml_tensor * tensor) { +static void byteswap_tensor(ggml_tensor * tensor) { switch (tensor->type) { - case WSP_GGML_TYPE_I16: { + case GGML_TYPE_I16: { byteswap_tensor_data(tensor); break; } - case WSP_GGML_TYPE_F16: { - byteswap_tensor_data(tensor); + case GGML_TYPE_F16: { + byteswap_tensor_data(tensor); break; } - case WSP_GGML_TYPE_I32: { + case GGML_TYPE_I32: { byteswap_tensor_data(tensor); break; } - case WSP_GGML_TYPE_F32: { + case GGML_TYPE_F32: { byteswap_tensor_data(tensor); break; } @@ -114,8 +121,58 @@ static void byteswap_tensor(wsp_ggml_tensor * tensor) { //#define WHISPER_USE_FLASH_FF #define WHISPER_MAX_DECODERS 16 -#define WHISPER_USE_SCRATCH -#define WHISPER_MAX_SCRATCH_BUFFERS 16 +// +// ggml helpers +// + +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad" +// the idea is to represent the original matrix multiplication: +// +// Z = X @ Y +// +// with the sum of two matrix multiplications: +// +// Z = (X_0 @ Y_0) + (X_1 @ Y_1) +// +// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad" +// and X_1 and Y_1 are the remaining views. 
X_1 and Y_1 end up being small matrices that can be processed with more +// general-purpose kernels +// +static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) { + // use padding only if dimension 0 is at least 8 times larger than the padding + // else we won't get much benefit from the optimization + const int n_pad_req = 8; + + if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) { + return ggml_mul_mat(ctx, x, y); + } + + struct ggml_tensor * x_0 = ggml_view_3d(ctx, x, (x->ne[0]/pad)*pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0); + struct ggml_tensor * x_1 = ggml_view_3d(ctx, x, x->ne[0]%pad, x->ne[1], x->ne[2], x->nb[1], x->nb[2], x_0->ne[0]*x_0->nb[0]); + + struct ggml_tensor * y_0 = ggml_view_3d(ctx, y, (y->ne[0]/pad)*pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0); + struct ggml_tensor * y_1 = ggml_view_3d(ctx, y, y->ne[0]%pad, y->ne[1], y->ne[2], y->nb[1], y->nb[2], y_0->ne[0]*y_0->nb[0]); + + return ggml_add(ctx, + ggml_mul_mat(ctx, x_0, y_0), + ggml_mul_mat(ctx, x_1, y_1)); +} + +// TODO: check if other platforms can benefit from this optimization +#if defined(GGML_USE_METAL) +#define ggml_mul_mat ggml_mul_mat_pad +#endif // available whisper models enum e_model { @@ -231,40 +288,9 @@ static const std::map> g_lang = { static const size_t MB = 1ull*1024*1024; -static const std::map MEM_REQ_SCRATCH0 = { - { MODEL_TINY, 62ull*MB }, - { MODEL_BASE, 80ull*MB }, - { MODEL_SMALL, 120ull*MB }, - { MODEL_MEDIUM, 158ull*MB }, - { MODEL_LARGE, 198ull*MB }, -}; - -static const std::map MEM_REQ_SCRATCH1 = { - { MODEL_TINY, 18ull*MB }, - { MODEL_BASE, 24ull*MB }, - { MODEL_SMALL, 36ull*MB }, - { MODEL_MEDIUM, 48ull*MB }, - { MODEL_LARGE, 60ull*MB }, -}; - -static const std::map MEM_REQ_SCRATCH2 = { - { MODEL_TINY, 4ull*MB }, - { MODEL_BASE, 4ull*MB }, - { MODEL_SMALL, 6ull*MB }, - { MODEL_MEDIUM, 7ull*MB }, - { MODEL_LARGE, 9ull*MB }, -}; - -static const std::map MEM_REQ_SCRATCH3 = { - { MODEL_TINY, 4ull*MB }, - { MODEL_BASE, 4ull*MB }, - { MODEL_SMALL, 6ull*MB }, - { MODEL_MEDIUM, 7ull*MB }, - { MODEL_LARGE, 9ull*MB }, -}; - -static const std::map> MEM_REQ_MODEL = { - { WSP_GGML_TYPE_F32, +// TODO: avoid using GGUF +static const std::map> MEM_REQ_MODEL = { + { GGML_TYPE_F32, { { MODEL_TINY, 74ull*MB }, { MODEL_BASE, 142ull*MB }, @@ -273,7 +299,7 @@ static const std::map> MEM_REQ_MODEL = { MODEL_LARGE, 2952ull*MB }, }, }, - { WSP_GGML_TYPE_F16, + { GGML_TYPE_F16, { { MODEL_TINY, 74ull*MB }, { MODEL_BASE, 142ull*MB }, @@ -282,7 +308,7 @@ static const std::map> MEM_REQ_MODEL = { MODEL_LARGE, 2952ull*MB }, }, }, - { WSP_GGML_TYPE_Q4_0, + { GGML_TYPE_Q4_0, { { MODEL_TINY, 26ull*MB }, { MODEL_BASE, 50ull*MB }, @@ -291,7 +317,7 @@ static const std::map> MEM_REQ_MODEL = { MODEL_LARGE, 940ull*MB }, }, }, - { WSP_GGML_TYPE_Q4_1, + { GGML_TYPE_Q4_1, { { MODEL_TINY, 32ull*MB }, { MODEL_BASE, 58ull*MB }, @@ -300,7 +326,7 @@ static const std::map> MEM_REQ_MODEL = { MODEL_LARGE, 1124ull*MB }, }, }, - { WSP_GGML_TYPE_Q5_0, + { GGML_TYPE_Q5_0, { { MODEL_TINY, 30ull*MB }, { MODEL_BASE, 54ull*MB }, @@ -309,7 +335,7 @@ static const std::map> MEM_REQ_MODEL = { MODEL_LARGE, 1034ull*MB }, }, }, - { WSP_GGML_TYPE_Q5_1, + { GGML_TYPE_Q5_1, { { MODEL_TINY, 32ull*MB }, { MODEL_BASE, 58ull*MB }, @@ -318,7 +344,7 @@ static const std::map> MEM_REQ_MODEL = { MODEL_LARGE, 1124ull*MB }, }, }, - { WSP_GGML_TYPE_Q8_0, + { GGML_TYPE_Q8_0, { { MODEL_TINY, 45ull*MB }, { MODEL_BASE, 84ull*MB }, @@ -329,38 +355,6 @@ static const std::map> 
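The split that ggml_mul_mat_pad applies to 3-D tensor views can be seen on a plain dot product: the part of dimension 0 that is divisible by "pad" goes to the fast kernel, the small remainder to a general-purpose loop. A standalone sketch, not part of the patch (function name and pad default are illustrative):

static float dot_split(const float * x, const float * y, int n, int pad = 32) {
    const int n0 = (n / pad) * pad;                    // padded part, plays the role of X_0 @ Y_0
    float sum = 0.0f;
    for (int i = 0;  i < n0; ++i) sum += x[i] * y[i];  // bulk of the work, pad-aligned
    for (int i = n0; i < n;  ++i) sum += x[i] * y[i];  // small remainder, X_1 @ Y_1
    return sum;
}

With pad = 32 and n = 1500 (the audio context), this is 1472 elements in the padded part and 28 in the remainder; the split is skipped entirely when n % pad == 0 or when n / pad < 8, mirroring the early return in ggml_mul_mat_pad above.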
MEM_REQ_MODEL = }, }; -static const std::map MEM_REQ_KV_SELF = { - { MODEL_TINY, 3ull*MB }, - { MODEL_BASE, 6ull*MB }, - { MODEL_SMALL, 16ull*MB }, - { MODEL_MEDIUM, 43ull*MB }, - { MODEL_LARGE, 71ull*MB }, -}; - -static const std::map MEM_REQ_KV_CROSS = { - { MODEL_TINY, 9ull*MB }, - { MODEL_BASE, 18ull*MB }, - { MODEL_SMALL, 53ull*MB }, - { MODEL_MEDIUM, 141ull*MB }, - { MODEL_LARGE, 235ull*MB }, -}; - -static const std::map MEM_REQ_ENCODE = { - { MODEL_TINY, 30ull*MB }, - { MODEL_BASE, 38ull*MB }, - { MODEL_SMALL, 56ull*MB }, - { MODEL_MEDIUM, 74ull*MB }, - { MODEL_LARGE, 94ull*MB }, -}; - -static const std::map MEM_REQ_DECODE = { - { MODEL_TINY, 3ull*MB }, - { MODEL_BASE, 5ull*MB }, - { MODEL_SMALL, 10ull*MB }, - { MODEL_MEDIUM, 18ull*MB }, - { MODEL_LARGE, 27ull*MB }, -}; - struct whisper_mel { int n_len; int n_len_org; @@ -441,101 +435,103 @@ struct whisper_hparams { int32_t n_text_layer = 4; int32_t n_mels = 80; int32_t ftype = 1; + float eps = 1e-5f; }; // audio encoding layer struct whisper_layer_encoder { // encoder.blocks.*.attn_ln - struct wsp_ggml_tensor * attn_ln_0_w; - struct wsp_ggml_tensor * attn_ln_0_b; + struct ggml_tensor * attn_ln_0_w; + struct ggml_tensor * attn_ln_0_b; // encoder.blocks.*.attn.out - struct wsp_ggml_tensor * attn_ln_1_w; - struct wsp_ggml_tensor * attn_ln_1_b; + struct ggml_tensor * attn_ln_1_w; + struct ggml_tensor * attn_ln_1_b; // encoder.blocks.*.attn.query - struct wsp_ggml_tensor * attn_q_w; - struct wsp_ggml_tensor * attn_q_b; + struct ggml_tensor * attn_q_w; + struct ggml_tensor * attn_q_b; // encoder.blocks.*.attn.key - struct wsp_ggml_tensor * attn_k_w; + struct ggml_tensor * attn_k_w; // encoder.blocks.*.attn.value - struct wsp_ggml_tensor * attn_v_w; - struct wsp_ggml_tensor * attn_v_b; + struct ggml_tensor * attn_v_w; + struct ggml_tensor * attn_v_b; // encoder.blocks.*.mlp_ln - struct wsp_ggml_tensor * mlp_ln_w; - struct wsp_ggml_tensor * mlp_ln_b; + struct ggml_tensor * mlp_ln_w; + struct ggml_tensor * mlp_ln_b; // encoder.blocks.*.mlp.0 - struct wsp_ggml_tensor * mlp_0_w; - struct wsp_ggml_tensor * mlp_0_b; + struct ggml_tensor * mlp_0_w; + struct ggml_tensor * mlp_0_b; // encoder.blocks.*.mlp.2 - struct wsp_ggml_tensor * mlp_1_w; - struct wsp_ggml_tensor * mlp_1_b; + struct ggml_tensor * mlp_1_w; + struct ggml_tensor * mlp_1_b; }; // token decoding layer struct whisper_layer_decoder { // decoder.blocks.*.attn_ln - struct wsp_ggml_tensor * attn_ln_0_w; - struct wsp_ggml_tensor * attn_ln_0_b; + struct ggml_tensor * attn_ln_0_w; + struct ggml_tensor * attn_ln_0_b; // decoder.blocks.*.attn.out - struct wsp_ggml_tensor * attn_ln_1_w; - struct wsp_ggml_tensor * attn_ln_1_b; + struct ggml_tensor * attn_ln_1_w; + struct ggml_tensor * attn_ln_1_b; // decoder.blocks.*.attn.query - struct wsp_ggml_tensor * attn_q_w; - struct wsp_ggml_tensor * attn_q_b; + struct ggml_tensor * attn_q_w; + struct ggml_tensor * attn_q_b; // decoder.blocks.*.attn.key - struct wsp_ggml_tensor * attn_k_w; + struct ggml_tensor * attn_k_w; // decoder.blocks.*.attn.value - struct wsp_ggml_tensor * attn_v_w; - struct wsp_ggml_tensor * attn_v_b; + struct ggml_tensor * attn_v_w; + struct ggml_tensor * attn_v_b; // decoder.blocks.*.cross_attn_ln - struct wsp_ggml_tensor * cross_attn_ln_0_w; - struct wsp_ggml_tensor * cross_attn_ln_0_b; + struct ggml_tensor * cross_attn_ln_0_w; + struct ggml_tensor * cross_attn_ln_0_b; // decoder.blocks.*.cross_attn.out - struct wsp_ggml_tensor * cross_attn_ln_1_w; - struct wsp_ggml_tensor * cross_attn_ln_1_b; + struct ggml_tensor * 
cross_attn_ln_1_w; + struct ggml_tensor * cross_attn_ln_1_b; // decoder.blocks.*.cross_attn.query - struct wsp_ggml_tensor * cross_attn_q_w; - struct wsp_ggml_tensor * cross_attn_q_b; + struct ggml_tensor * cross_attn_q_w; + struct ggml_tensor * cross_attn_q_b; // decoder.blocks.*.cross_attn.key - struct wsp_ggml_tensor * cross_attn_k_w; + struct ggml_tensor * cross_attn_k_w; // decoder.blocks.*.cross_attn.value - struct wsp_ggml_tensor * cross_attn_v_w; - struct wsp_ggml_tensor * cross_attn_v_b; + struct ggml_tensor * cross_attn_v_w; + struct ggml_tensor * cross_attn_v_b; // decoder.blocks.*.mlp_ln - struct wsp_ggml_tensor * mlp_ln_w; - struct wsp_ggml_tensor * mlp_ln_b; + struct ggml_tensor * mlp_ln_w; + struct ggml_tensor * mlp_ln_b; // decoder.blocks.*.mlp.0 - struct wsp_ggml_tensor * mlp_0_w; - struct wsp_ggml_tensor * mlp_0_b; + struct ggml_tensor * mlp_0_w; + struct ggml_tensor * mlp_0_b; // decoder.blocks.*.mlp.2 - struct wsp_ggml_tensor * mlp_1_w; - struct wsp_ggml_tensor * mlp_1_b; + struct ggml_tensor * mlp_1_w; + struct ggml_tensor * mlp_1_b; }; struct whisper_kv_cache { - struct wsp_ggml_tensor * k; - struct wsp_ggml_tensor * v; + struct ggml_tensor * k; + struct ggml_tensor * v; - struct wsp_ggml_context * ctx; + struct ggml_context * ctx; + // buf points to the memory allocated for both ggml_tensor 'k' and 'v' (see kv_cache_init) std::vector buf; int n; // number of tokens currently in the cache @@ -548,42 +544,42 @@ struct whisper_model { whisper_filters filters; // encoder.positional_embedding - struct wsp_ggml_tensor * e_pe; + struct ggml_tensor * e_pe; // encoder.conv1 - struct wsp_ggml_tensor * e_conv_1_w; - struct wsp_ggml_tensor * e_conv_1_b; + struct ggml_tensor * e_conv_1_w; + struct ggml_tensor * e_conv_1_b; // encoder.conv2 - struct wsp_ggml_tensor * e_conv_2_w; - struct wsp_ggml_tensor * e_conv_2_b; + struct ggml_tensor * e_conv_2_w; + struct ggml_tensor * e_conv_2_b; // encoder.ln_post - struct wsp_ggml_tensor * e_ln_w; - struct wsp_ggml_tensor * e_ln_b; + struct ggml_tensor * e_ln_w; + struct ggml_tensor * e_ln_b; // decoder.positional_embedding - struct wsp_ggml_tensor * d_pe; + struct ggml_tensor * d_pe; // decoder.token_embedding - struct wsp_ggml_tensor * d_te; + struct ggml_tensor * d_te; // decoder.ln - struct wsp_ggml_tensor * d_ln_w; - struct wsp_ggml_tensor * d_ln_b; + struct ggml_tensor * d_ln_w; + struct ggml_tensor * d_ln_b; std::vector layers_encoder; std::vector layers_decoder; // context - struct wsp_ggml_context * ctx; + struct ggml_context * ctx; // the model memory buffer is read-only and can be shared between processors std::vector * buf; // tensors int n_loaded; - std::map tensors; + std::map tensors; }; struct whisper_sequence { @@ -601,7 +597,7 @@ struct whisper_sequence { // TAGS: WHISPER_DECODER_INIT struct whisper_decoder { - // each decoders keeps its own KV-cache + // each decoder keeps its own KV-cache whisper_kv_cache kv_self; // the currently generated sequence of tokens @@ -621,15 +617,75 @@ struct whisper_decoder { std::vector tokens_tmp; // used for whisper_decode calls }; +// replace std::pair by using customized pair struct (reason: std::pair is very slow) +template +struct whisper_pair { + A first; + B second; + + // Define a constructor that takes two arguments. + whisper_pair(const A& a, const B& b) : first(a), second(b) {} + // Define a constructor that takes no argument. 
+ whisper_pair() : first(A()), second(B()) {} +}; + +// beam-search helpers +struct kv_buf { + std::vector k; + std::vector v; +}; + +// ggml_allocr wrapper for whisper usage +struct whisper_allocr { + ggml_allocr * alloc = nullptr; + + std::vector meta; + std::vector data; +}; + +static size_t whisper_allocr_size(struct whisper_allocr & allocr) { + return allocr.meta.size() + allocr.data.size(); +} + +// measure the memory usage of a graph and prepare the allocr's internal data buffer +static void whisper_allocr_graph_init(struct whisper_allocr & allocr, std::function && get_graph) { + const int tensor_alignment = 32; + + auto & alloc = allocr.alloc; + auto & meta = allocr.meta; + auto & data = allocr.data; + + meta.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); + + alloc = ggml_allocr_new_measure(tensor_alignment); + + const size_t alloc_size = ggml_allocr_alloc_graph(alloc, get_graph()) + tensor_alignment; + + ggml_allocr_free(alloc); + + data.resize(alloc_size); + + alloc = ggml_allocr_new(data.data(), data.size(), tensor_alignment); +} + +static void whisper_allocr_free(struct whisper_allocr & allocr) { + if (allocr.alloc) { + ggml_allocr_free(allocr.alloc); + allocr.alloc = nullptr; + } +} + struct whisper_state { int64_t t_sample_us = 0; int64_t t_encode_us = 0; int64_t t_decode_us = 0; + int64_t t_prompt_us = 0; int64_t t_mel_us = 0; int32_t n_sample = 0; // number of tokens sampled int32_t n_encode = 0; // number of encoder calls - int32_t n_decode = 0; // number of decoder calls + int32_t n_decode = 0; // number of decoder calls with n_tokens == 1 (text-generation) + int32_t n_prompt = 0; // number of decoder calls with n_tokens > 1 (prompt encoding) int32_t n_fail_p = 0; // number of logprob threshold failures int32_t n_fail_h = 0; // number of entropy threshold failures @@ -640,12 +696,23 @@ struct whisper_state { whisper_decoder decoders[WHISPER_MAX_DECODERS] = {}; - // memory buffers used by encode / decode contexts - std::vector buf_compute; - std::vector buf_scratch[WHISPER_MAX_SCRATCH_BUFFERS]; + // buffer for swapping KV caches between decoders during beam-search + std::vector kv_swap_bufs; + + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; - int buf_last = 0; - size_t buf_max_size[WHISPER_MAX_SCRATCH_BUFFERS] = { 0 }; + // ggml-alloc: + // - stores meta info about the intermediate tensors into the `meta` buffers + // - stores the actual tensor data into the `data` buffers + whisper_allocr alloc_conv; + whisper_allocr alloc_encode; + whisper_allocr alloc_cross; + whisper_allocr alloc_decode; + + // result of the encoder + struct ggml_tensor * embd_conv = nullptr; + struct ggml_tensor * embd_enc = nullptr; // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; @@ -654,7 +721,7 @@ struct whisper_state { std::vector prompt_past; // work container used to avoid memory allocations - std::vector> logits_id; + std::vector> logits_id; mutable std::mt19937 rng; // used for sampling at t > 0.0 @@ -665,6 +732,10 @@ struct whisper_state { whisper_coreml_context * ctx_coreml = nullptr; #endif +#ifdef GGML_USE_METAL + ggml_metal_context * ctx_metal = nullptr; +#endif + #ifdef WHISPER_USE_OPENVINO whisper_openvino_context * ctx_openvino = nullptr; #endif @@ -677,45 +748,14 @@ struct whisper_state { // [EXPERIMENTAL] speed-up techniques int32_t exp_n_audio_ctx = 0; // 0 - use default - - void use_buf(struct wsp_ggml_context * ctx, int i) { -#if defined(WHISPER_USE_SCRATCH) - size_t last_size = 0; - - 
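whisper_allocr_graph_init above runs each graph once through a measure allocator to size the data buffer. A sketch (not from the patch, helper name illustrative, assuming the surrounding whisper.cpp context) of how such an allocator is then used on every evaluation, relying on the ggml_allocr_* API and the ggml_graph_compute_helper defined earlier in this file:

template <typename GraphBuilder>
static void run_with_allocr(whisper_allocr & allocr, GraphBuilder && build_graph,
                            std::vector<uint8_t> & work_buffer, int n_threads) {
    ggml_allocr_reset(allocr.alloc);            // drop the previous run's tensor placements
    ggml_cgraph * gf = build_graph();           // graph tensors are created with no_alloc = true
    ggml_allocr_alloc_graph(allocr.alloc, gf);  // place all intermediates inside allocr.data
    ggml_graph_compute_helper(work_buffer, gf, n_threads);
}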
if (i == -1) { - last_size = wsp_ggml_set_scratch(ctx, { 0, 0, nullptr, }); - } else { - auto & buf = buf_scratch[i]; - last_size = wsp_ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), }); - } - - if (buf_last >= 0) { - buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size); - } - - buf_last = i; -#else - (void) i; - (void) ctx; -#endif - } - - size_t get_buf_max_mem(int i) const { -#if defined(WHISPER_USE_SCRATCH) - return buf_max_size[i]; -#else - (void) i; - return 0; -#endif - } }; struct whisper_context { int64_t t_load_us = 0; int64_t t_start_us = 0; - wsp_ggml_type wtype = wsp_ggml_type::WSP_GGML_TYPE_F16; // weight type (FP32 / FP16 / QX) - wsp_ggml_type itype = wsp_ggml_type::WSP_GGML_TYPE_F16; // intermediate type (FP32 or FP16) + ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX) + ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16) whisper_model model; whisper_vocab vocab; @@ -730,6 +770,13 @@ static void whisper_default_log(const char * text) { static whisper_log_callback whisper_log = whisper_default_log; +#ifdef __GNUC__ +#ifdef __MINGW32__ +__attribute__((gnu_format(printf, 1, 2))) +#else +__attribute__((format(printf, 1, 2))) +#endif +#endif static void log(const char * fmt, ...) { if (!whisper_log) return; char buf[1024]; @@ -747,33 +794,34 @@ static void read_safe(whisper_model_loader * loader, T & dest) { static bool kv_cache_init( const struct whisper_hparams & hparams, - const size_t mem_bytes, struct whisper_kv_cache & cache, - wsp_ggml_type wtype, + ggml_type wtype, int n_ctx) { + const int64_t n_text_state = hparams.n_text_state; + const int64_t n_text_layer = hparams.n_text_layer; + + const int64_t n_mem = n_text_layer*n_ctx; + const int64_t n_elements = n_text_state*n_mem; + + const size_t mem_bytes = 2*(ggml_type_size(wtype)*n_elements + ggml_tensor_overhead()); + cache.buf.resize(mem_bytes); - struct wsp_ggml_init_params params = { + struct ggml_init_params params = { /*.mem_size =*/ cache.buf.size(), /*.mem_buffer =*/ cache.buf.data(), /*.no_alloc =*/ false, }; - cache.ctx = wsp_ggml_init(params); + cache.ctx = ggml_init(params); if (!cache.ctx) { log("%s: failed to allocate memory for kv cache\n", __func__); return false; } - const int n_text_state = hparams.n_text_state; - const int n_text_layer = hparams.n_text_layer; - - const int n_mem = n_text_layer*n_ctx; - const int n_elements = n_text_state*n_mem; - - cache.k = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - cache.v = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); return true; } @@ -781,36 +829,36 @@ static bool kv_cache_init( static bool kv_cache_reinit(struct whisper_kv_cache & cache) { WHISPER_ASSERT(cache.ctx); - const int n_elements = wsp_ggml_nelements(cache.k); - WHISPER_ASSERT(n_elements == wsp_ggml_nelements(cache.v)); + const int n_elements = ggml_nelements(cache.k); + WHISPER_ASSERT(n_elements == ggml_nelements(cache.v)); - const wsp_ggml_type wtype = cache.k->type; + const ggml_type wtype = cache.k->type; WHISPER_ASSERT(wtype == cache.v->type); - WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*wsp_ggml_type_sizef(wtype)); + WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype)); - struct wsp_ggml_init_params params = { + struct ggml_init_params params = { /*.mem_size =*/ cache.buf.size(), /*.mem_buffer =*/ cache.buf.data(), /*.no_alloc =*/ false, }; - cache.ctx = 
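As a concrete check of the mem_bytes formula now computed inside kv_cache_init (numbers use the base-model text hyperparameters and the standard 448-token text context; not part of the patch):

    n_mem      = n_text_layer * n_ctx  = 6 * 448    = 2688
    n_elements = n_text_state * n_mem  = 512 * 2688 = 1376256
    mem_bytes  = 2 * (2 bytes (F16) * 1376256 + tensor overhead) ≈ 5.5 MB

which lines up with the ~6 MB per-decoder figure the removed MEM_REQ_KV_SELF table listed for MODEL_BASE.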
wsp_ggml_init(params); + cache.ctx = ggml_init(params); if (!cache.ctx) { log("%s: failed to allocate memory for kv cache\n", __func__); return false; } - cache.k = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - cache.v = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); return true; } static void kv_cache_free(struct whisper_kv_cache & cache) { if (cache.ctx) { - wsp_ggml_free(cache.ctx); + ggml_free(cache.ctx); cache.ctx = nullptr; } } @@ -829,7 +877,7 @@ static void kv_cache_free(struct whisper_kv_cache & cache) { static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) { log("%s: loading model\n", __func__); - const int64_t t_start_us = wsp_ggml_time_us(); + const int64_t t_start_us = ggml_time_us(); wctx.t_start_us = t_start_us; @@ -840,7 +888,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con { uint32_t magic; read_safe(loader, magic); - if (magic != WSP_GGML_FILE_MAGIC) { + if (magic != GGML_FILE_MAGIC) { log("%s: invalid model data (bad magic)\n", __func__); return false; } @@ -884,14 +932,14 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con model.type = e_model::MODEL_LARGE; } - const int32_t qntvr = hparams.ftype / WSP_GGML_QNT_VERSION_FACTOR; + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - hparams.ftype %= WSP_GGML_QNT_VERSION_FACTOR; + hparams.ftype %= GGML_QNT_VERSION_FACTOR; // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation - wctx.wtype = wsp_ggml_ftype_to_wsp_ggml_type((wsp_ggml_ftype) (model.hparams.ftype)); - if (wctx.wtype == WSP_GGML_TYPE_COUNT) { + wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wctx.wtype == GGML_TYPE_COUNT) { log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype); return false; } @@ -914,22 +962,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // print memory requirements { - // this is the total memory required to run the inference - const size_t mem_required = - MEM_REQ_SCRATCH0.at(model.type) + - MEM_REQ_SCRATCH1.at(model.type) + - MEM_REQ_SCRATCH2.at(model.type) + - MEM_REQ_SCRATCH3.at(model.type) + - scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) + - scale*MEM_REQ_KV_CROSS.at(model.type) + - scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)); - - // this is the memory required by one decoder - const size_t mem_required_decoder = - scale*MEM_REQ_KV_SELF.at(model.type); - - log("%s: mem required = %7.2f MB (+ %7.2f MB per decoder)\n", __func__, - mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0); + // TODO + //log("%s: mem required = %7.2f MB (+ %7.2f MB per decoder)\n", __func__, + // mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0); } // initialize all memory buffers @@ -1033,8 +1068,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con size_t ctx_size = 0; - const wsp_ggml_type wtype = wctx.wtype; - const wsp_ggml_type vtype = wctx.wtype == WSP_GGML_TYPE_F32 ? WSP_GGML_TYPE_F32 : WSP_GGML_TYPE_F16; // conv type + const ggml_type wtype = wctx.wtype; + const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? 
GGML_TYPE_F32 : GGML_TYPE_F16; // conv type { const auto & hparams = model.hparams; @@ -1053,92 +1088,92 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // encoder { - ctx_size += n_audio_ctx*n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_pe; + ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe; - ctx_size += 3*n_mels*n_audio_state*wsp_ggml_type_sizef(vtype); // e_conv_1_w - ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_conv_1_b + ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b - ctx_size += 3*n_audio_state*n_audio_state*wsp_ggml_type_sizef(vtype); // e_conv_2_w - ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_conv_2_b + ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b - ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_ln_w; - ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_ln_b; + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w; + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b; } // decoder { - ctx_size += n_text_ctx*n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_pe; + ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe; - ctx_size += n_vocab*n_text_state*wsp_ggml_type_sizef(wtype); // d_te; + ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te; - ctx_size += n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_ln_w; - ctx_size += n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_ln_b; + ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w; + ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b; } // encoder layers { - ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_w - ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_b + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b - ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // mlp_0_w - ctx_size += n_audio_layer*( 4*n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_0_b + ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w + ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b - ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // mlp_1_w - ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_1_b + ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b - ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_w - ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_b + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_q_w - 
ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_q_b + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_k_w + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_v_w - ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_v_b + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_ln_1_w - ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_1_b + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b } // decoder layers { - ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_w - ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_b + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b - ctx_size += n_text_layer*(4*n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // mlp_0_w - ctx_size += n_text_layer*( 4*n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_0_b + ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w + ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b - ctx_size += n_text_layer*(4*n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // mlp_1_w - ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_1_b + ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b - ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_w - ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_b + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b - ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_q_w - ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_q_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b - ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_k_w + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w - ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_v_w - ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_v_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); 
// attn_v_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b - ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_ln_1_w - ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_1_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b // - ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_0_w - ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_0_b + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b - ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_q_w - ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_q_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b - ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_k_w + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w - ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_v_w - ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_v_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b - ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_ln_1_w - ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_1_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b } ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead @@ -1148,15 +1183,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // create the ggml context { - struct wsp_ggml_init_params params = { + struct ggml_init_params params = { /*.mem_size =*/ wctx.model.buf->size(), /*.mem_buffer =*/ wctx.model.buf->data(), /*.no_alloc =*/ false, }; - model.ctx = wsp_ggml_init(params); + model.ctx = ggml_init(params); if (!model.ctx) { - log("%s: wsp_ggml_init() failed\n", __func__); + log("%s: ggml_init() failed\n", __func__); return false; } } @@ -1184,16 +1219,16 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // encoder { - model.e_pe = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, n_audio_state, n_audio_ctx); + model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx); - model.e_conv_1_w = wsp_ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state); - model.e_conv_1_b = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 1, n_audio_state); + model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state); + model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state); - model.e_conv_2_w = wsp_ggml_new_tensor_3d(ctx, vtype, 3, 
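To make the ctx_size accounting above concrete, the first encoder term alone for the tiny model (n_audio_ctx = 1500, n_audio_state = 384, F32 positional embedding) works out to:

    e_pe: 1500 * 384 * ggml_type_sizef(GGML_TYPE_F32) = 1500 * 384 * 4 bytes ≈ 2.3 MB

The per-layer terms are then repeated n_audio_layer and n_text_layer times, so the total lands roughly where the MEM_REQ_MODEL table puts it (about 74 MB for MODEL_TINY in F16). These figures are an illustration, not values taken from the patch.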
n_audio_state, n_audio_state); - model.e_conv_2_b = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 1, n_audio_state); + model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state); + model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state); - model.e_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state); - model.e_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state); + model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); // map by name model.tensors["encoder.positional_embedding"] = model.e_pe; @@ -1210,28 +1245,28 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con for (int i = 0; i < n_audio_layer; ++i) { auto & layer = model.layers_encoder[i]; - layer.mlp_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state); - layer.mlp_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state); + layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.mlp_0_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state); - layer.mlp_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 4*n_audio_state); + layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state); + layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state); - layer.mlp_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state); - layer.mlp_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state); + layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state); + layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state); - layer.attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state); + layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); - layer.attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state); + layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); - layer.attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); - layer.attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state); + layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); - layer.attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state); + layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); // map by name model.tensors["encoder.blocks." 
+ std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w; @@ -1261,12 +1296,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // decoder { - model.d_pe = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, n_text_state, n_text_ctx); + model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx); - model.d_te = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab); + model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab); - model.d_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); - model.d_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); // map by name model.tensors["decoder.positional_embedding"] = model.d_pe; @@ -1279,42 +1314,42 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con for (int i = 0; i < n_text_layer; ++i) { auto & layer = model.layers_decoder[i]; - layer.mlp_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); - layer.mlp_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.mlp_0_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state); - layer.mlp_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 4*n_text_state); + layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state); + layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state); - layer.mlp_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state); - layer.mlp_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state); + layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); - layer.attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.cross_attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); - layer.cross_attn_ln_0_b = 
wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.cross_attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.cross_attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.cross_attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.cross_attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.cross_attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.cross_attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.cross_attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state); + layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); // map by name model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w; @@ -1394,7 +1429,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con } auto tensor = model.tensors[name.data()]; - if (wsp_ggml_nelements(tensor) != nelements) { + if (ggml_nelements(tensor) != nelements) { log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n", __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]); @@ -1407,19 +1442,19 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con return false; } - const size_t bpe = wsp_ggml_type_size(wsp_ggml_type(ttype)); + const size_t bpe = ggml_type_size(ggml_type(ttype)); - if ((nelements*bpe)/wsp_ggml_blck_size(tensor->type) != wsp_ggml_nbytes(tensor)) { + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), wsp_ggml_nbytes(tensor), nelements*bpe); + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); return false; } - loader->read(loader->context, tensor->data, wsp_ggml_nbytes(tensor)); + loader->read(loader->context, tensor->data, ggml_nbytes(tensor)); BYTESWAP_TENSOR(tensor); - //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], wsp_ggml_type_name((wsp_ggml_type) ttype), wsp_ggml_nbytes(tensor)/1024.0/1024.0); - total_size += wsp_ggml_nbytes(tensor); + //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0); + total_size += ggml_nbytes(tensor); model.n_loaded++; } @@ -1433,56 +1468,63 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con } } - wctx.t_load_us = wsp_ggml_time_us() - t_start_us; + wctx.t_load_us = ggml_time_us() - t_start_us; return true; } -// evaluate the encoder with the given state -// -// given audio recording (more 
specifically, its log mel spectrogram), runs forward pass of the encoder -// part of the transformer model and returns the encoded features -// -// - wctx: the model -// - wstate: the state of the encoder -// - n_threads: number of threads to use -// - mel_offset: offset in the mel spectrogram (i.e. audio offset) -// -static bool whisper_encode_internal( - whisper_context & wctx, - whisper_state & wstate, - const int mel_offset, - const int n_threads){ +static bool whisper_encode_external(const whisper_state & wstate) { + GGML_UNUSED(wstate); - const int64_t t_start_us = wsp_ggml_time_us(); +#ifndef WHISPER_USE_COREML + const bool use_coreml = false; +#else + const bool use_coreml = wstate.ctx_coreml != nullptr; +#endif + +#ifndef WHISPER_USE_OPENVINO + const bool use_openvino = false; +#else + const bool use_openvino = wstate.ctx_openvino != nullptr; +#endif + + return use_coreml || use_openvino; +} +static struct ggml_cgraph * whisper_build_graph_conv( + whisper_context & wctx, + whisper_state & wstate, + const int mel_offset) { const auto & model = wctx.model; const auto & mel_inp = wstate.mel; const auto & hparams = model.hparams; const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx; - const int n_state = hparams.n_audio_state; - const int n_head = hparams.n_audio_head; - const int n_layer = hparams.n_audio_layer; + const int n_state = hparams.n_audio_state; GGML_UNUSED(n_state); const int n_mels = hparams.n_mels; - assert(mel_inp.n_mel == n_mels); - struct wsp_ggml_init_params params = { - /*.mem_size =*/ wstate.buf_compute.size(), - /*.mem_buffer =*/ wstate.buf_compute.data(), - /*.no_alloc =*/ false, + struct ggml_init_params params = { + /*.mem_size =*/ wstate.alloc_conv.meta.size(), + /*.mem_buffer =*/ wstate.alloc_conv.meta.data(), + /*.no_alloc =*/ true, }; - struct wsp_ggml_context * ctx0 = wsp_ggml_init(params); + struct ggml_context * ctx0 = ggml_init(params); - wstate.use_buf(ctx0, 0); + ggml_cgraph * gf = ggml_new_graph(ctx0); + + ggml_allocr * alloc = wstate.alloc_conv.alloc; + + struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels); + ggml_allocr_alloc(alloc, mel); + + assert(mel->type == GGML_TYPE_F32); + if (!ggml_allocr_is_measure(alloc)) { + assert(mel_inp.n_mel == n_mels); - struct wsp_ggml_tensor * mel = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, 2*n_ctx, n_mels); - assert(mel->type == WSP_GGML_TYPE_F32); - { float * dst = (float *) mel->data; - memset(dst, 0, wsp_ggml_nbytes(mel)); + memset(dst, 0, ggml_nbytes(mel)); const int i0 = std::min(mel_offset, mel_inp.n_len); const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len); @@ -1494,440 +1536,469 @@ static bool whisper_encode_internal( } } - struct wsp_ggml_tensor * cur; - -#ifndef WHISPER_USE_COREML - const bool use_coreml = false; -#else - const bool use_coreml = wstate.ctx_coreml != nullptr; -#endif - -#ifndef WHISPER_USE_OPENVINO - const bool use_openvino = false; -#else - const bool use_openvino = wstate.ctx_openvino != nullptr; -#endif + struct ggml_tensor * cur = nullptr; - if (!use_coreml && !use_openvino) { + if (!whisper_encode_external(wstate)) { // convolution + gelu { - wstate.use_buf(ctx0, 1); - - cur = wsp_ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1); - cur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, + cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.e_conv_1_b, cur), cur); - cur = wsp_ggml_gelu(ctx0, cur); - - wstate.use_buf(ctx0, 0); + cur = ggml_gelu(ctx0, 
cur); - cur = wsp_ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1); - cur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, + cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.e_conv_2_b, cur), cur); - cur = wsp_ggml_gelu(ctx0, cur); + cur = ggml_gelu(ctx0, cur); } - wstate.use_buf(ctx0, 3); + wstate.embd_conv = cur; + } else { +#ifdef WHISPER_USE_COREML + cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx); + ggml_allocr_alloc(alloc, cur); - // =================================================================== - // NOTE: experimenting with partial evaluation of the encoder (ignore) - //static int iter = -1; - //const int n_iter = 1500/n_ctx; + if (!ggml_allocr_is_measure(alloc)) { + whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data); + } +#endif +#ifdef WHISPER_USE_OPENVINO + cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx); + ggml_allocr_alloc(alloc, cur); - //iter = (iter + 1) % n_iter; + if (!ggml_allocr_is_measure(alloc)) { + whisper_openvino_encode(wstate.ctx_openvino, mel, cur); + } +#endif - //if (iter == 0) { - // memset(model.memory_cross_k->data, 0, wsp_ggml_nbytes(model.memory_cross_k)); - // memset(model.memory_cross_v->data, 0, wsp_ggml_nbytes(model.memory_cross_v)); - //} + wstate.embd_enc = cur; + } + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + + return gf; +} + +static struct ggml_cgraph * whisper_build_graph_encoder( + whisper_context & wctx, + whisper_state & wstate) { + const auto & model = wctx.model; + const auto & hparams = model.hparams; - static int iter = 0; + const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx; + const int n_state = hparams.n_audio_state; + const int n_head = hparams.n_audio_head; + const int n_layer = hparams.n_audio_layer; - const size_t e_pe_stride = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe); - const size_t e_pe_offset = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe)*n_ctx*iter; + struct ggml_init_params params = { + /*.mem_size =*/ wstate.alloc_encode.meta.size(), + /*.mem_buffer =*/ wstate.alloc_encode.meta.data(), + /*.no_alloc =*/ true, + }; - struct wsp_ggml_tensor * e_pe = wsp_ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset); + struct ggml_context * ctx0 = ggml_init(params); - cur = wsp_ggml_add(ctx0, e_pe, wsp_ggml_transpose(ctx0, cur)); + ggml_cgraph * gf = ggml_new_graph(ctx0); - // =================================================================== + ggml_allocr * alloc = wstate.alloc_encode.alloc; - // original: - //cur = wsp_ggml_add(ctx0, model.e_pe, wsp_ggml_transpose(ctx0, cur)); + struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(alloc, KQscale); - struct wsp_ggml_tensor * inpL = cur; + if (!ggml_allocr_is_measure(alloc)) { + ggml_set_f32(KQscale, 1.0f/sqrt(float(n_state)/n_head)); + } - for (int il = 0; il < n_layer; ++il) { - const auto & layer = model.layers_encoder[il]; + struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv); - // norm - { - wstate.use_buf(ctx0, 0); + // =================================================================== + // NOTE: experimenting with partial evaluation of the encoder (ignore) + //static int iter = -1; + //const int n_iter = 1500/n_ctx; - cur = wsp_ggml_norm(ctx0, inpL); + //iter = (iter + 1) % n_iter; - // cur = ln_0_w*cur + ln_0_b - cur = wsp_ggml_add(ctx0, - wsp_ggml_mul(ctx0, - wsp_ggml_repeat(ctx0, layer.attn_ln_0_w, 
cur), - cur), - wsp_ggml_repeat(ctx0, layer.attn_ln_0_b, cur)); - } + //if (iter == 0) { + // memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k)); + // memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v)); + //} - // self-attention - { - wstate.use_buf(ctx0, 1); + static int iter = 0; - struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0, - layer.attn_q_w, - cur); + const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe); + const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter; - Qcur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, - layer.attn_q_b, - Qcur), - Qcur); + struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset); - //Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + cur = ggml_add(ctx0, e_pe, ggml_cont(ctx0, ggml_transpose(ctx0, cur))); - // note: no bias for Key - struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0, - layer.attn_k_w, - cur); + // =================================================================== - //Kcur = wsp_ggml_scale_inplace(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + // original: + //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur)); - struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0, - layer.attn_v_w, - cur); + struct ggml_tensor * inpL = cur; - Vcur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, - layer.attn_v_b, - Vcur), - Vcur); + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers_encoder[il]; - // ------ + // norm + { + cur = ggml_norm(ctx0, inpL, hparams.eps); + + // cur = ln_0_w*cur + ln_0_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, layer.attn_ln_0_w), + layer.attn_ln_0_b); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, + layer.attn_q_w, + cur); + + Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b); - wstate.use_buf(ctx0, 0); + //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + + // note: no bias for Key + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, + layer.attn_k_w, + cur); + + //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, + layer.attn_v_w, + cur); + + Vcur = ggml_add(ctx0, Vcur, layer.attn_v_b); + + // ------ #ifdef WHISPER_USE_FLASH_ATTN - struct wsp_ggml_tensor * Q = - wsp_ggml_permute(ctx0, - wsp_ggml_cpy(ctx0, - Qcur, - wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), - 0, 2, 1, 3); - - struct wsp_ggml_tensor * K = - wsp_ggml_permute(ctx0, - wsp_ggml_cpy(ctx0, - Kcur, - wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), - 0, 2, 1, 3); - - struct wsp_ggml_tensor * V = - wsp_ggml_cpy(ctx0, - wsp_ggml_permute(ctx0, - wsp_ggml_reshape_3d(ctx0, - Vcur, - n_state/n_head, n_head, n_ctx), - 1, 2, 0, 3), - wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)); - - struct wsp_ggml_tensor * KQV = wsp_ggml_flash_attn(ctx0, Q, K, V, false); + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Kcur, + ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * V = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + Vcur, + 
n_state/n_head, n_head, n_ctx), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)); + + struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false); #else - struct wsp_ggml_tensor * Q = - wsp_ggml_permute(ctx0, - wsp_ggml_cpy(ctx0, - Qcur, - wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)), - 0, 2, 1, 3); - - struct wsp_ggml_tensor * K = - wsp_ggml_permute(ctx0, - wsp_ggml_cpy(ctx0, - Kcur, - wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), - 0, 2, 1, 3); - - // K * Q - struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q); - - struct wsp_ggml_tensor * KQ_scaled = - wsp_ggml_scale_inplace(ctx0, - KQ, - wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head)) - ); - - struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ_scaled); - - struct wsp_ggml_tensor * V = - wsp_ggml_cpy(ctx0, - wsp_ggml_permute(ctx0, - wsp_ggml_reshape_3d(ctx0, - Vcur, - n_state/n_head, n_head, n_ctx), - 1, 2, 0, 3), - wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head) - ); - - struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max); + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Kcur, + ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled); + + struct ggml_tensor * V = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + Vcur, + n_state/n_head, n_head, n_ctx), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head) + ); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); #endif - struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx)); + } - wstate.use_buf(ctx0, 1); + // projection + { + cur = ggml_mul_mat(ctx0, + layer.attn_ln_1_w, + cur); - cur = wsp_ggml_cpy(ctx0, - KQV_merged, - wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx)); - } + cur = ggml_add(ctx0, cur, layer.attn_ln_1_b); + } - // projection - { - wstate.use_buf(ctx0, 0); + // add the input + cur = ggml_add(ctx0, cur, inpL); - cur = wsp_ggml_mul_mat(ctx0, - layer.attn_ln_1_w, - cur); + struct ggml_tensor * inpFF = cur; - wstate.use_buf(ctx0, 1); + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); - cur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, layer.attn_ln_1_b, cur), - cur); + // cur = mlp_ln_w*cur + mlp_ln_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, layer.mlp_ln_w), + layer.mlp_ln_b); } - wstate.use_buf(ctx0, 2); +#ifdef WHISPER_USE_FLASH_FF + cur = ggml_flash_ff(ctx0, + ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)), + layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b); +#else + // fully connected + cur = ggml_mul_mat(ctx0, + layer.mlp_0_w, + cur); + + cur = ggml_add(ctx0, cur, layer.mlp_0_b); - // add the input - cur = wsp_ggml_add(ctx0, cur, inpL); + // GELU activation + cur = ggml_gelu(ctx0, cur); - struct wsp_ggml_tensor * inpFF = cur; + // 
projection + cur = ggml_mul_mat(ctx0, + layer.mlp_1_w, + cur); - // feed-forward network - { - // norm - { - wstate.use_buf(ctx0, 0); + cur = ggml_add(ctx0, cur, layer.mlp_1_b); +#endif + } - cur = wsp_ggml_norm(ctx0, inpFF); + inpL = ggml_add(ctx0, cur, inpFF); + } - wstate.use_buf(ctx0, 1); + cur = inpL; - // cur = mlp_ln_w*cur + mlp_ln_b - cur = wsp_ggml_add(ctx0, - wsp_ggml_mul(ctx0, - wsp_ggml_repeat(ctx0, layer.mlp_ln_w, cur), - cur), - wsp_ggml_repeat(ctx0, layer.mlp_ln_b, cur)); - } + // norm + { + cur = ggml_norm(ctx0, cur, hparams.eps); -#ifdef WHISPER_USE_FLASH_FF - wstate.use_buf(ctx0, 0); + // cur = ln_f_g*cur + ln_f_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, cur, model.e_ln_w), + model.e_ln_b); + } - cur = wsp_ggml_flash_ff(ctx0, - wsp_ggml_cpy(ctx0, cur, wsp_ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)), - layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b); -#else - wstate.use_buf(ctx0, 0); + ggml_build_forward_expand(gf, cur); - // fully connected - cur = wsp_ggml_mul_mat(ctx0, - layer.mlp_0_w, - cur); + wstate.embd_enc = cur; - wstate.use_buf(ctx0, 1); + //ggml_graph_print(gf); - cur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, layer.mlp_0_b, cur), - cur); + //////////////////////////////////////////////////////////////////////////// - wstate.use_buf(ctx0, 0); + //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__, + // ggml_used_mem(ctx0)/1024.0/1024.0, + // wstate.get_buf_max_mem(0)/1024.0/1024.0, + // wstate.get_buf_max_mem(1)/1024.0/1024.0, + // wstate.get_buf_max_mem(2)/1024.0/1024.0, + // wstate.get_buf_max_mem(3)/1024.0/1024.0); - // GELU activation - cur = wsp_ggml_gelu(ctx0, cur); + ggml_free(ctx0); - wstate.use_buf(ctx0, 1); + return gf; +} - // projection - cur = wsp_ggml_mul_mat(ctx0, - layer.mlp_1_w, - cur); +// pre-compute cross-attention memory +static struct ggml_cgraph * whisper_build_graph_cross( + whisper_context & wctx, + whisper_state & wstate) { + const auto & model = wctx.model; + const auto & hparams = model.hparams; - wstate.use_buf(ctx0, 0); + const int n_ctx = wstate.exp_n_audio_ctx > 0 ? 
wstate.exp_n_audio_ctx : hparams.n_audio_ctx; + const int n_state = hparams.n_audio_state; + const int n_head = hparams.n_audio_head; - cur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, layer.mlp_1_b, cur), - cur); -#endif - } + struct ggml_init_params params = { + /*.mem_size =*/ wstate.alloc_cross.meta.size(), + /*.mem_buffer =*/ wstate.alloc_cross.meta.data(), + /*.no_alloc =*/ true, + }; - wstate.use_buf(ctx0, 3); + struct ggml_context * ctx0 = ggml_init(params); - inpL = wsp_ggml_add(ctx0, cur, inpFF); - } + ggml_cgraph * gf = ggml_new_graph(ctx0); - cur = inpL; + ggml_allocr * alloc = wstate.alloc_cross.alloc; - // norm - { - wstate.use_buf(ctx0, 0); + struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc); - cur = wsp_ggml_norm(ctx0, cur); + struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(alloc, Kscale); - wstate.use_buf(ctx0, 1); + if (!ggml_allocr_is_measure(alloc)) { + ggml_set_f32(Kscale, pow(float(n_state) / n_head, -0.25)); + } - // cur = ln_f_g*cur + ln_f_b - cur = wsp_ggml_add(ctx0, - wsp_ggml_mul(ctx0, - wsp_ggml_repeat(ctx0, model.e_ln_w, cur), - cur), - wsp_ggml_repeat(ctx0, model.e_ln_b, cur)); - } + for (int il = 0; il < model.hparams.n_text_layer; ++il) { + auto & layer = model.layers_decoder[il]; - wstate.use_buf(ctx0, -1); + struct ggml_tensor* Kcross = ggml_mul_mat(ctx0, + layer.cross_attn_k_w, + cur); - // run the computation - { - struct wsp_ggml_cgraph gf = {}; - gf.n_threads = n_threads; + Kcross = ggml_scale(ctx0, Kcross, Kscale); - wsp_ggml_build_forward_expand(&gf, cur); - wsp_ggml_graph_compute(ctx0, &gf); + struct ggml_tensor* Vcross = ggml_mul_mat(ctx0, + layer.cross_attn_v_w, + cur); - //wsp_ggml_graph_print(&gf); - } + Vcross = ggml_add(ctx0, + Vcross, + layer.cross_attn_v_b); + + Vcross = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx)); + + struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, + n_state*n_ctx, + (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx)); + + struct ggml_tensor * v = ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state, + ( n_ctx)*ggml_element_size(wstate.kv_cross.v), + (il*n_ctx)*ggml_element_size(wstate.kv_cross.v)*n_state); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcross, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcross, v)); } -#ifdef WHISPER_USE_COREML - else if (use_coreml) { - wstate.use_buf(ctx0, -1); - cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx); + //ggml_graph_print(gf); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the encoder with the given state +// +// given audio recording (more specifically, its log mel spectrogram), runs forward pass of the encoder +// part of the transformer model and returns the encoded features +// +// - wctx: the model +// - wstate: the state of the encoder +// - n_threads: number of threads to use +// - mel_offset: offset in the mel spectrogram (i.e. 
audio offset) +// +static bool whisper_encode_internal( + whisper_context & wctx, + whisper_state & wstate, + const int mel_offset, + const int n_threads) { + const int64_t t_start_us = ggml_time_us(); + + // conv + { + auto & alloc = wstate.alloc_conv.alloc; - whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data); - } -#endif -#ifdef WHISPER_USE_OPENVINO - else if (use_openvino) { - wstate.use_buf(ctx0, -1); + ggml_allocr_reset(alloc); - cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx); + ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset); - if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) { - return false; + ggml_allocr_alloc_graph(alloc, gf); + + if (!whisper_encode_external(wstate)) { + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); } } -#endif - - // cur - //{ - // printf("ne0 = %d\n", cur->ne[0]); - // printf("ne1 = %d\n", cur->ne[1]); - // for (int i = 0; i < 10; ++i) { - // printf("%8.4f ", ((float *)(cur->data))[i]); - // } - // printf("... "); - // for (int i = cur->ne[0] - 10; i < cur->ne[0]; ++i) { - // printf("%8.4f ", ((float *)(cur->data))[i]); - // } - // printf("\n"); - //} - - // pre-compute cross-attention memory - { - struct wsp_ggml_cgraph gf = {}; - gf.n_threads = n_threads; - - // TODO: hack to disconnect the encoded features from the previous graph - cur->op = WSP_GGML_OP_NONE; - cur->src0 = nullptr; - cur->src1 = nullptr; - - for (int il = 0; il < model.hparams.n_text_layer; ++il) { - auto& layer = model.layers_decoder[il]; - wstate.use_buf(ctx0, 0); + // encoder + if (!whisper_encode_external(wstate)) { + auto & alloc = wstate.alloc_encode.alloc; - struct wsp_ggml_tensor* Kcross = wsp_ggml_mul_mat(ctx0, - layer.cross_attn_k_w, - cur); + ggml_allocr_reset(alloc); - Kcross = wsp_ggml_scale_inplace(ctx0, Kcross, wsp_ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25))); + ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate); - wstate.use_buf(ctx0, 1); + ggml_allocr_alloc_graph(alloc, gf); - struct wsp_ggml_tensor* Vcross = wsp_ggml_mul_mat(ctx0, - layer.cross_attn_v_w, - cur); +#ifdef GGML_USE_METAL + if (wstate.ctx_metal) { + ggml_metal_set_n_cb (wstate.ctx_metal, n_threads); + ggml_metal_graph_compute(wstate.ctx_metal, gf); + } else { + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); + } +#else + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); +#endif + } - Vcross = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, - layer.cross_attn_v_b, - Vcross), - Vcross); + // cross + { + auto & alloc = wstate.alloc_cross.alloc; - wstate.use_buf(ctx0, -1); + ggml_allocr_reset(alloc); - Vcross = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx)); + ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate); - struct wsp_ggml_tensor * k = wsp_ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (wsp_ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx)); - struct wsp_ggml_tensor * v = wsp_ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state, - ( n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v), - (il*n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v)*n_state); + ggml_allocr_alloc_graph(alloc, gf); - wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Kcross, k)); - wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Vcross, v)); +#ifdef GGML_USE_METAL + if (wstate.ctx_metal) { + ggml_metal_set_n_cb (wstate.ctx_metal, n_threads); + ggml_metal_graph_compute(wstate.ctx_metal, gf); + } else { + 
ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); } - - wsp_ggml_graph_compute(ctx0, &gf); - //wsp_ggml_graph_print(&gf); +#else + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); +#endif } - //////////////////////////////////////////////////////////////////////////// - - //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__, - // wsp_ggml_used_mem(ctx0)/1024.0/1024.0, - // wstate.get_buf_max_mem(0)/1024.0/1024.0, - // wstate.get_buf_max_mem(1)/1024.0/1024.0, - // wstate.get_buf_max_mem(2)/1024.0/1024.0, - // wstate.get_buf_max_mem(3)/1024.0/1024.0); - - wsp_ggml_free(ctx0); + // ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - wstate.t_encode_us += wsp_ggml_time_us() - t_start_us; + wstate.t_encode_us += ggml_time_us() - t_start_us; wstate.n_encode++; return true; } -// evaluate the decoder -// -// given text prompt + audio features -> computes the logits for the next token -// -// - model: the model -// - n_threads: number of threads to use -// - tokens: text prompt -// - n_tokens: number of tokens in the prompt -// - n_past: number of past tokens to prefix the prompt with -// -static bool whisper_decode_internal( - whisper_context & wctx, - whisper_state & wstate, - whisper_decoder & decoder, - const whisper_token * tokens, - const int n_tokens, - const int n_past, - const int n_threads) { - const int64_t t_start_us = wsp_ggml_time_us(); - +static struct ggml_cgraph * whisper_build_graph_decoder( + whisper_context & wctx, + whisper_state & wstate, + whisper_decoder & decoder, + const whisper_token * tokens, + int n_tokens, + int n_past) { const auto & model = wctx.model; const auto & hparams = model.hparams; @@ -1935,10 +2006,6 @@ static bool whisper_decode_internal( WHISPER_ASSERT(!!kv_self.ctx); - auto & logits_out = wstate.logits; - - const int n_vocab = hparams.n_vocab; - const int n_ctx = hparams.n_text_ctx; const int n_state = hparams.n_text_state; const int n_head = hparams.n_text_head; @@ -1949,387 +2016,401 @@ static bool whisper_decode_internal( //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx); - struct wsp_ggml_init_params params = { - /*.mem_size =*/ wstate.buf_compute.size(), - /*.mem_buffer =*/ wstate.buf_compute.data(), - /*.no_alloc =*/ false, + struct ggml_init_params params = { + /*.mem_size =*/ wstate.alloc_decode.meta.size(), + /*.mem_buffer =*/ wstate.alloc_decode.meta.data(), + /*.no_alloc =*/ true, }; - struct wsp_ggml_context * ctx0 = wsp_ggml_init(params); + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + ggml_allocr * alloc = wstate.alloc_decode.alloc; - struct wsp_ggml_cgraph gf = {}; - gf.n_threads = n_threads; + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(alloc, embd); - struct wsp_ggml_tensor * embd = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_I32, N); - memcpy(embd->data, tokens, N*wsp_ggml_element_size(embd)); + if (!ggml_allocr_is_measure(alloc)) { + memcpy(embd->data, tokens, N*ggml_element_size(embd)); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(alloc, position); - struct wsp_ggml_tensor * position = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_I32, N); - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i; + if (!ggml_allocr_is_measure(alloc)) { + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i; + } } - wstate.use_buf(ctx0, 3); + struct ggml_tensor * 
KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(alloc, KQscale); + + if (!ggml_allocr_is_measure(alloc)) { + ggml_set_f32(KQscale, pow(float(n_state)/n_head, -0.25)); + } // token encoding + position encoding - struct wsp_ggml_tensor * cur = - wsp_ggml_add(ctx0, - wsp_ggml_get_rows(ctx0, model.d_te, embd), - wsp_ggml_get_rows(ctx0, model.d_pe, position)); + struct ggml_tensor * cur = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.d_te, embd), + ggml_get_rows(ctx0, model.d_pe, position)); - struct wsp_ggml_tensor * inpL = cur; + struct ggml_tensor * inpL = cur; for (int il = 0; il < n_layer; ++il) { const auto & layer = model.layers_decoder[il]; // norm { - wstate.use_buf(ctx0, 0); - - cur = wsp_ggml_norm(ctx0, inpL); + cur = ggml_norm(ctx0, inpL, hparams.eps); // cur = ln_0_w*cur + ln_0_b - cur = wsp_ggml_add(ctx0, - wsp_ggml_mul(ctx0, - wsp_ggml_repeat(ctx0, layer.attn_ln_0_w, cur), - cur), - wsp_ggml_repeat(ctx0, layer.attn_ln_0_b, cur)); + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + layer.attn_ln_0_w), + layer.attn_ln_0_b); } // self-attention { - struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0, + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.attn_q_w, cur); - Qcur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, - layer.attn_q_b, - Qcur), - Qcur); + Qcur = ggml_add(ctx0, + Qcur, + layer.attn_q_b); - Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + Qcur = ggml_scale(ctx0, Qcur, KQscale); // note: no bias for Key - struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0, + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.attn_k_w, cur); - Kcur = wsp_ggml_scale_inplace(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + Kcur = ggml_scale(ctx0, Kcur, KQscale); // store key and value to memory { - struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0, + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.attn_v_w, cur); - Vcur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, - layer.attn_v_b, - Vcur), - Vcur); + Vcur = ggml_add(ctx0, + Vcur, + layer.attn_v_b); - Vcur = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcur, n_state, N)); + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N)); - struct wsp_ggml_tensor * k = wsp_ggml_view_1d(ctx0, kv_self.k, N*n_state, (wsp_ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past)); - struct wsp_ggml_tensor * v = wsp_ggml_view_2d(ctx0, kv_self.v, N, n_state, - ( n_ctx)*wsp_ggml_element_size(kv_self.v), - (il*n_ctx)*wsp_ggml_element_size(kv_self.v)*n_state + n_past*wsp_ggml_element_size(kv_self.v)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v)); - wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Kcur, k)); - wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Vcur, v)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } // ------ - wstate.use_buf(ctx0, 0); - - struct wsp_ggml_tensor * Q = - wsp_ggml_permute(ctx0, - wsp_ggml_cpy(ctx0, - Qcur, - wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, N)), - 0, 2, 1, 3); - - struct wsp_ggml_tensor * K = - wsp_ggml_permute(ctx0, - wsp_ggml_reshape_3d(ctx0, - wsp_ggml_view_1d(ctx0, kv_self.k, (n_past + 
N)*n_state, il*n_ctx*wsp_ggml_element_size(kv_self.k)*n_state), - n_state/n_head, n_head, n_past + N), + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, Qcur, n_state/n_head, n_head, N), 0, 2, 1, 3); - wstate.use_buf(ctx0, 1); + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_state/n_head, n_past + N, n_head, + ggml_element_size(kv_self.k)*n_state, + ggml_element_size(kv_self.k)*n_state/n_head, + ggml_element_size(kv_self.k)*n_state*n_ctx*il); // K * Q - struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - //struct wsp_ggml_tensor * KQ_scaled = - // wsp_ggml_scale_inplace(ctx0, - // KQ, - // wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head)) - // ); + //struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf_inplace(ctx0, KQ, n_past); + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past); - struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ_masked); + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - struct wsp_ggml_tensor * V = - wsp_ggml_view_3d(ctx0, kv_self.v, + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, n_past + N, n_state/n_head, n_head, - n_ctx*wsp_ggml_element_size(kv_self.v), - n_ctx*wsp_ggml_element_size(kv_self.v)*n_state/n_head, - il*n_ctx*wsp_ggml_element_size(kv_self.v)*n_state); + n_ctx*ggml_element_size(kv_self.v), + n_ctx*ggml_element_size(kv_self.v)*n_state/n_head, + il*n_ctx*ggml_element_size(kv_self.v)*n_state); - struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cur = wsp_ggml_cpy(ctx0, + cur = ggml_cpy(ctx0, KQV_merged, - wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, N)); + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N)); } // projection { - wstate.use_buf(ctx0, 0); - - cur = wsp_ggml_mul_mat(ctx0, + cur = ggml_mul_mat(ctx0, layer.attn_ln_1_w, cur); - wstate.use_buf(ctx0, 1); - - cur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, layer.attn_ln_1_b, cur), - cur); + cur = ggml_add(ctx0, + cur, + layer.attn_ln_1_b); } - wstate.use_buf(ctx0, 2); - // add the input - struct wsp_ggml_tensor * inpCA = wsp_ggml_add(ctx0, cur, inpL); + struct ggml_tensor * inpCA = ggml_add(ctx0, cur, inpL); // norm { - wstate.use_buf(ctx0, 0); - - cur = wsp_ggml_norm(ctx0, inpCA); // note: we use inpCA here + cur = ggml_norm(ctx0, inpCA, hparams.eps); // note: we use inpCA here // cur = ln_0_w*cur + ln_0_b - cur = wsp_ggml_add(ctx0, - wsp_ggml_mul(ctx0, - wsp_ggml_repeat(ctx0, layer.cross_attn_ln_0_w, cur), - cur), - wsp_ggml_repeat(ctx0, layer.cross_attn_ln_0_b, cur)); + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + layer.cross_attn_ln_0_w), + layer.cross_attn_ln_0_b); } // cross-attention { - struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0, + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.cross_attn_q_w, cur); - Qcur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, - layer.cross_attn_q_b, - Qcur), - Qcur); + Qcur = ggml_add(ctx0, + Qcur, + layer.cross_attn_q_b); - Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + Qcur = ggml_scale(ctx0, Qcur, KQscale); // Kcross is already scaled - struct wsp_ggml_tensor * Kcross = - 
wsp_ggml_reshape_3d(ctx0, - wsp_ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*wsp_ggml_element_size(wstate.kv_cross.k)*n_state), - n_state/n_head, n_head, M); - - //struct wsp_ggml_tensor * Vcross = - // wsp_ggml_reshape_3d(ctx0, - // wsp_ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state), + struct ggml_tensor * Kcross = + ggml_view_3d(ctx0, wstate.kv_cross.k, + n_state/n_head, M, n_head, + ggml_element_size(wstate.kv_cross.k)*n_state, + ggml_element_size(wstate.kv_cross.k)*n_state/n_head, + ggml_element_size(wstate.kv_cross.k)*n_state*M*il); + + //struct ggml_tensor * Vcross = + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state), // n_state/n_head, n_head, M); - //struct wsp_ggml_tensor * V_trans = - // wsp_ggml_cpy(ctx0, - // wsp_ggml_permute(ctx0, Vcross, 1, 2, 0, 3), - // wsp_ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head)); + //struct ggml_tensor * V_trans = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, Vcross, 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head)); - struct wsp_ggml_tensor * V = - wsp_ggml_view_3d(ctx0, wstate.kv_cross.v, + struct ggml_tensor * V = + ggml_view_3d(ctx0, wstate.kv_cross.v, M, n_state/n_head, n_head, - M*wsp_ggml_element_size(wstate.kv_cross.v), - M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state/n_head, - il*M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state); + M*ggml_element_size(wstate.kv_cross.v), + M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head, + il*M*ggml_element_size(wstate.kv_cross.v)*n_state); // ------ - struct wsp_ggml_tensor * Q = - wsp_ggml_permute(ctx0, - wsp_ggml_cpy(ctx0, - Qcur, - wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, N)), + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, Qcur, n_state/n_head, n_head, N), 0, 2, 1, 3); - struct wsp_ggml_tensor * K = wsp_ggml_permute(ctx0, Kcross, 0, 2, 1, 3); - // K * Q - struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, Kcross, Q); - //struct wsp_ggml_tensor * KQ_scaled = - // wsp_ggml_scale_inplace(ctx0, + //struct ggml_tensor * KQ_scaled = + // ggml_scale(ctx0, // KQ, - // wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head)) + // ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head)) // ); // no masking for cross-attention - //struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ); - struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_state, N) - cur = wsp_ggml_cpy(ctx0, + cur = ggml_cpy(ctx0, KQV_merged, - wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, N)); + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N)); } // projection { - wstate.use_buf(ctx0, 0); - - cur = wsp_ggml_mul_mat(ctx0, + cur = ggml_mul_mat(ctx0, layer.cross_attn_ln_1_w, cur); - wstate.use_buf(ctx0, 1); - - cur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur), - 
cur); + cur = ggml_add(ctx0, + cur, + layer.cross_attn_ln_1_b); } - wstate.use_buf(ctx0, 2); - // add the input - cur = wsp_ggml_add(ctx0, cur, inpCA); + cur = ggml_add(ctx0, cur, inpCA); - struct wsp_ggml_tensor * inpFF = cur; + struct ggml_tensor * inpFF = cur; // feed-forward network { // norm { - wstate.use_buf(ctx0, 0); - - cur = wsp_ggml_norm(ctx0, inpFF); - - wstate.use_buf(ctx0, 1); + cur = ggml_norm(ctx0, inpFF, hparams.eps); // cur = mlp_ln_w*cur + mlp_ln_b - cur = wsp_ggml_add(ctx0, - wsp_ggml_mul(ctx0, - wsp_ggml_repeat(ctx0, layer.mlp_ln_w, cur), - cur), - wsp_ggml_repeat(ctx0, layer.mlp_ln_b, cur)); + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + layer.mlp_ln_w), + layer.mlp_ln_b); } - wstate.use_buf(ctx0, 0); - // fully connected - cur = wsp_ggml_mul_mat(ctx0, + cur = ggml_mul_mat(ctx0, layer.mlp_0_w, cur); - wstate.use_buf(ctx0, 1); - - cur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, layer.mlp_0_b, cur), - cur); - - wstate.use_buf(ctx0, 0); + cur = ggml_add(ctx0, + cur, + layer.mlp_0_b); // GELU activation - cur = wsp_ggml_gelu(ctx0, cur); - - wstate.use_buf(ctx0, 1); + cur = ggml_gelu(ctx0, cur); // projection - cur = wsp_ggml_mul_mat(ctx0, + cur = ggml_mul_mat(ctx0, layer.mlp_1_w, cur); - wstate.use_buf(ctx0, 0); - - cur = wsp_ggml_add(ctx0, - wsp_ggml_repeat(ctx0, layer.mlp_1_b, cur), - cur); + cur = ggml_add(ctx0, + cur, + layer.mlp_1_b); } - wstate.use_buf(ctx0, 3); - - inpL = wsp_ggml_add(ctx0, cur, inpFF); + inpL = ggml_add(ctx0, cur, inpFF); } cur = inpL; // norm { - wstate.use_buf(ctx0, 0); - - cur = wsp_ggml_norm(ctx0, cur); + cur = ggml_norm(ctx0, cur, hparams.eps); - wstate.use_buf(ctx0, 1); - - cur = wsp_ggml_add(ctx0, - wsp_ggml_mul(ctx0, - wsp_ggml_repeat(ctx0, model.d_ln_w, cur), - cur), - wsp_ggml_repeat(ctx0, model.d_ln_b, cur)); + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + model.d_ln_w), + model.d_ln_b); } - wstate.use_buf(ctx0, 0); - // compute logits only for the last token // comment this line to compute logits for all N tokens // might be useful in the future - cur = wsp_ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]); + cur = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]); + + struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur); + + ggml_build_forward_expand(gf, logits); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the decoder +// +// given text prompt + audio features -> computes the logits for the next token +// +// - model: the model +// - n_threads: number of threads to use +// - tokens: text prompt +// - n_tokens: number of tokens in the prompt +// - n_past: number of past tokens to prefix the prompt with +// +static bool whisper_decode_internal( + whisper_context & wctx, + whisper_state & wstate, + whisper_decoder & decoder, + const whisper_token * tokens, + const int n_tokens, + const int n_past, + const int n_threads) { + const int64_t t_start_us = ggml_time_us(); + + const auto & model = wctx.model; + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; - struct wsp_ggml_tensor * logits = wsp_ggml_mul_mat(ctx0, model.d_te, cur); + auto & logits_out = wstate.logits; - wstate.use_buf(ctx0, -1); + struct ggml_tensor * logits; - // run the computation + // decoder { - wsp_ggml_build_forward_expand(&gf, logits); - wsp_ggml_graph_compute (ctx0, &gf); + auto & alloc = wstate.alloc_decode.alloc; + + ggml_allocr_reset(alloc); + + ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, decoder, tokens, n_tokens, n_past); + + 
ggml_allocr_alloc_graph(alloc, gf); + + logits = gf->nodes[gf->n_nodes - 1]; + +#ifdef GGML_USE_METAL + if (wstate.ctx_metal) { + ggml_metal_set_n_cb (wstate.ctx_metal, n_threads); + ggml_metal_graph_compute(wstate.ctx_metal, gf); + } else { + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); + } +#else + ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads); +#endif } // extract logits for all N tokens - //logits_out.resize(N*n_vocab); - //memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*N*n_vocab); + //logits_out.resize(n_tokens*n_vocab); + //memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*n_tokens*n_vocab); // extract logits only for the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*n_vocab); + memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*n_vocab); - if (N > 1) { + if (n_tokens > 1) { //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__, - // wsp_ggml_used_mem(ctx0)/1024.0/1024.0, + // ggml_used_mem(ctx0)/1024.0/1024.0, // wstate.get_buf_max_mem(0)/1024.0/1024.0, // wstate.get_buf_max_mem(1)/1024.0/1024.0, // wstate.get_buf_max_mem(2)/1024.0/1024.0, // wstate.get_buf_max_mem(3)/1024.0/1024.0); } - wsp_ggml_free(ctx0); - - wstate.t_decode_us += wsp_ggml_time_us() - t_start_us; - wstate.n_decode++; + if (n_tokens == 1) { + wstate.t_decode_us += ggml_time_us() - t_start_us; + wstate.n_decode++; + } else { + wstate.t_prompt_us += ggml_time_us() - t_start_us; + wstate.n_prompt++; + } return true; } + // 500 -> 00:05.000 // 6000 -> 01:00.000 static std::string to_timestamp(int64_t t, bool comma = false) { @@ -2351,7 +2432,7 @@ static std::string to_timestamp(int64_t t, bool comma = false) { static float sin_vals[SIN_COS_N_COUNT]; static float cos_vals[SIN_COS_N_COUNT]; -// In FFT, we frequently use sine and cosine operations with the same values. +// In FFT, we frequently use sine and cosine operations with the same values. // We can use precalculated values to speed up the process. static void fill_sin_cos_table() { static bool is_filled = false; @@ -2446,7 +2527,7 @@ static void fft(const std::vector & in, std::vector & out) { } static bool hann_window(int length, bool periodic, std::vector & output) { - if (output.size() < length) { + if (output.size() < static_cast(length)) { output.resize(length); } int offset = -1; @@ -2538,7 +2619,7 @@ static bool log_mel_spectrogram( const whisper_filters & filters, const bool debug, whisper_mel & mel) { - const int64_t t_start_us = wsp_ggml_time_us(); + const int64_t t_start_us = ggml_time_us(); // Hanning window (Use cosf to eliminate difference) // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html @@ -2606,7 +2687,7 @@ static bool log_mel_spectrogram( mel.data[i] = (mel.data[i] + 4.0)/4.0; } - wstate.t_mel_us += wsp_ggml_time_us() - t_start_us; + wstate.t_mel_us += ggml_time_us() - t_start_us; // Dump log_mel_spectrogram if (debug) { @@ -2738,27 +2819,25 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { fill_sin_cos_table(); whisper_state * state = new whisper_state; - const size_t scale = ctx->model.hparams.ftype ? 
1 : 2; - - if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) { + if (!kv_cache_init(ctx->model.hparams, state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) { log("%s: kv_cache_init() failed for self-attention cache\n", __func__); delete state; return nullptr; } { - const size_t memory_size = wsp_ggml_nbytes(state->decoders[0].kv_self.k) + wsp_ggml_nbytes(state->decoders[0].kv_self.v); + const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v); log("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } - if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) { + if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) { log("%s: kv_cache_init() failed for cross-attention cache\n", __func__); delete state; return nullptr; } { - const size_t memory_size = wsp_ggml_nbytes(state->kv_cross.k) + wsp_ggml_nbytes(state->kv_cross.v); + const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v); log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } @@ -2772,6 +2851,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { if (!state->ctx_coreml) { log("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str()); #ifndef WHISPER_COREML_ALLOW_FALLBACK + delete state; return nullptr; #endif } else { @@ -2786,15 +2866,111 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { // TAGS: WHISPER_DECODER_INIT state->decoders[0].sequence.tokens.reserve(ctx->model.hparams.n_text_ctx); - state->decoders[0].probs.reserve(ctx->vocab.n_vocab); - state->decoders[0].logits.reserve(ctx->vocab.n_vocab); + state->decoders[0].probs.reserve (ctx->vocab.n_vocab); + state->decoders[0].logits.reserve (ctx->vocab.n_vocab); state->decoders[0].logprobs.reserve(ctx->vocab.n_vocab); - state->buf_compute.resize(scale * std::max(MEM_REQ_ENCODE.at(ctx->model.type), MEM_REQ_DECODE.at(ctx->model.type))); - state->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type)); - state->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type)); - state->buf_scratch[2].resize(MEM_REQ_SCRATCH2.at(ctx->model.type)); - state->buf_scratch[3].resize(MEM_REQ_SCRATCH3.at(ctx->model.type)); + // conv allocator + { + whisper_allocr_graph_init(state->alloc_conv, + [&]() { + return whisper_build_graph_conv(*ctx, *state, 0); + }); + + log("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1024.0 / 1024.0); + } + + // encoder allocator + if (!whisper_encode_external(*state)) { + whisper_allocr_graph_init(state->alloc_encode, + [&]() { + return whisper_build_graph_encoder(*ctx, *state); + }); + + log("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1024.0 / 1024.0); + } + + // cross allocator + { + whisper_allocr_graph_init(state->alloc_cross, + [&]() { + return whisper_build_graph_cross(*ctx, *state); + }); + + log("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1024.0 / 1024.0); + } + + // decoder allocator + { + whisper_allocr_graph_init(state->alloc_decode, + [&]() { + const auto & hparams = ctx->model.hparams; + + // TODO: make sure this is the worst-case scenario + const 
int n_tokens = hparams.n_text_ctx; + const int n_past = 0; + + return whisper_build_graph_decoder(*ctx, *state, state->decoders[0], nullptr, n_tokens, n_past); + }); + + log("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1024.0 / 1024.0); + } + +#ifdef GGML_USE_METAL + state->ctx_metal = ggml_metal_init(1); + if (!state->ctx_metal) { + log("%s: ggml_metal_init() failed\n", __func__); + delete state; + return nullptr; + } + + log("%s: Metal context initialized\n", __func__); + + // this allocates all Metal resources and memory buffers + + void * data_ptr = NULL; + size_t data_size = 0; + + // TODO: add mmap support + //if (params.use_mmap) { + // data_ptr = ctx->model.mapping->addr; + // data_size = ctx->model.mapping->size; + //} else { + // data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + // data_size = ggml_get_mem_size (ctx->model.ctx); + //} + + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size = ggml_get_mem_size (ctx->model.ctx); + + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); + + log("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + +#define WHISPER_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + log("%s: failed to add metal buffer\n", __func__); \ + delete state; \ + return nullptr; \ + } + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data", data_ptr, data_size, max_size)); + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_conv", state->alloc_conv.meta.data(), state->alloc_conv.meta.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_encode", state->alloc_encode.meta.data(), state->alloc_encode.meta.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_cross", state->alloc_cross.meta.data(), state->alloc_cross.meta.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "meta_decode", state->alloc_decode.meta.data(), state->alloc_decode.meta.size(), 0)); + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_conv", state->alloc_conv.data.data(), state->alloc_conv.data.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_encode", state->alloc_encode.data.data(), state->alloc_encode.data.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_cross", state->alloc_cross.data.data(), state->alloc_cross.data.size(), 0)); + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "data_decode", state->alloc_decode.data.data(), state->alloc_decode.data.size(), 0)); + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_cross", state->kv_cross.buf.data(), state->kv_cross.buf.size(), 0)); + + WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, "kv_self_0", state->decoders[0].kv_self.buf.data(), state->decoders[0].kv_self.buf.size(), 0)); +#undef WHISPER_METAL_CHECK_BUF +#endif state->rng = std::mt19937(0); @@ -2851,7 +3027,6 @@ int whisper_ctx_init_openvino_encoder( } struct whisper_context * whisper_init_from_file_no_state(const char * path_model) { - log("%s: loading model from '%s'\n", __func__, path_model); auto fin = std::ifstream(path_model, std::ios::binary); @@ -2927,7 +3102,7 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t } struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) { - wsp_ggml_time_init(); + ggml_time_init(); whisper_context * ctx = new whisper_context; 
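For reference, the state-initialization hunks above replace the old fixed buf_compute / buf_scratch buffers with one ggml-alloc based graph allocator per graph (conv, encode, cross, decode). The flow is: run each graph builder once against a measuring allocator to find the worst-case compute size, allocate a real buffer of that size, then at run time reset the allocator, rebuild the graph with no_alloc = true, and let ggml_allocr_alloc_graph place every tensor inside that buffer before computing. The following is a minimal sketch of that pattern, using the internal names that appear in this diff (whisper_allocr with alloc/meta/data members, whisper_build_graph_encoder, ggml_graph_compute_helper) and the ggml-alloc API from cpp/ggml-alloc.c; the metadata sizing is illustrative, not the exact upstream expression.

    // one-time init: measure the graph, then create the real allocator
    static void whisper_allocr_graph_init(struct whisper_allocr & allocr,
                                          std::function<struct ggml_cgraph *()> && get_graph) {
        const int tensor_alignment = 32;

        // metadata buffer for tensor/graph structs (graphs are built with no_alloc = true)
        allocr.meta.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());

        // measuring allocator: records tensor sizes and lifetimes without real memory
        allocr.alloc = ggml_allocr_new_measure(tensor_alignment);
        const size_t alloc_size = ggml_allocr_alloc_graph(allocr.alloc, get_graph()) + tensor_alignment;
        ggml_allocr_free(allocr.alloc);

        // real allocator backed by a buffer of the measured worst-case size
        allocr.data.resize(alloc_size);
        allocr.alloc = ggml_allocr_new(allocr.data.data(), allocr.data.size(), tensor_alignment);
    }

    // per evaluation (mirrors whisper_encode_internal above): reset, rebuild, place, compute
    static void encode_with_allocr(whisper_context & wctx, whisper_state & wstate, int n_threads) {
        ggml_allocr_reset(wstate.alloc_encode.alloc);

        // inside the builders, input writes are guarded with
        // if (!ggml_allocr_is_measure(alloc)) { ... } so the measure pass never
        // dereferences the placeholder data pointers
        ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate);

        // assigns an address inside wstate.alloc_encode.data to every tensor in gf
        ggml_allocr_alloc_graph(wstate.alloc_encode.alloc, gf);

        ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
    }

This is also why the Metal path above registers both the meta and data buffers of every allocator with ggml_metal_add_buffer: the GPU graph references tensors whose storage lives in those host-side vectors.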
@@ -3004,6 +3179,13 @@ void whisper_free_state(struct whisper_state * state) } #endif +#ifdef GGML_USE_METAL + if (state->ctx_metal) { + ggml_metal_free(state->ctx_metal); + state->ctx_metal = nullptr; + } +#endif + #ifdef WHISPER_USE_OPENVINO if (state->ctx_openvino != nullptr) { whisper_openvino_free(state->ctx_openvino); @@ -3011,6 +3193,11 @@ void whisper_free_state(struct whisper_state * state) } #endif + whisper_allocr_free(state->alloc_conv); + whisper_allocr_free(state->alloc_decode); + whisper_allocr_free(state->alloc_cross); + whisper_allocr_free(state->alloc_encode); + delete state; } } @@ -3018,7 +3205,7 @@ void whisper_free_state(struct whisper_state * state) void whisper_free(struct whisper_context * ctx) { if (ctx) { if (ctx->model.ctx) { - wsp_ggml_free(ctx->model.ctx); + ggml_free(ctx->model.ctx); } if (ctx->model.buf) { delete ctx->model.buf; @@ -3422,7 +3609,7 @@ whisper_token whisper_token_transcribe(struct whisper_context * ctx) { } void whisper_print_timings(struct whisper_context * ctx) { - const int64_t t_end_us = wsp_ggml_time_us(); + const int64_t t_end_us = ggml_time_us(); log("\n"); log("%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f); @@ -3431,12 +3618,14 @@ void whisper_print_timings(struct whisper_context * ctx) { const int32_t n_sample = std::max(1, ctx->state->n_sample); const int32_t n_encode = std::max(1, ctx->state->n_encode); const int32_t n_decode = std::max(1, ctx->state->n_decode); + const int32_t n_prompt = std::max(1, ctx->state->n_prompt); log("%s: fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h); log("%s: mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f); log("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample); log("%s: encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode); log("%s: decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode); + log("%s: prompt time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_prompt_us, n_prompt, 1e-3f * ctx->state->t_prompt_us / n_prompt); } log("%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f); } @@ -3446,6 +3635,11 @@ void whisper_reset_timings(struct whisper_context * ctx) { ctx->state->t_sample_us = 0; ctx->state->t_encode_us = 0; ctx->state->t_decode_us = 0; + ctx->state->t_prompt_us = 0; + ctx->state->n_sample = 0; + ctx->state->n_encode = 0; + ctx->state->n_decode = 0; + ctx->state->n_prompt = 0; } } @@ -3469,19 +3663,20 @@ const char * whisper_print_system_info(void) { static std::string s; s = ""; - s += "AVX = " + std::to_string(wsp_ggml_cpu_has_avx()) + " | "; - s += "AVX2 = " + std::to_string(wsp_ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(wsp_ggml_cpu_has_avx512()) + " | "; - s += "FMA = " + std::to_string(wsp_ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(wsp_ggml_cpu_has_neon()) + " | "; - s += "ARM_FMA = " + std::to_string(wsp_ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(wsp_ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(wsp_ggml_cpu_has_fp16_va()) + " | "; - s += "WASM_SIMD = " + std::to_string(wsp_ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(wsp_ggml_cpu_has_blas()) + " 
| "; - s += "SSE3 = " + std::to_string(wsp_ggml_cpu_has_sse3()) + " | "; - s += "SSSE3 = " + std::to_string(wsp_ggml_cpu_has_ssse3()) + " | "; - s += "VSX = " + std::to_string(wsp_ggml_cpu_has_vsx()) + " | "; + s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; + s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; + s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; + s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; + s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; + s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; + s += "METAL = " + std::to_string(ggml_cpu_has_metal()) + " | "; + s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; + s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; + s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; + s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; + s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; s += "COREML = " + std::to_string(whisper_has_coreml()) + " | "; s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | "; @@ -3726,6 +3921,7 @@ static void whisper_process_logits( // [TDRZ] when tinydiarize is disabled, suppress solm token if (params.tdrz_enable == false) { + log("%s: TDRZ disabled, suppressing solm token\n", __func__); logits[vocab.token_solm] = -INFINITY; } @@ -3970,17 +4166,21 @@ static std::vector whisper_sample_token_topk( auto & logits_id = state.logits_id; - logits_id.clear(); + logits_id.resize(n_logits); for (int i = 0; i < n_logits; ++i) { - logits_id.push_back({ logits[i], i }); + logits_id[i].first = logits[i]; + logits_id[i].second = i; } - std::partial_sort( - logits_id.begin(), - logits_id.begin() + k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); + { + using pair_type = std::remove_reference::type::value_type; + std::partial_sort( + logits_id.begin(), + logits_id.begin() + k, logits_id.end(), + [](const pair_type & a, const pair_type & b) { + return a.first > b.first; + }); + } std::vector result; result.reserve(k); @@ -4075,6 +4275,115 @@ static void whisper_sequence_score( } } +static bool whisper_kv_swap_fast( + std::vector & view, + whisper_decoder src[], + std::vector & kv_swap_bufs, + const int & n_decoders) { + WHISPER_PRINT_DEBUG("%s: n_decoders %d\n", __func__, n_decoders); + + // (decoder->buffer->decoder or decoder->buffer + decoder->decoder) + std::set two_copy; // decoder indices require two copies to safely modify KV caches + + // (buffer->decoder or decoder->decoder) + std::set one_copy; // decoder indices require one copy to safely modify KV caches + + // (decoder<->decoder) + std::set p_swap_set; // decoder indices able to swap KV-cache pointers + std::vector> p_swap_vec; + p_swap_vec.reserve(n_decoders); + + // see https://github.com/ggerganov/whisper.cpp/wiki + for (int i = 0; i < n_decoders; i++) { + // zero-copy (no modification) + if (i == view[i] || view[i] < 0) { + continue; + } + + bool is_one_copy = true; + // since we modify data sequentially, we only consider decoder indices after current index + for (int j = i + 1; j < n_decoders; j++) { + if (i == view[j]) { + // detect symmetric diagram + if (j == view[i]) { + p_swap_set.insert(i); + p_swap_set.insert(j); + p_swap_vec.emplace_back(i, j); + } else { + two_copy.insert(i); + is_one_copy = false; + } + break; 
+ } + } + if (is_one_copy) { + one_copy.insert(i); + } + } + + kv_swap_bufs.resize(n_decoders); + + for (int i = 0; i < n_decoders; i++) { + kv_swap_bufs[i].k.resize(ggml_nbytes(src[i].kv_self.k)); + kv_swap_bufs[i].v.resize(ggml_nbytes(src[i].kv_self.v)); + } + + for (auto & i : two_copy) { + // make a copy of KV caches + WHISPER_PRINT_DEBUG("%s: store KV cache into swap: idx %d\n", __func__, i); + memcpy(kv_swap_bufs[i].k.data(), src[i].kv_self.k->data, kv_swap_bufs[i].k.size()); + memcpy(kv_swap_bufs[i].v.data(), src[i].kv_self.v->data, kv_swap_bufs[i].v.size()); + } + + // since two-copy decoder KV caches are protected by kv_swap_bufs, modify them first + for (auto & i : two_copy) { + // skip the decoder indices that require pointer swapping + if (p_swap_set.find(i) != p_swap_set.end()) { + continue; + } + + if (two_copy.find(view[i]) != two_copy.end()) { + // modify KV caches of decoder using data from kv_swap_bufs + WHISPER_PRINT_DEBUG("%s: two-copy decoder using swap buffers: swap[%d] -> %d\n", __func__, view[i], i); + memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size()); + memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size()); + } else { + // modify KV caches of decoder using data from correspond decoder KV caches directly + WHISPER_PRINT_DEBUG("%s: two-copy decoder without swap buffers: %d -> %d\n", __func__, view[i], i); + memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k)); + memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v)); + } + } + + // then modify one-copy decoder KV caches + for (auto & i : one_copy) { + // skip the decoder indices that require pointer swapping + if (p_swap_set.find(i) != p_swap_set.end()) { + continue; + } + + if (two_copy.find(view[i]) != two_copy.end()) { + // modify KV caches of decoder using data from kv_swap_bufs + WHISPER_PRINT_DEBUG("%s: one-copy decoder using swap buffers: swap[%d] -> %d\n", __func__, view[i], i); + memcpy(src[i].kv_self.k->data, kv_swap_bufs[view[i]].k.data(), kv_swap_bufs[view[i]].k.size()); + memcpy(src[i].kv_self.v->data, kv_swap_bufs[view[i]].v.data(), kv_swap_bufs[view[i]].v.size()); + } else { + // modify KV caches of decoder using data from correspond decoder KV caches directly + WHISPER_PRINT_DEBUG("%s: one-copy decoder without swap buffers: %d -> %d\n", __func__, view[i], i); + memcpy(src[i].kv_self.k->data, src[view[i]].kv_self.k->data, ggml_nbytes(src[view[i]].kv_self.k)); + memcpy(src[i].kv_self.v->data, src[view[i]].kv_self.v->data, ggml_nbytes(src[view[i]].kv_self.v)); + } + } + + // swap the pointers + for (auto & i : p_swap_vec) { + WHISPER_PRINT_DEBUG("%s: swap pointers: %d <-> %d\n", __func__, i.first, i.second); + std::swap(src[i.first].kv_self, src[i.second].kv_self); + } + + return true; +} + int whisper_full_with_state( struct whisper_context * ctx, struct whisper_state * state, @@ -4182,6 +4491,21 @@ int whisper_full_with_state( decoder.probs.resize (ctx->vocab.n_vocab); decoder.logits.resize (ctx->vocab.n_vocab); decoder.logprobs.resize(ctx->vocab.n_vocab); + + // TODO: not very clean - look for a better way and potentially merging with the init of decoder 0 +#ifdef GGML_USE_METAL +#define WHISPER_METAL_CHECK_BUF(result) \ + if (!(result)) { \ + log("%s: failed to add metal buffer\n", __func__); \ + return 0; \ + } + + const std::string kv_name = "kv_self_" + std::to_string(j); + auto & kv_self = decoder.kv_self; + + 
WHISPER_METAL_CHECK_BUF(ggml_metal_add_buffer(state->ctx_metal, kv_name.c_str(), kv_self.buf.data(), kv_self.buf.size(), 0)); +#undef WHISPER_METAL_CHECK_BUF +#endif } } @@ -4197,7 +4521,7 @@ int whisper_full_with_state( // initial prompt if (!params.prompt_tokens && params.initial_prompt) { - prompt_tokens.resize(1024); + prompt_tokens.resize(2048); prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size())); params.prompt_tokens = prompt_tokens.data(); params.prompt_n_tokens = prompt_tokens.size(); @@ -4238,14 +4562,6 @@ int whisper_full_with_state( std::vector prompt; prompt.reserve(whisper_n_text_ctx(ctx)); - // beam-search helpers - struct kv_buf { - std::vector k; - std::vector v; - }; - - std::vector kv_bufs; - struct beam_candidate { int decoder_idx; int seek_delta; @@ -4368,7 +4684,7 @@ int whisper_full_with_state( } { - const int64_t t_start_sample_us = wsp_ggml_time_us(); + const int64_t t_start_sample_us = ggml_time_us(); whisper_process_logits(*ctx, *state, params, state->decoders[0], t_cur); @@ -4377,40 +4693,24 @@ int whisper_full_with_state( for (int j = 1; j < n_decoders_cur; ++j) { auto & decoder = state->decoders[j]; - memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, wsp_ggml_nbytes(decoder.kv_self.k)); - memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, wsp_ggml_nbytes(decoder.kv_self.v)); + memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, ggml_nbytes(decoder.kv_self.k)); + memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, ggml_nbytes(decoder.kv_self.v)); decoder.kv_self.n += prompt.size(); - memcpy(decoder.probs.data(), state->decoders[0].probs.data(), decoder.probs.size()*sizeof(decoder.probs[0])); - memcpy(decoder.logits.data(), state->decoders[0].logits.data(), decoder.logits.size()*sizeof(decoder.logits[0])); + memcpy(decoder.probs.data(), state->decoders[0].probs.data(), decoder.probs.size()*sizeof(decoder.probs[0])); + memcpy(decoder.logits.data(), state->decoders[0].logits.data(), decoder.logits.size()*sizeof(decoder.logits[0])); memcpy(decoder.logprobs.data(), state->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0])); } - state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us; + state->t_sample_us += ggml_time_us() - t_start_sample_us; } } for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) { - const int64_t t_start_sample_us = wsp_ggml_time_us(); + const int64_t t_start_sample_us = ggml_time_us(); - // store the KV caches of all decoders when doing beam-search if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) { - kv_bufs.resize(n_decoders_cur); - for (int j = 0; j < n_decoders_cur; ++j) { - auto & decoder = state->decoders[j]; - - if (decoder.completed || decoder.failed) { - continue; - } - - kv_bufs[j].k.resize(wsp_ggml_nbytes(decoder.kv_self.k)); - kv_bufs[j].v.resize(wsp_ggml_nbytes(decoder.kv_self.v)); - - memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size()); - memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size()); - } - beam_candidates.clear(); } @@ -4458,6 +4758,7 @@ int whisper_full_with_state( }); uint32_t cur_c = 0; + std::vector decoder_idx(n_decoders_cur, -1); for (int j = 0; j < n_decoders_cur; ++j) { auto & decoder = state->decoders[j]; @@ -4476,12 +4777,13 @@ int whisper_full_with_state( decoder.seek_delta = cur.seek_delta; decoder.has_ts = cur.has_ts; - memcpy(decoder.kv_self.k->data, 
kv_bufs[cur.decoder_idx].k.data(), kv_bufs[cur.decoder_idx].k.size()); - memcpy(decoder.kv_self.v->data, kv_bufs[cur.decoder_idx].v.data(), kv_bufs[cur.decoder_idx].v.size()); - + decoder_idx[j] = cur.decoder_idx; WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n", __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all); } + + // update KV caches + whisper_kv_swap_fast(decoder_idx, state->decoders, state->kv_swap_bufs, n_decoders_cur); } // update the decoder state @@ -4585,7 +4887,7 @@ int whisper_full_with_state( } } - state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us; + state->t_sample_us += ggml_time_us() - t_start_sample_us; // obtain logits for the next token for (int j = 0; j < n_decoders_cur; ++j) { @@ -4606,13 +4908,13 @@ int whisper_full_with_state( } { - const int64_t t_start_sample_us = wsp_ggml_time_us(); + const int64_t t_start_sample_us = ggml_time_us(); whisper_process_logits(*ctx, *state, params, decoder, t_cur); ++decoder.kv_self.n; - state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us; + state->t_sample_us += ggml_time_us() - t_start_sample_us; } } } @@ -4718,7 +5020,9 @@ int whisper_full_with_state( } // [TDRZ] record if speaker turn was predicted after current segment + log("%s: tdrz enabled: %s", __func__, params.tdrz_enable ? "true" : "false"); if (params.tdrz_enable && tokens_cur[i].id == whisper_token_solm(ctx)) { + log("%s: speaker turn predicted after current segment %d\n", __func__, i); speaker_turn_next = true; } @@ -4910,6 +5214,12 @@ int whisper_full_parallel( ctx->state->t_sample_us += states[i]->t_sample_us; ctx->state->t_encode_us += states[i]->t_encode_us; ctx->state->t_decode_us += states[i]->t_decode_us; + ctx->state->t_prompt_us += states[i]->t_prompt_us; + + ctx->state->n_sample += states[i]->n_sample; + ctx->state->n_encode += states[i]->n_encode; + ctx->state->n_decode += states[i]->n_decode; + ctx->state->n_prompt += states[i]->n_prompt; whisper_free_state(states[i]); } @@ -5032,7 +5342,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { s = ""; char strbuf[256]; - wsp_ggml_time_init(); + ggml_time_init(); size_t n = 20; size_t arr = n_threads > 0 ? 
1024llu : n_threads; // trick to avoid compiler optimizations @@ -5053,11 +5363,11 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { double sum = 0.0; for (size_t i = 0; i < n; i++) { - const int64_t t0 = wsp_ggml_time_us(); + const int64_t t0 = ggml_time_us(); memcpy(dst, src, size); - const int64_t t1 = wsp_ggml_time_us(); + const int64_t t1 = ggml_time_us(); tsum += (t1 - t0)*1e-6; @@ -5082,17 +5392,17 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { return s.c_str(); } -WHISPER_API int whisper_bench_wsp_ggml_mul_mat(int n_threads) { - fputs(whisper_bench_wsp_ggml_mul_mat_str(n_threads), stderr); +WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) { + fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr); return 0; } -WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) { +WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { static std::string s; s = ""; char strbuf[256]; - wsp_ggml_time_init(); + ggml_time_init(); const int n_max = 128; @@ -5106,7 +5416,8 @@ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) { // b: N*N*sizeof(float) // c: N*N*sizeof(float) // when F16 is used, there is an extra work buffer of size N*N*sizeof(float) - std::vector buf(4llu*N_max*N_max*sizeof(float) + 4*512); + std::vector buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead()); + std::vector work; // put a bunch of random data in the buffer for (size_t i = 0; i < buf.size(); i++) buf[i] = i; @@ -5132,45 +5443,43 @@ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) { const size_t N = sizes[j]; for (int k = 0; k < 7; ++k) { - const wsp_ggml_type wtype = - k == 0 ? WSP_GGML_TYPE_Q4_0 : - k == 1 ? WSP_GGML_TYPE_Q4_1 : - k == 2 ? WSP_GGML_TYPE_Q5_0 : - k == 3 ? WSP_GGML_TYPE_Q5_1 : - k == 4 ? WSP_GGML_TYPE_Q8_0 : - k == 5 ? WSP_GGML_TYPE_F16 : WSP_GGML_TYPE_F32; + const ggml_type wtype = + k == 0 ? GGML_TYPE_Q4_0 : + k == 1 ? GGML_TYPE_Q4_1 : + k == 2 ? GGML_TYPE_Q5_0 : + k == 3 ? GGML_TYPE_Q5_1 : + k == 4 ? GGML_TYPE_Q8_0 : + k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32; double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32; int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? 
n_fp16 : /*k == 6*/ n_fp32; - struct wsp_ggml_init_params gparams = { + struct ggml_init_params gparams = { /*.mem_size =*/ buf.size(), /*.mem_buffer =*/ buf.data(), /*.no_alloc =*/ false, }; - struct wsp_ggml_context * ctx0 = wsp_ggml_init(gparams); + struct ggml_context * ctx0 = ggml_init(gparams); - struct wsp_ggml_tensor * a = wsp_ggml_new_tensor_2d(ctx0, wtype, N, N); - struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, N, N); + struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N); + struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N); - struct wsp_ggml_tensor * c = wsp_ggml_mul_mat(ctx0, a, b); + struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b); - struct wsp_ggml_cgraph gf = wsp_ggml_build_forward(c); - - gf.n_threads = n_threads; + struct ggml_cgraph gf = ggml_build_forward(c); double tsum = 0.0; // heat-up - wsp_ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(work, &gf, n_threads); for (int i = 0; i < n_max; ++i) { - const int64_t t0 = wsp_ggml_time_us(); + const int64_t t0 = ggml_time_us(); - wsp_ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_helper(work, &gf, n_threads); - const int64_t t1 = wsp_ggml_time_us(); + const int64_t t1 = ggml_time_us(); tsum += (t1 - t0)*1e-6; n++; @@ -5180,7 +5489,7 @@ WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) { } } - wsp_ggml_free(ctx0); + ggml_free(ctx0); s = ((2.0*N*N*N*n)/tsum)*1e-9; } @@ -5509,4 +5818,4 @@ static void whisper_exp_compute_token_level_timestamps( void whisper_set_log_callback(whisper_log_callback callback) { whisper_log = callback; -} +} \ No newline at end of file diff --git a/cpp/whisper.h b/cpp/whisper.h index a5e4936..9b2a022 100644 --- a/cpp/whisper.h +++ b/cpp/whisper.h @@ -516,8 +516,8 @@ extern "C" { WHISPER_API int whisper_bench_memcpy (int n_threads); WHISPER_API const char * whisper_bench_memcpy_str (int n_threads); - WHISPER_API int whisper_bench_wsp_ggml_mul_mat (int n_threads); - WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads); + WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads); + WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads); // Control logging output; default behavior is to print to stderr @@ -528,4 +528,4 @@ extern "C" { } #endif -#endif +#endif \ No newline at end of file diff --git a/docs/API/README.md b/docs/API/README.md index f276dbb..d9f12df 100644 --- a/docs/API/README.md +++ b/docs/API/README.md @@ -82,6 +82,7 @@ ___ | `prompt?` | `string` | Initial Prompt | | `speedUp?` | `boolean` | Speed up audio by x2 (reduced accuracy) | | `temperature?` | `number` | Tnitial decoding temperature | +| `tdrzEnable?` | `boolean` | Enable tinydiarize https://github.com/ggerganov/whisper.cpp/pull/1058 | | `temperatureInc?` | `number` | - | | `tokenTimestamps?` | `boolean` | Enable token-level timestamps | | `translate?` | `boolean` | Translate from source language to english (Default: false) | diff --git a/ios/RNWhisperContext.mm b/ios/RNWhisperContext.mm index 6401fd6..0f9aa12 100644 --- a/ios/RNWhisperContext.mm +++ b/ios/RNWhisperContext.mm @@ -381,6 +381,7 @@ - (struct whisper_full_params)getParams:(NSDictionary *)options jobId:(int)jobId params.print_special = false; params.speed_up = options[@"speedUp"] != nil ? [options[@"speedUp"] boolValue] : false; params.translate = options[@"translate"] != nil ? [options[@"translate"] boolValue] : false; + params.tdrz_enable = options[@"tdrzEnable"] != nil ? 
[options[@"tdrzEnable"] boolValue] : false; params.language = options[@"language"] != nil ? [options[@"language"] UTF8String] : "auto"; params.n_threads = n_threads > 0 ? n_threads : default_n_threads; params.offset_ms = 0; diff --git a/src/NativeRNWhisper.ts b/src/NativeRNWhisper.ts index b7992a1..68175f6 100644 --- a/src/NativeRNWhisper.ts +++ b/src/NativeRNWhisper.ts @@ -30,6 +30,8 @@ export type TranscribeOptions = { bestOf?: number, /** Speed up audio by x2 (reduced accuracy) */ speedUp?: boolean, + /** Enable tinydiarize (https://github.com/ggerganov/whisper.cpp/pull/1058) */ + tdrzEnable?: boolean, /** Initial Prompt */ prompt?: string, }