forked from hpcc-systems/hpcc-js-wasm
Commit c27ccfc (1 parent: 8d842b3). Showing 12 changed files with 386 additions and 4 deletions.
@@ -1,4 +1,5 @@
add_subdirectory(base91)
add_subdirectory(expat)
add_subdirectory(graphviz)
add_subdirectory(llama)
add_subdirectory(zstd)
@@ -0,0 +1,44 @@
project(llamalib)

find_package(Llama CONFIG REQUIRED)

# See: https://github.com/emscripten-core/emscripten/blob/main/src/settings.js
string(REPLACE ";" " " CPP_FLAGS "${EM_CPP_FLAGS}")

set(EM_LINK_FLAGS
    ${EM_LINK_FLAGS}
    "-sEXPORT_NAME='${CMAKE_PROJECT_NAME}'"
    "-sEXPORTED_FUNCTIONS=\"[]\""
    "-sEXPORTED_RUNTIME_METHODS=\"[UTF8ToString]\""
    "--post-js ${CMAKE_CURRENT_BINARY_DIR}/main_glue.js"
)
string(REPLACE ";" " " LINK_FLAGS "${EM_LINK_FLAGS}")

# Generate Glue from IDL file ---
add_custom_command(
    MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/main.idl
    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/main_glue.js ${CMAKE_CURRENT_BINARY_DIR}/main_glue.cpp
    COMMAND python3 ${CMAKE_BINARY_DIR}/../emsdk/upstream/emscripten/tools/webidl_binder.py ${CMAKE_CURRENT_SOURCE_DIR}/main.idl ${CMAKE_CURRENT_BINARY_DIR}/main_glue
)
set_property(SOURCE main.cpp APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/main_glue.cpp)
# --- --- ---

include_directories(
    ${VCPKG_INCLUDE_DIR}
    ${CMAKE_CURRENT_BINARY_DIR}
    ${Llama_DIR}/common
)

add_executable(llamalib
    main.cpp
    ${Llama_DIR}/common/common.cpp
)

set_target_properties(llamalib PROPERTIES COMPILE_FLAGS "${CPP_FLAGS}")
set_target_properties(llamalib PROPERTIES LINK_FLAGS "${LINK_FLAGS}")

target_link_libraries(llamalib
    PRIVATE llama
)

packWasm(llamalib)
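
For orientation, here is a minimal loader sketch in TypeScript (not part of this commit) showing how the emitted llamalib artifacts might be consumed. It assumes the shared EM_LINK_FLAGS defined elsewhere in this repository include -sMODULARIZE, so that the EXPORT_NAME above turns the output into a module factory, and it assumes the artifact is named llamalib.js; both details are assumptions rather than something this diff establishes.

// loader-sketch.ts, hypothetical and for illustration only.
// Assumes -sMODULARIZE in the shared link flags and an output file named llamalib.js.
import loadLlamaLib from "./llamalib.js";

export async function loadModule(): Promise<any> {
    const module = await loadLlamaLib();      // compiles and instantiates the WASM module
    // UTF8ToString is available because of -sEXPORTED_RUNTIME_METHODS above.
    console.log(typeof module.UTF8ToString);  // "function"
    return module;
}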
@@ -0,0 +1,221 @@
#include <stdlib.h>

#include "common.h"
#include "llama.h"

#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

int main(int argc, char **argv)
{
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-')
    {
        printf("usage: %s MODEL_PATH [PROMPT]\n", argv[0]);
        return 1;
    }

    if (argc >= 2)
    {
        params.model = argv[1];
    }

    if (argc >= 3)
    {
        params.prompt = argv[2];
    }

    if (params.prompt.empty())
    {
        params.prompt = "Hello my name is";
    }

    // total length of the sequence including the prompt
    const int n_len = 32;

    // init LLM

    llama_backend_init(params.numa);

    // initialize the model

    llama_model_params model_params = llama_model_default_params();

    // model_params.n_gpu_layers = 99; // offload all layers to the GPU

    llama_model *model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL)
    {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    // initialize the context

    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.seed = 1234;
    ctx_params.n_ctx = 2048;
    ctx_params.n_threads = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

    llama_context *ctx = llama_new_context_with_model(model, ctx_params);

    if (ctx == NULL)
    {
        fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
        return 1;
    }

    // tokenize the prompt

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());

    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx)
    {
        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
        LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
        return 1;
    }

    // print the prompt token-by-token

    fprintf(stderr, "\n");

    for (auto id : tokens_list)
    {
        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

    fflush(stderr);

    // create a llama_batch with size 512
    // we use this object to submit token data for decoding

    llama_batch batch = llama_batch_init(512, 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < tokens_list.size(); i++)
    {
        llama_batch_add(batch, tokens_list[i], i, {0}, false);
    }

    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    if (llama_decode(ctx, batch) != 0)
    {
        LOG_TEE("%s: llama_decode() failed\n", __func__);
        return 1;
    }

    // main loop

    int n_cur = batch.n_tokens;
    int n_decode = 0;

    const auto t_main_start = ggml_time_us();

    while (n_cur <= n_len)
    {
        // sample the next token
        {
            auto n_vocab = llama_n_vocab(model);
            auto *logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

            std::vector<llama_token_data> candidates;
            candidates.reserve(n_vocab);

            for (llama_token token_id = 0; token_id < n_vocab; token_id++)
            {
                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
            }

            llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};

            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

            // is it an end of stream?
            if (new_token_id == llama_token_eos(model) || n_cur == n_len)
            {
                LOG_TEE("\n");

                break;
            }

            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
            fflush(stdout);

            // prepare the next batch
            llama_batch_clear(batch);

            // push this new token for next evaluation
            llama_batch_add(batch, new_token_id, n_cur, {0}, true);

            n_decode += 1;
        }

        n_cur += 1;

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch))
        {
            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }
    }

    LOG_TEE("\n");

    const auto t_main_end = ggml_time_us();

    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

    llama_print_timings(ctx);

    fprintf(stderr, "\n");

    llama_batch_free(batch);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    return 0;
}

struct llama
{
public:
    static const char *version(void)
    {
        return "0.0.1";
    }

    static void *malloc(size_t __size)
    {
        return ::malloc(__size);
    }

    static void free(void *__ptr)
    {
        ::free(__ptr);
    }
};

// Include JS Glue ---
#include "main_glue.cpp"
@@ -0,0 +1,7 @@
interface llama
{
    [Const] static DOMString version();

    static any malloc(unsigned long size);
    static void free(any ptr);
};