From a181bd5ac07c85ed794c074facfe8f4ac6afac44 Mon Sep 17 00:00:00 2001 From: Ruihang Lai Date: Thu, 18 May 2023 15:13:39 -0700 Subject: [PATCH] GPT-NeoX allocating full-length KV cache (#179) This PR changes the GPT-NeoX KV cache creation function to allocate the cache at its full size up front, so that no further memory allocation is required on the fly while running. --- mlc_llm/relax_model/gpt_neox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlc_llm/relax_model/gpt_neox.py b/mlc_llm/relax_model/gpt_neox.py index b4968a714b..258c194edb 100644 --- a/mlc_llm/relax_model/gpt_neox.py +++ b/mlc_llm/relax_model/gpt_neox.py @@ -593,7 +593,7 @@ def create_kv_cache_func( ) -> None: init_shape = relax.ShapeExpr( ( - 1, + config.max_sequence_length, config.num_attention_heads, config.hidden_size // config.num_attention_heads, )