From a181bd5ac07c85ed794c074facfe8f4ac6afac44 Mon Sep 17 00:00:00 2001
From: Ruihang Lai <ruihangl@cs.cmu.edu>
Date: Thu, 18 May 2023 15:13:39 -0700
Subject: [PATCH] GPT-NeoX allocating full-length KV cache (#179)

This PR changes the GPT-NeoX KV cache creation function to allocate the
cache at its full length (config.max_sequence_length) up front, so that
no further memory allocation is required while the model is running.
---
 mlc_llm/relax_model/gpt_neox.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlc_llm/relax_model/gpt_neox.py b/mlc_llm/relax_model/gpt_neox.py
index b4968a714b..258c194edb 100644
--- a/mlc_llm/relax_model/gpt_neox.py
+++ b/mlc_llm/relax_model/gpt_neox.py
@@ -593,7 +593,7 @@ def create_kv_cache_func(
 ) -> None:
     init_shape = relax.ShapeExpr(
         (
-            1,
+            config.max_sequence_length,
             config.num_attention_heads,
             config.hidden_size // config.num_attention_heads,
         )