diff --git a/tests/integration/defs/accuracy/accuracy_core.py b/tests/integration/defs/accuracy/accuracy_core.py
index 16d27f474c5..0c03a8422a8 100644
--- a/tests/integration/defs/accuracy/accuracy_core.py
+++ b/tests/integration/defs/accuracy/accuracy_core.py
@@ -450,90 +450,11 @@ class LongBenchV2(AccuracyTask):
     EVALUATOR_KWARGS = dict(
         dataset_path=DATASET_DIR,
         length="medium",
-        max_len=1280000,
+        max_len=120000,
         apply_chat_template=True,
         random_seed=0,
     )
 
-    @staticmethod
-    def create_modified_model_dir(original_model_dir: str,
-                                  max_position_embeddings: int = 1280000,
-                                  model_max_length: int = 1280000) -> str:
-        """
-        Create temporary directory with modified config files for long context evaluation.
-
-        This method creates a temporary directory with symlinks to all model files except
-        config files, which are copied and modified to support longer context lengths.
-        This is useful for evaluating models on long context tasks that exceed the
-        original model's max_position_embeddings.
-
-        Args:
-            original_model_dir: Path to the original model directory
-            max_position_embeddings: New value for max_position_embeddings in config.json
-            model_max_length: New value for model_max_length in tokenizer_config.json
-
-        Returns:
-            Path to the temporary modified model directory
-
-        Note:
-            The caller is responsible for cleaning up the temporary directory after use.
-        """
-        import tempfile
-
-        # Create temporary model directory with symlinks
-        temp_dir = tempfile.mkdtemp(prefix="longbench_v2_modified_model_")
-        logger.info(f"Created temporary model directory: {temp_dir}")
-
-        # Create symlinks for all files except config files
-        for item in os.listdir(original_model_dir):
-            src = os.path.join(original_model_dir, item)
-            dst = os.path.join(temp_dir, item)
-
-            # Skip config files - will handle them separately
-            if item in ["config.json", "tokenizer_config.json"]:
-                continue
-
-            # Create symlink for other files/directories
-            os.symlink(src, dst)
-            logger.info(f"  Symlinked: {item}")
-
-        # Modify and copy config.json
-        config_src = os.path.join(original_model_dir, "config.json")
-        config_dst = os.path.join(temp_dir, "config.json")
-        if os.path.exists(config_src):
-            with open(config_src, 'r', encoding='utf-8') as f:
-                config = json.load(f)
-
-            # Modify max_position_embeddings
-            original_max_pos = config.get('max_position_embeddings')
-            config['max_position_embeddings'] = max_position_embeddings
-            logger.info(
-                f"  Modified config.json: max_position_embeddings {original_max_pos} -> {max_position_embeddings}"
-            )
-
-            with open(config_dst, 'w', encoding='utf-8') as f:
-                json.dump(config, f, indent=2, ensure_ascii=False)
-
-        # Modify and copy tokenizer_config.json
-        tokenizer_config_src = os.path.join(original_model_dir,
-                                            "tokenizer_config.json")
-        tokenizer_config_dst = os.path.join(temp_dir, "tokenizer_config.json")
-        if os.path.exists(tokenizer_config_src):
-            with open(tokenizer_config_src, 'r', encoding='utf-8') as f:
-                tokenizer_config = json.load(f)
-
-            # Modify model_max_length
-            original_max_len = tokenizer_config.get('model_max_length')
-            tokenizer_config['model_max_length'] = model_max_length
-            logger.info(
-                f"  Modified tokenizer_config.json: model_max_length {original_max_len} -> {model_max_length}"
-            )
-
-            with open(tokenizer_config_dst, 'w', encoding='utf-8') as f:
-                json.dump(tokenizer_config, f, indent=2, ensure_ascii=False)
-
-        return temp_dir
-
 
 class CliFlowAccuracyTestHarness:
     # Model
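Note: the helper removed above was a workaround that symlinked the model directory and rewrote max_position_embeddings / model_max_length in copied config files so that LongBench-V2 prompts would fit the advertised context window. The test changes below replace it with sampling-time prompt truncation, which needs no on-disk config edits. A minimal sketch of the new pattern, assuming the tensorrt_llm LLM API these tests use (the model path and parallelism are illustrative):

    from tensorrt_llm import LLM, SamplingParams

    # Clip over-long prompts at sampling time instead of inflating the
    # context window in config.json / tokenizer_config.json.
    sampling_params = SamplingParams(
        max_tokens=32000,               # generation budget per request
        truncate_prompt_tokens=128000,  # prompts longer than this get truncated
    )

    with LLM("/path/to/DeepSeek-R1-0528", tensor_parallel_size=8) as llm:
        outputs = llm.generate(["<long LongBench-V2 prompt>"],
                               sampling_params=sampling_params)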
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index af38a021c74..4afc0d26bf6 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -4167,103 +4167,88 @@ class TestDeepSeekR1LongBenchV2(LlmapiAccuracyTestHarness):
 
     @pytest.mark.skip_less_mpi_world_size(8)
     def test_fp8_8gpus(self):
-        original_model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528"
-        if not os.path.exists(original_model_dir):
-            pytest.skip(f"Model directory {original_model_dir} does not exist")
+        model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528"
+        if not os.path.exists(model_dir):
+            pytest.skip(f"Model directory {model_dir} does not exist")
 
-        temp_dir = None
-        try:
-            # Create modified model directory using LongBenchV2 static method
-            # This is a WAR for the fact that the model config is not modified to support long context.
-            # TODO: remove this once the model config is modified to support long context.
-            temp_dir = LongBenchV2.create_modified_model_dir(original_model_dir)
-
-            # Configure model settings
-            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                            enable_block_reuse=True,
-                                            enable_partial_reuse=False,
-                                            dtype="fp8")
-
-            cuda_graph_config = CudaGraphConfig(enable_padding=True,
-                                                max_batch_size=32)
+        # Configure model settings
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
+                                        enable_block_reuse=True,
+                                        enable_partial_reuse=False,
+                                        dtype="fp8")
 
-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
+        cuda_graph_config = CudaGraphConfig(enable_padding=True,
+                                            max_batch_size=32)
 
-            moe_config = MoeConfig(backend='DEEPGEMM', max_num_tokens=32000)
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
 
-            pytorch_config = dict(cuda_graph_config=cuda_graph_config,
-                                  kv_cache_config=kv_cache_config,
-                                  speculative_config=mtp_config,
-                                  moe_config=moe_config,
-                                  enable_chunked_prefill=True,
-                                  enable_autotuner=True)
+        moe_config = MoeConfig(backend='DEEPGEMM', max_num_tokens=32000)
 
-            # Create LLM instance and evaluate
-            with LLM(temp_dir,
-                     tensor_parallel_size=8,
-                     moe_expert_parallel_size=8,
-                     max_num_tokens=32000,
-                     max_batch_size=32,
-                     **pytorch_config) as llm:
+        pytorch_config = dict(cuda_graph_config=cuda_graph_config,
+                              kv_cache_config=kv_cache_config,
+                              speculative_config=mtp_config,
+                              moe_config=moe_config,
+                              enable_chunked_prefill=True,
+                              enable_autotuner=True)
 
-                task = LongBenchV2(self.MODEL_NAME)
+        # Create LLM instance and evaluate
+        with LLM(model_dir,
+                 tensor_parallel_size=8,
+                 moe_expert_parallel_size=8,
+                 max_num_tokens=32000,
+                 max_batch_size=32,
+                 **pytorch_config) as llm:
 
-                sampling_params = SamplingParams(max_tokens=32000)
+            task = LongBenchV2(self.MODEL_NAME)
 
-                task.evaluate(llm, sampling_params=sampling_params)
+            sampling_params = SamplingParams(
+                max_tokens=32000,
+                truncate_prompt_tokens=128000,
+            )
 
-        finally:
-            # Cleanup temporary files
-            if temp_dir and os.path.exists(temp_dir):
-                import shutil
-                shutil.rmtree(temp_dir, ignore_errors=True)
+            task.evaluate(llm, sampling_params=sampling_params)
 
     @pytest.mark.skip_less_mpi_world_size(4)
     def test_nvfp4_4gpus(self):
-        original_model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528-FP4"
-        temp_dir = None
-        try:
-            # Create modified model directory using LongBenchV2 static method
-            temp_dir = LongBenchV2.create_modified_model_dir(original_model_dir)
-
-            # Configure model settings (no MOE config for FP4 version)
-            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
-                                            enable_block_reuse=True,
-                                            enable_partial_reuse=False,
-                                            dtype="fp8")
-
-            cuda_graph_config = CudaGraphConfig(enable_padding=True,
-                                                max_batch_size=32)
-
-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
-
-            pytorch_config = dict(cuda_graph_config=cuda_graph_config,
-                                  kv_cache_config=kv_cache_config,
-                                  speculative_config=mtp_config,
-                                  enable_chunked_prefill=True,
-                                  enable_autotuner=True)
-
-            # Create LLM instance and evaluate
-            with LLM(temp_dir,
-                     tensor_parallel_size=4,
-                     moe_expert_parallel_size=4,
-                     max_num_tokens=32000,
-                     max_batch_size=32,
-                     **pytorch_config) as llm:
+        model_dir = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528-FP4"
+        if not os.path.exists(model_dir):
+            pytest.skip(f"Model directory {model_dir} does not exist")
 
-                assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        # Configure model settings (no MOE config for FP4 version)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
+                                        enable_block_reuse=True,
+                                        enable_partial_reuse=False,
+                                        dtype="fp8")
+
+        cuda_graph_config = CudaGraphConfig(enable_padding=True,
+                                            max_batch_size=32)
 
-                task = LongBenchV2(self.MODEL_NAME)
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3)
 
-                sampling_params = SamplingParams(max_tokens=32000)
+        pytorch_config = dict(cuda_graph_config=cuda_graph_config,
+                              kv_cache_config=kv_cache_config,
+                              speculative_config=mtp_config,
+                              enable_chunked_prefill=True,
+                              enable_autotuner=True)
 
-                task.evaluate(llm, sampling_params=sampling_params)
+        # Create LLM instance and evaluate
+        with LLM(model_dir,
+                 tensor_parallel_size=4,
+                 moe_expert_parallel_size=4,
+                 max_num_tokens=32000,
+                 max_batch_size=32,
+                 **pytorch_config) as llm:
 
-        finally:
-            # Cleanup temporary files
-            if temp_dir and os.path.exists(temp_dir):
-                import shutil
-                shutil.rmtree(temp_dir, ignore_errors=True)
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+
+            task = LongBenchV2(self.MODEL_NAME)
+
+            sampling_params = SamplingParams(
+                max_tokens=32000,
+                truncate_prompt_tokens=128000,
+            )
+
+            task.evaluate(llm, sampling_params=sampling_params)
 
 
 class TestStarcoder2_3B(LlmapiAccuracyTestHarness):
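Note: the token budgets above fit together as follows (a sketch with illustrative names, not code from this diff). The evaluator's max_len=120000 keeps dataset inputs below the 128000-token truncation cap, and enable_chunked_prefill=True lets the engine prefill such prompts even though max_num_tokens is only 32000 per iteration:

    # Budget relationships implied by the settings above; names are illustrative.
    evaluator_max_len = 120_000       # LongBenchV2 EVALUATOR_KWARGS["max_len"]
    truncate_prompt_tokens = 128_000  # SamplingParams prompt cap
    max_num_tokens = 32_000           # per-iteration token budget of the engine

    # Evaluator inputs already fit under the cap, so truncation only catches
    # overhead such as the applied chat template.
    assert evaluator_max_len <= truncate_prompt_tokens

    # Chunked prefill processes a fully truncated prompt in
    # ceil(128000 / 32000) = 4 chunks, each within max_num_tokens.
    assert -(-truncate_prompt_tokens // max_num_tokens) == 4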