In [1]:
import os
from dotenv import load_dotenv
load_dotenv("WSDM-Cup-Multilingual-Chatbot-Arena-Kaggle/.env")

True

In [2]:
models = [
    "Qwen/Qwen2-7B-Instruct", # pass
    # "microsoft/Phi-3-small-8k-instruct", # fail (unpack error)
    "01-ai/Yi-1.5-9B-Chat", # pass
    "Qwen/Qwen1.5-14B-Chat-AWQ", # pass
    # "meta-llama/Llama-3.2-1B", # fail (chat template issue)
    "meta-llama/Llama-3.2-3B-Instruct", # pass
    "meta-llama/Llama-3.1-8B-Instruct", # pass
    "Qwen/Qwen2-1.5B-Instruct", # pass
    # "alpaca-13b", # fail
    "mistralai/Mistral-7B-Instruct-v0.2", # pass
    "mistralai/Mistral-7B-Instruct-v0.3", # pass
    "microsoft/Phi-3-mini-4k-instruct", # pass
    "HuggingFaceTB/SmolLM-1.7B-Instruct", # pass
    "google/gemma-2-2b-it", # pass
    "google/gemma-2-9b-it", # pass
    "Qwen/Qwen2.5-3B-Instruct", # pass
    "Qwen/Qwen2.5-7B-Instruct", # pass
    "Qwen/Qwen2.5-14B-Instruct-AWQ", # pass
    'Nexusflow/Starling-LM-7B-beta', # to check
    'Qwen/Qwen1.5-14B-Chat-AWQ', # to check
    'meta-llama/Meta-Llama-3-8B-Instruct', # to check
    # "Qwen/Qwen2.5-32B-Instruct-AWQ" # see if A100 can run this, OOM on A5000
    # "google/gemma-2-27b-it", # see if A100 can run this, OOM on A5000
]

from vllm import LLM, SamplingParams

In [3]:
import pickle
import torch
import gc
from datasets import load_dataset
with open("vllm_inference_ids.pkl", "rb") as f:
    ignore_ids = pickle.load(f)

ds = load_dataset("lmsys/lmsys-chat-1m", split="train").select(range(10_000))
ds = ds.filter(lambda example: example["language"] != "English", num_proc=8)
ds = ds.filter(lambda example: example["turn"] == 1, num_proc=8)
ds = ds.filter(lambda x: x["conversation_id"] not in ignore_ids, batch_size=1000,num_proc=8)

In [4]:
import random

def select_model(example):
    return {'gen_model': random.choice(models)}

In [5]:
ds = ds.map(select_model, num_proc=8)

In [6]:
def get_prompt(example):
    return {"prompt": next(msg["content"] for msg in example["conversation"] if msg["role"] == "user")}
ds = ds.map(get_prompt, num_proc=8)

In [None]:
if __name__ == "__main__":
    all_ds = []
    for model in models:
        # load the vllm instance here

        max_model_len = 8192
        if model in  ["01-ai/Yi-1.5-9B-Chat", "microsoft/Phi-3-mini-4k-instruct"] :
            max_model_len = 4096
        elif model == "HuggingFaceTB/SmolLM-1.7B-Instruct":
            max_model_len = 2048
        print(f"model: {model}")
        if model == "unsloth/gemma-2-27b-it-bnb-4bit":
            llm = LLM(
            model=model,
            dtype="auto",
            trust_remote_code=True,
            load_format="bitsandbytes",
            quantization="bitsandbytes", 
            seed=0,
            gpu_memory_utilization=0.95,
            max_model_len=max_model_len)
        else:    
            llm = LLM(
                model=model,
                dtype="auto",
                trust_remote_code=True,
                quantization="awq" if "awq" in model.lower() else None, 
                seed=0,
                gpu_memory_utilization=0.95,
                max_model_len=max_model_len
            )
        sampling_params = SamplingParams(n=1, temperature=0.7, max_tokens=4096, seed=0, top_p=0.9)
    
        # filter on gen_model
        model_ds = ds.filter(lambda x: x["gen_model"] == model, num_proc=8)
        print(f"Processing {len(model_ds)} samples for {model}")
    
        conversations = []
        for d in model_ds:
            conversations.append([
            {
                "role": "user",
                "content": d["prompt"]
            },
        ])
    
        outputs = llm.chat(conversations, sampling_params=sampling_params)
        responses = []
        for o in outputs:
            response = o.outputs[0].text.strip()
            responses.append(response)
        model_ds = model_ds.add_column(name="generated_response", column=responses)
        all_ds.append(model_ds)
    
        del llm
        torch.cuda.empty_cache()
        gc.collect()

model: Qwen/Qwen2-7B-Instruct
INFO 01-08 06:32:18 config.py:510] This model supports multiple tasks: {'classify', 'score', 'reward', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 01-08 06:32:18 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='Qwen/Qwen2-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 01-08 06:33:03 model_runner.py:1099] Loading model weights took 14.2487 GB
INFO 01-08 06:33:05 worker.py:241] Memory profiling takes 1.72 seconds
INFO 01-08 06:33:05 worker.py:241] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.95) = 22.50GiB
INFO 01-08 06:33:05 worker.py:241] model weights take 14.25GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.44GiB; the rest of the memory reserved for KV Cache is 6.71GiB.
INFO 01-08 06:33:05 gpu_executor.py:76] # GPU blocks: 7852, # CPU blocks: 4681
INFO 01-08 06:33:05 gpu_executor.py:80] Maximum concurrency for 8192 tokens per request: 15.34x
INFO 01-08 06:33:10 model_runner.py:1415] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utiliza

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.24it/s]

INFO 01-08 06:33:26 model_runner.py:1535] Graph capturing finished in 16 secs, took 0.20 GiB
INFO 01-08 06:33:26 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 22.52 seconds
Processing 65 samples for Qwen/Qwen2-7B-Instruct
INFO 01-08 06:33:26 chat_utils.py:333] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.



Processed prompts: 100%|██████████| 65/65 [00:44<00:00,  1.45it/s, est. speed input: 121.73 toks/s, output: 383.06 toks/s]


model: 01-ai/Yi-1.5-9B-Chat
INFO 01-08 06:34:19 config.py:510] This model supports multiple tasks: {'classify', 'score', 'reward', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 01-08 06:34:19 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='01-ai/Yi-1.5-9B-Chat', speculative_config=None, tokenizer='01-ai/Yi-1.5-9B-Chat', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 01-08 06:34:25 model_runner.py:1099] Loading model weights took 16.4470 GB
INFO 01-08 06:34:26 worker.py:241] Memory profiling takes 1.01 seconds
INFO 01-08 06:34:26 worker.py:241] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.95) = 22.50GiB
INFO 01-08 06:34:26 worker.py:241] model weights take 16.45GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.61GiB; the rest of the memory reserved for KV Cache is 5.43GiB.
INFO 01-08 06:34:26 gpu_executor.py:76] # GPU blocks: 3709, # CPU blocks: 2730
INFO 01-08 06:34:26 gpu_executor.py:80] Maximum concurrency for 4096 tokens per request: 14.49x
INFO 01-08 06:34:32 model_runner.py:1415] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utiliza

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:16<00:00,  2.11it/s]

INFO 01-08 06:34:49 model_runner.py:1535] Graph capturing finished in 17 secs, took 0.14 GiB
INFO 01-08 06:34:49 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 24.11 seconds
Processing 52 samples for 01-ai/Yi-1.5-9B-Chat
INFO 01-08 06:34:49 chat_utils.py:333] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.



Processed prompts: 100%|██████████| 52/52 [01:32<00:00,  1.77s/it, est. speed input: 111.91 toks/s, output: 329.71 toks/s]


model: Qwen/Qwen1.5-14B-Chat-AWQ
INFO 01-08 06:36:24 config.py:510] This model supports multiple tasks: {'classify', 'score', 'reward', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 01-08 06:36:24 awq_marlin.py:113] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference
INFO 01-08 06:36:24 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='Qwen/Qwen1.5-14B-Chat-AWQ', speculative_config=None, tokenizer='Qwen/Qwen1.5-14B-Chat-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 01-08 06:36:28 model_runner.py:1099] Loading model weights took 9.0681 GB
INFO 01-08 06:36:31 worker.py:241] Memory profiling takes 3.07 seconds
INFO 01-08 06:36:31 worker.py:241] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.95) = 22.50GiB
INFO 01-08 06:36:31 worker.py:241] model weights take 9.07GiB; non_torch_memory takes 0.01GiB; PyTorch activation peak memory takes 1.46GiB; the rest of the memory reserved for KV Cache is 11.96GiB.
INFO 01-08 06:36:32 gpu_executor.py:76] # GPU blocks: 980, # CPU blocks: 327
INFO 01-08 06:36:32 gpu_executor.py:80] Maximum concurrency for 8192 tokens per request: 1.91x
INFO 01-08 06:36:32 model_runner.py:1415] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:22<00:00,  1.55it/s]

INFO 01-08 06:36:54 model_runner.py:1535] Graph capturing finished in 23 secs, took 0.48 GiB
INFO 01-08 06:36:54 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 26.04 seconds
Processing 56 samples for Qwen/Qwen1.5-14B-Chat-AWQ
INFO 01-08 06:36:54 chat_utils.py:333] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.



Processed prompts: 100%|██████████| 56/56 [00:41<00:00,  1.35it/s, est. speed input: 125.95 toks/s, output: 342.22 toks/s]


model: meta-llama/Llama-3.2-3B-Instruct
INFO 01-08 06:37:40 config.py:510] This model supports multiple tasks: {'classify', 'score', 'reward', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 01-08 06:37:40 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='meta-llama/Llama-3.2-3B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execut

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 01-08 06:37:43 model_runner.py:1099] Loading model weights took 6.0160 GB
INFO 01-08 06:37:44 worker.py:241] Memory profiling takes 0.79 seconds
INFO 01-08 06:37:44 worker.py:241] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.95) = 22.50GiB
INFO 01-08 06:37:44 worker.py:241] model weights take 6.02GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 1.21GiB; the rest of the memory reserved for KV Cache is 15.24GiB.
INFO 01-08 06:37:44 gpu_executor.py:76] # GPU blocks: 8917, # CPU blocks: 2340
INFO 01-08 06:37:44 gpu_executor.py:80] Maximum concurrency for 8192 tokens per request: 17.42x
INFO 01-08 06:37:44 model_runner.py:1415] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilizat

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:14<00:00,  2.40it/s]

INFO 01-08 06:37:59 model_runner.py:1535] Graph capturing finished in 15 secs, took 0.05 GiB
INFO 01-08 06:37:59 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 15.72 seconds
Processing 57 samples for meta-llama/Llama-3.2-3B-Instruct
INFO 01-08 06:37:59 chat_utils.py:333] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.



Processed prompts: 100%|██████████| 57/57 [00:17<00:00,  3.24it/s, est. speed input: 175.30 toks/s, output: 705.74 toks/s] 


model: meta-llama/Llama-3.1-8B-Instruct
INFO 01-08 06:38:20 config.py:510] This model supports multiple tasks: {'classify', 'score', 'reward', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 01-08 06:38:20 llm_engine.py:234] Initializing an LLM engine (v0.6.6.post1) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execut

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


In [None]:
from datasets import concatenate_datasets
final_ds = concatenate_datasets(all_ds)

In [None]:
final_df = []
for f in final_ds:
    # Randomly decide which response goes to A and B
    orig_response = next(msg["content"] for msg in f["conversation"] if msg["role"] == "assistant")
    if random.random() < 0.5:
        response_a = f["generated_response"]
        response_b = orig_response
        model_a = f["gen_model"]
        model_b = f["model"]
    else:
        response_a = orig_response
        response_b = f["generated_response"]
        model_a = f["model"]
        model_b = f["gen_model"]
        
    final_df.append({
        "id": f["conversation_id"],
        "prompt": f["prompt"],
        "response_a": response_a,
        "response_b": response_b,
        "model_a": model_a,
        "model_b": model_b,
        "language": f["language"]
    })

In [None]:
import pandas as pd
final_df = pd.DataFrame(final_df)

In [None]:
final_df