In [1]:
import csv
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import islice
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from datasets import load_dataset

SEED = 1337
random.seed(SEED)

INFO 03-03 00:07:33 __init__.py:190] Automatically detected platform cuda.


In [None]:
dataset = load_dataset("google-research-datasets/natural_questions")
dataset

Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/235 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'question', 'long_answer_candidates', 'annotations'],
        num_rows: 307373
    })
    validation: Dataset({
        features: ['id', 'document', 'question', 'long_answer_candidates', 'annotations'],
        num_rows: 7830
    })
})

In [2]:
path = "../../data/data_raw/natural_questions.csv"
batch_size = 512

In [None]:
# with open(path, mode='w', newline='', encoding='utf-8') as file:
#     writer = csv.writer(file)
#     writer.writerow(["document", "question", "answer"])

In [None]:
documents, questions, answers = [], [], []

for item in tqdm(dataset["train"]):
    idx = np.random.randint(len(item["long_answer_candidates"]["start_token"]))
    start = item["long_answer_candidates"]["start_token"][idx]
    end = item["long_answer_candidates"]["end_token"][idx]
    tokens = item["document"]["tokens"]

    question = " ".join(token for token in item["question"]["tokens"])

    ans = tokens["token"][start:end]
    ans_is_html = tokens["is_html"][start:end]
    ans = " ".join([token for token, html in zip(ans, ans_is_html) if not html])

    doc_is_html = tokens["is_html"]
    document = " ".join([token for token, html in zip(tokens["token"], doc_is_html) if not html])

    documents.append(document)
    questions.append(question)
    answers.append(ans)
    if len(documents) == batch_size:
        with open(path, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            for document, question, answer in zip(documents, questions, answers):
                writer.writerow([document, question, answer])

        documents, questions, answers = [], [], []

 71%|███████▏  | 219702/307373 [1:06:07<26:36, 54.92it/s]  

In [None]:
len(documents), len(questions), len(answers)

(173, 173, 173)

In [None]:
with open(path, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    for document, question, answer in zip(documents, questions, answers):
        writer.writerow([document, question, answer])

In [None]:
documents, questions, answers = [], [], []

for item in tqdm(dataset["validation"]):
    idx = np.random.randint(len(item["long_answer_candidates"]["start_token"]))
    start = item["long_answer_candidates"]["start_token"][idx]
    end = item["long_answer_candidates"]["end_token"][idx]
    tokens = item["document"]["tokens"]

    question = " ".join(token for token in item["question"]["tokens"])

    ans = tokens["token"][start:end]
    ans_is_html = tokens["is_html"][start:end]
    ans = " ".join([token for token, html in zip(ans, ans_is_html) if not html])

    doc_is_html = tokens["is_html"]
    document = " ".join([token for token, html in zip(tokens["token"], doc_is_html) if not html])

    documents.append(document)
    questions.append(question)
    answers.append(ans)
    if len(documents) == batch_size:
        with open(path, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            for document, question, answer in zip(documents, questions, answers):
                writer.writerow([document, question, answer])

        documents, questions, answers = [], [], []

100%|██████████| 7830/7830 [03:07<00:00, 41.79it/s] 


In [None]:
len(documents), len(questions), len(answers)

(150, 150, 150)

In [None]:
with open(path, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    for document, question, answer in zip(documents, questions, answers):
        writer.writerow([document, question, answer])

In [3]:
def batchify(iterable, batch_size):
    """Splits an iterable into smaller batches."""
    iterable = iter(iterable)
    while batch := list(islice(iterable, batch_size)):
        yield batch

def save_to_csv(path, prompts, responses, temperature, top_p, top_k):
    """Saves prompts, responses and sampling parameters to a CSV file."""
    with open(path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for prompt, response in zip(prompts, responses):
            writer.writerow([prompt, response, temperature, top_p, top_k])

def generate_responses(model, prompts, sampling_params):
    """Generate a batch of outputs using vLLM with customizable sampling parameters."""
    outputs = model.chat(prompts, sampling_params=sampling_params, use_tqdm=False)
    
    return [sample.outputs[0].text.replace('"', '') for sample in outputs]

In [4]:
BASE_PROMPT = [{"role": "system", "content": "You are a helpful assistant for answering questions based on provided context. Based on provided context, which will be in a form of a copy of wikipedia article, answer the question. MAKE SURE TO REPLAY ONLY WITH THE ANSWER."},
                {"role": "user", "content": "Context: \n {context} \n Question: {question}"},
                {"role": "assistant", "content": "Answer: \n"}]

In [5]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

In [12]:
tl1, tl2, tl3, tl4, tl5 = [], [], [], [], []
for df in pd.read_csv(path, chunksize=10_000):
    prompts = [
    [
        BASE_PROMPT[0],  # The system message
        {"role": "user", "content": BASE_PROMPT[1]["content"].format(context=context, question=question)},  # Formatted user message
        BASE_PROMPT[2]  # The assistant message
    ]
    for context, question in df[["document", "question"]].values
    ]
    lens = []
    batch_size = 128
    for prompts_batch in tqdm(batchify(prompts, batch_size), total=len(prompts) // batch_size):
        tokens = tokenizer.apply_chat_template(prompts_batch)
        lens.extend([len(token) for token in tokens])
    too_large1 = [i for i, l in enumerate(lens) if l > 32_768]
    too_large2 = [i for i, l in enumerate(lens) if l > 32_768 * 2]
    too_large3 = [i for i, l in enumerate(lens) if l > 32_768 * 3]
    too_large4 = [i for i, l in enumerate(lens) if l > 32_768 * 1/2]
    too_large5 = [i for i, l in enumerate(lens) if l > 32_768 * 1/4]

    tl1.extend(too_large1)
    tl2.extend(too_large2)
    tl3.extend(too_large3)
    tl4.extend(too_large4)
    tl5.extend(too_large5)

79it [04:26,  3.38s/it]                        
79it [04:22,  3.33s/it]                        
79it [04:21,  3.30s/it]                        
79it [03:55,  2.98s/it]                        
79it [03:48,  2.89s/it]                        
79it [04:17,  3.26s/it]                        
79it [04:19,  3.28s/it]                        
79it [04:16,  3.25s/it]                        
79it [04:26,  3.38s/it]                        
79it [03:45,  2.85s/it]                        
79it [04:24,  3.35s/it]                        
79it [04:28,  3.40s/it]                        
79it [04:35,  3.49s/it]                        
79it [04:33,  3.47s/it]                        
79it [04:27,  3.39s/it]                        
79it [04:10,  3.17s/it]                        
79it [03:53,  2.96s/it]                        
79it [04:05,  3.11s/it]                        
79it [03:53,  2.95s/it]                        
79it [03:53,  2.96s/it]                        
79it [02:57,  2.25s/it]                 

In [13]:
len(tl1), len(tl2), len(tl3), len(tl4), len(tl5)

(9038, 368, 14, 53149, 126596)

In [None]:
df.drop(["document", "question"], axis=1, inplace=True)
df.to_csv("../../data/data_human/natural_questions.csv", index=False)

In [None]:
sampling_params = [
    SamplingParams(temperature=0.0, top_p=1.0, top_k=-1, max_tokens=30_000, seed=SEED),  # Pure Greedy (fully deterministic)
    SamplingParams(temperature=0.2, top_p=1.0, top_k=-1, max_tokens=30_000, seed=SEED),  # Highly Deterministic
    SamplingParams(temperature=0.5, top_p=0.95, top_k=100, max_tokens=30_000, seed=SEED), # Mildly Deterministic but Flexible
    SamplingParams(temperature=0.7, top_p=0.9, top_k=50, max_tokens=30_000, seed=SEED),  # Balanced and Natural
    SamplingParams(temperature=0.9, top_p=0.8, top_k=40, max_tokens=30_000, seed=SEED),  # Slightly More Diverse but Coherent
    SamplingParams(temperature=1.0, top_p=0.95, top_k=30, max_tokens=30_000, seed=SEED), # Default Creative Mode
    SamplingParams(temperature=1.2, top_p=0.7, top_k=20, max_tokens=30_000, seed=SEED),  # Highly Creative
]

In [None]:
llms = ["meta-llama/Llama-3.2-1B-Instruct"]
batch_size = 8
base_path = "../../data/data_ai/natural_questions/natural-questions_"

In [None]:
for llm in llms:
    model = LLM(model=llm, dtype="half", max_model_len = 10_000)
    csv_path = f"{base_path}{llm.split('/')[-1]}.csv"


    # init csv file
    with open(csv_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["prompt", "response", "temperature", "top_p", "top_k"])

    cnt = 0
    for prompts_batch in tqdm(batchify(prompts, batch_size), total=len(prompts) // batch_size):
        params = random.choice(sampling_params)
        responses = generate_responses(model, prompts_batch, params)
        save_to_csv(csv_path, prompts_batch, responses, params.temperature, params.top_p, params.top_k)
        cnt += 1
        if cnt > 2:
            break

INFO 02-15 19:39:43 config.py:542] This model supports multiple tasks: {'embed', 'score', 'generate', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 02-15 19:39:43 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='meta-llama/Llama-3.2-1B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.2-1B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=10000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama/Llama-3.2

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-15 19:40:13 model_runner.py:1115] Loading model weights took 2.3185 GB
INFO 02-15 19:40:16 worker.py:267] Memory profiling takes 2.28 seconds
INFO 02-15 19:40:16 worker.py:267] the current vLLM instance can use total_gpu_memory (6.00GiB) x gpu_memory_utilization (0.90) = 5.40GiB
INFO 02-15 19:40:16 worker.py:267] model weights take 2.32GiB; non_torch_memory takes 0.04GiB; PyTorch activation peak memory takes 1.21GiB; the rest of the memory reserved for KV Cache is 1.84GiB.
INFO 02-15 19:40:16 executor_base.py:110] # CUDA blocks: 3761, # CPU blocks: 8192
INFO 02-15 19:40:16 executor_base.py:115] Maximum concurrency for 10000 tokens per request: 6.02x
INFO 02-15 19:41:00 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utiliz

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:32<00:00,  1.07it/s]

INFO 02-15 19:41:33 model_runner.py:1562] Graph capturing finished in 33 secs, took 0.12 GiB
INFO 02-15 19:41:33 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 79.90 seconds



  0%|          | 0/28338 [00:00<?, ?it/s]

INFO 02-15 19:41:33 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


  0%|          | 2/28338 [00:13<52:26:24,  6.66s/it]


In [None]:
# df = pd.read_csv("../../data/data_ai/natural_questions/natural-questions_Llama-3.2-1B-Instruct.csv")
# df.head()

Unnamed: 0,prompt,response,temperature,top_p,top_k
0,"[{'role': 'system', 'content': 'You are a help...",**Clean-up Operations Continue After Storm Fra...,1.2,0.7,20
1,"[{'role': 'system', 'content': 'You are a help...",**TRAGEDY STRIKES Belfast City Centre: Tourist...,1.2,0.7,20
2,"[{'role': 'system', 'content': 'You are a help...",LEWIS HAMILTON SENSATION AS HAMILTON TAKES OVE...,1.2,0.7,20
3,"[{'role': 'system', 'content': 'You are a help...",I cannot generate an article that contains exp...,1.2,0.7,20
4,"[{'role': 'system', 'content': 'You are a help...","ISTANBUL, TURKEY - A 35-year-old man who was b...",1.2,0.7,20


In [None]:
# 

In [None]:
# 