# Efficient Dataset Enrichment Using vLLM

This notebook implements an efficient batch inference pipeline using vLLM for dataset enrichment. Key features include:

*   Parallel inference with Qwen2-VL model distributed across 4 GPUs  
*   Large-scale processing with batches of 120 samples
*   Custom vision-language prompting system  
*   Automated JSON output logging with error handling
*   Multi-modal processing pipeline (images and text)        
    
*To use this notebook, you need to:*

1.   Choose your model path (default: local Qwen2-VL)
2.   Select your dataset from Hugging Face hub  
3.   Define your custom prompt based on your use case  

*Note:* While the overall code structure is reusable, **the prompt format must be adapted to your specific model**. The current implementation uses Qwen2-VL's prompt format — please refer to the vLLM documentation for the correct prompt template for your model.

In [None]:
def setup_model(model_path='./model/qwen2', tensor_parallel_size=4, max_num_seqs=5):
    """Load a vLLM model for parallel inference.

    Called without arguments, this loads the local Qwen2-VL checkpoint
    distributed across 4 GPUs, exactly as before; the parameters only exist
    so that other checkpoints or hardware setups do not require editing the
    function body.

    Args:
        model_path: Path (or model identifier) handed to vLLM as ``model``.
        tensor_parallel_size: Number of GPUs the model is distributed across.
        max_num_seqs: Forwarded unchanged to vLLM's ``max_num_seqs`` setting
            (see the vLLM engine arguments documentation for its exact
            semantics).

    Returns:
        The constructed ``LLM`` instance.
    """
    return LLM(
        model=model_path,
        tensor_parallel_size=tensor_parallel_size,
        max_num_seqs=max_num_seqs,
    )

def ensure_rgb_image(image):
    """Return ``image`` in RGB mode.

    ``image`` is expected to behave like a PIL image, i.e. expose a ``mode``
    attribute and a ``convert`` method. An image that is already RGB is
    returned as-is (same object); anything else is converted with
    ``image.convert('RGB')``.

    Returns:
        The RGB image, or ``None`` when inspecting or converting the image
        raised an exception (the error is printed, not re-raised).
    """
    try:
        needs_conversion = image.mode != 'RGB'
        rgb_image = image.convert('RGB') if needs_conversion else image
    except Exception as e:
        # Best effort: report the problem and let the caller deal with None.
        print(f"Error converting image: {e}")
        rgb_image = None
    return rgb_image

In [None]:
# Load dataset
dataset = load_dataset("your/dataset")

# Define prompt system
prompt = """ Enter prompt """

train_data = dataset['train']
llm = setup_model()

# Initialize the timestamped output file (JSON Lines: one JSON record per line)
output_file = f"results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"

batch_size = 120
sampling_params = SamplingParams(temperature=0.2, max_tokens=512)

# Qwen2-VL specific prompt format. The text is the same for every sample,
# so it is assembled once here instead of once per image inside the loop.
qwen_prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    + prompt
    + "<|im_end|>\n<|im_start|>assistant\n"
)

# Process batches and save results
with open(output_file, 'w', encoding='utf-8') as f:
    for i in tqdm(range(0, len(train_data), batch_size)):
        # Slicing the split yields a mapping of column name -> list of values
        batch = train_data[i:i + batch_size]

        # One request per image; ensure_rgb_image returns None for images it
        # could not convert.
        inputs = [
            {
                "prompt": qwen_prompt,
                "multi_modal_data": {"image": ensure_rgb_image(img)},
            }
            for img in batch['image']
        ]

        # Generate and save results
        try:
            outputs = llm.generate(inputs, sampling_params)
            # Serialize the whole batch before touching the file, so an error
            # while reading the outputs cannot leave a half-written batch.
            records = [
                json.dumps({"idx": i + j, "text": out.outputs[0].text})
                for j, out in enumerate(outputs)
            ]
            f.write("".join(record + "\n" for record in records))
            # Flush per batch so finished work survives a later crash
            f.flush()
        except Exception as e:
            # Best effort: report the failed batch and continue with the next.
            # Its samples will be missing from the output file.
            print(f"Batch {i//batch_size} failed: {e}")

In [None]:
# Read the generated texts back, in the order they were written to the file
with open(output_file, 'r', encoding='utf-8') as f:
    elements = [json.loads(line)['text'] for line in f]

# Verify data integrity: there must be exactly one generated text per train
# sample. Raised explicitly (rather than asserted) so the check still runs
# when Python is started with -O.
expected_count = len(dataset['train'])
if len(elements) != expected_count:
    raise ValueError(
        f"Number of generated elements ({len(elements)}) doesn't match dataset size ({expected_count})"
    )

# Create new dataset with generated data column
new_dataset_dict = DatasetDict()

for split in dataset.keys():
    if split == 'train':
        # add_column returns a new Dataset with the extra column and keeps the
        # existing columns and their declared features, avoiding a round trip
        # of the whole split through plain Python objects.
        new_dataset_dict[split] = dataset[split].add_column('column_generated', elements)
    else:
        # Other splits are carried over unchanged
        new_dataset_dict[split] = dataset[split]

print("New DatasetDict structure:")
print(new_dataset_dict)
print("\nNumber of examples per split:")

for split in new_dataset_dict:
    print(f"{split}: {len(new_dataset_dict[split])} examples")
print("\nColumns in train split:", new_dataset_dict['train'].column_names)

# Save your new enriched dataset
new_dataset_dict.save_to_disk("dataset_with_generated_data")