In [1]:
import json
import os, random
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def process_texts_with_llm(
    file_path: str,
    num_samples: int,
    model_name: str,
    max_length: int = 100,
    temperature: float = 0.7,
    top_p: float = 0.9,
    device: str = "cuda:6" if torch.cuda.is_available() else "cpu",
    **model_kwargs
) -> None:
    """
    Process text prompts through a transformer model and save results to a JSON file.
    
    Args:
        file_path (str): Path to input JSON file containing prompts
        num_samples (int): Number of prompts to process
        model_name (str): Name of the transformer model to use
        max_length (int): Maximum length of generated text
        temperature (float): Sampling temperature
        top_p (float): Top-p sampling parameter
        device (str): Device to run model on ('cuda' or 'cpu')
        **model_kwargs: Additional keyword arguments for model configuration
    """
    # Load input data
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Validate input
    if 'goals' not in data or not isinstance(data['goals'], list):
        raise ValueError("Input JSON must contain a 'goals' list")
    
    # Sample prompts
    all_prompts = data['goals']
    if num_samples > len(all_prompts):
        print(f"Warning: Requested {num_samples} samples but only {len(all_prompts)} available")
        num_samples = len(all_prompts)
    
    selected_prompts = random.sample(all_prompts, num_samples)
    
    # Initialize model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        **model_kwargs
    ).to(device)
    
    # Process prompts
    results = []
    for i, prompt in enumerate(selected_prompts, 1):
        print(f"Processing prompt {i}/{num_samples}")
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        
        # Generate text
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                temperature=temperature,
                top_p=top_p,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=True
            )
        
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        results.append({
            "prompt": prompt,
            "generated_text": generated_text
        })
    
    # Create output filename
    base_path, ext = os.path.splitext(file_path)
    output_path = f"{base_path}_generated_{num_samples}{ext}"
    
    # Save results
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({
            "model_name": model_name,
            "parameters": {
                "max_length": max_length,
                "temperature": temperature,
                "top_p": top_p,
                **model_kwargs
            },
            "results": results
        }, f, indent=2, ensure_ascii=False)
    
    print(f"Results saved to: {output_path}")

In [12]:
process_texts_with_llm(
    file_path="/ephemeral/taremu/MarkLLM/dataset/advbench/advbench_subset.json",
    num_samples=5,
    model_name="cognitivecomputations/WizardLM-7B-Uncensored",
    max_length=800,
    temperature=0.8
)

Processing prompt 1/5
Processing prompt 2/5
Processing prompt 3/5
Processing prompt 4/5
Processing prompt 5/5
Results saved to: /ephemeral/taremu/MarkLLM/dataset/advbench/advbench_subset_generated_5.json


In [4]:
import json
import os
import random
from typing import Dict
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import logging

def process_goals(
    file_path: str,
    num_samples: int,
    model_name: str,
    config_options: Dict,
    **transformer_kwargs
) -> None:
    """
    Processes a JSON dataset of goals using a transformer model and saves the outputs.

    Args:
        file_path (str): Path to the input JSON file.
        num_samples (int): Number of samples to select from the dataset.
        model_name (str): Name of the transformer model to use.
        config_options (dict): Configuration options for text generation.
        **transformer_kwargs: Additional keyword arguments for the Transformers library.

    Returns:
        None
    """
    # Configure logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    try:
        # Validate file path
        if not os.path.isfile(file_path):
            logger.error(f"The file {file_path} does not exist.")
            raise FileNotFoundError(f"The file {file_path} does not exist.")

        # Load JSON data
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        goals = data.get("goals", [])
        if not goals:
            logger.error("The JSON file does not contain a 'goals' list.")
            raise ValueError("The JSON file does not contain a 'goals' list.")

        # Sample selection
        if num_samples > len(goals):
            logger.warning(
                f"Requested {num_samples} samples, but only {len(goals)} available. Selecting all available samples."
            )
            sampled_goals = goals
        else:
            sampled_goals = random.sample(goals, num_samples)

        logger.info(f"Selected {len(sampled_goals)} samples for processing.")

        # Load model and tokenizer
        logger.info(f"Loading model '{model_name}'...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)

        # Setup pipeline
        logger.info("Setting up the text generation pipeline...")
        generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            **transformer_kwargs
        )

        # Generate responses
        logger.info("Generating responses...")
        responses = {}
        for idx, prompt in enumerate(sampled_goals, 1):
            try:
                response = generator(
                    prompt,
                    **config_options
                )
                responses[f"response_{idx}"] = response[0]['generated_text']
                logger.debug(f"Generated response for prompt {idx}.")
            except Exception as e:
                logger.error(f"Error generating response for prompt {idx}: {e}")
                responses[f"response_{idx}"] = None

        # Prepare output file path
        dir_name, base_name = os.path.split(file_path)
        name, ext = os.path.splitext(base_name)
        output_file = os.path.join(dir_name, f"{name}_output{num_samples}{ext}")

        # Save responses to JSON
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({"responses": responses}, f, ensure_ascii=False, indent=4)

        logger.info(f"Responses saved to {output_file}")

    except Exception as e:
        logger.exception(f"An error occurred during processing: {e}")
        raise


In [9]:
if __name__ == "__main__":
    config = {
        "max_length": 500,
        "temperature": 0.7,
        "top_p": 0.9,
        "num_return_sequences": 1
    }

    process_goals(
        file_path="/ephemeral/taremu/MarkLLM/dataset/advbench/advbench_subset.json",
        num_samples=10,
        model_name="gpt2",
        config_options=config,
        do_sample=True,
        top_k=50
    )


INFO:__main__:Selected 10 samples for processing.
INFO:__main__:Loading model 'gpt2'...
INFO:__main__:Setting up the text generation pipeline...
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
INFO:__main__:Generating responses...
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_t