In [None]:
# Run `nvidia-smi` in a shell and show its report in the cell output.
!nvidia-smi

In [None]:
!pip install ipywidgets



In [None]:
!huggingface-cli login --token 

# Testing Hugging Face models

In [None]:
# Load model directly
# Fetch the tokenizer and the causal-LM weights for Llama-3.2-3B-Instruct.
# No dtype or device is requested here, so the library defaults apply; the
# following cell moves `model` to the selected device.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

In [None]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# The prompt is tokenized as plain text; no chat template is applied here.
prompt = "Explain how photosynthesis works in simple terms."
inputs = tokenizer(prompt, return_tensors="pt").to(device)
# Gradients are not needed for inference.
with torch.no_grad():
    # Sample up to 150 new tokens at a low temperature (0.1).
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.1
    )

# The whole returned sequence is decoded (no slicing), so the printed text is
# expected to start with the prompt itself.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Testing using vLLM

In [None]:
import os
# Set these environment variables in this cell, before the cells below import vLLM.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["NCCL_CUMEM_ENABLE"] = "1"

import warnings
# Suppress UserWarning and RuntimeWarning messages in subsequent cell output.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)


In [None]:
# Smoke test of vLLM offline generation with the small facebook/opt-125m model.
from vllm import LLM, SamplingParams
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Only temperature and top_p are set; every other sampling option (including
# the output length limit) is left at the SamplingParams default.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params)

# Show the first completion of each request next to its prompt.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


In [None]:
# Same plain-text completion test as the previous cell, now with
# Llama-3.2-3B-Instruct. The `llm` bound here is the engine the later
# `llm.chat(...)` cells use.
# NOTE(review): this rebinds `llm` while the engine created in the previous cell
# may still be holding GPU memory - confirm both fit, or run only one of the two
# cells per kernel session.
from vllm import LLM, SamplingParams
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="meta-llama/Llama-3.2-3B-Instruct")
outputs = llm.generate(prompts, sampling_params)

# Show the first completion of each request next to its prompt.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


In [None]:
def print_outputs(outputs):
    """Pretty-print generation results to stdout.

    A header is printed first; then, for every item, the repr of its
    ``prompt`` and the repr of the ``text`` of its first completion
    (``outputs[0]``), each item being followed by an 80-dash rule.

    Args:
        outputs: Iterable of result objects exposing ``prompt`` and
            ``outputs[0].text``.
    """
    rule = "-" * 80
    print("\nGenerated Outputs:\n" + rule)
    for request in outputs:
        # Read both fields before printing anything for this item.
        shown_prompt, completion = request.prompt, request.outputs[0].text
        print(f"Prompt: {shown_prompt!r}\n")
        print(f"Generated text: {completion!r}")
        print(rule)

In [None]:
# One multi-turn conversation: a system message, a previous user/assistant
# exchange, and the final user request the model should answer.
conversation = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
]
# Rebinds the kernel-global `sampling_params`; completions are capped at 300 tokens.
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=300
)

# Send the conversation to the vLLM engine created earlier and print the result.
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
print_outputs(outputs)

# Reading data

In [3]:
import pandas as pd
# Load the benchmark CSV from the working directory and print its column summary.
data = pd.read_csv("benchmark_v1.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 17 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id                               1500 non-null   int64  
 1   id.1                             1500 non-null   int64  
 2   dataset                          1500 non-null   object 
 3   instruction                      1500 non-null   object 
 4   code                             1500 non-null   object 
 5   test                             600 non-null    object 
 6   relevant_categories              1500 non-null   object 
 7   simplified_instruction           1500 non-null   object 
 8   extracted_constraints            1500 non-null   object 
 9   final_comprehensive_constraints  1500 non-null   object 
 10  filtered_relevant_constraints    1500 non-null   object 
 11  quality_scores                   1500 non-null   object 
 12  relevance_score     

In [4]:
# Independent copy of the first 10 rows of `data`; the cells below read prompts from it.
df = data.head(10).copy()

In [None]:
from tqdm import tqdm
def generate_responses(prompts, **sampling_kwargs):
    """Generate one chat completion per prompt with the global vLLM engine ``llm``.

    Every prompt is wrapped in a two-message conversation (a fixed system
    message followed by the prompt as the user turn) and the whole list is sent
    to ``llm.chat`` in a single call.

    Args:
        prompts: Iterable of user-prompt strings.
        **sampling_kwargs: Forwarded unchanged to ``SamplingParams``
            (for example ``temperature`` or ``max_tokens``).

    Returns:
        list[str]: The text of the first completion of every request, in the
        order returned by ``llm.chat``. A request that came back without any
        completion contributes ``""`` so that positions still line up with
        ``prompts``.

    Side effects:
        Writes the generated texts to ``vllm_outputs.txt``. The file is opened
        in write mode, so every call replaces its previous content; when this
        function is called once per batch only the last batch remains on disk.
    """
    messages = [
        [
            {"role": "system", "content": "You are a helpful assistant"},
            {"role": "user", "content": prompt},
        ]
        for prompt in prompts
    ]
    sampling_params = SamplingParams(**sampling_kwargs)
    outputs = llm.chat(messages, sampling_params, use_tqdm=True)
    # Keep exactly one entry per request so callers can zip results with prompts.
    final_outputs = [out.outputs[0].text if out.outputs else "" for out in outputs]
    # Explicit UTF-8: generated text may contain non-ASCII characters.
    with open("vllm_outputs.txt", "w", encoding="utf-8") as f:
        for output in final_outputs:
            f.write("Output started...................................................\n")
            f.write(f"{output}\n")
            f.write("output ended.....................................................\n")
    print("Outputs saved to vllm_outputs.txt")
    return final_outputs

In [None]:
# Run the prompts in `df["combined_instruction"]` through `generate_responses`
# in slices of `batch_size` rows and collect every response in `all_outputs`.
all_outputs = []
batch_size = 8
for i in tqdm(range(0, len(df), batch_size), desc="Generating"):
    batch_prompts = df["combined_instruction"].iloc[i:i+batch_size].tolist()
    # temperature=0 and a 1024-token cap are forwarded as sampling parameters.
    responses = generate_responses(
        batch_prompts,
        temperature=0,
        max_tokens=1024
    )
    all_outputs.extend(responses)
    # Progress line: 1-based batch number / total batches (ceiling division).
    print(f"Batch {i // batch_size + 1}/{(len(df) + batch_size - 1) // batch_size} processed.")
# Last expression of the cell: displays how many responses were collected.
len(all_outputs)

In [None]:
# Two independent conversations passed to `llm.chat` in one call: the
# multi-turn essay request and a single-turn question.
conversation =[ [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hello! How can I assist you today?"},
    {
        "role": "user",
        "content": "Write an essay about the importance of higher education.",
    },
],[
    {"role": "system", "content": "You are a helpful assistant"},
    {
        "role": "user",
        "content": "what is deeplearning",
    },
] ]

# NOTE(review): `sampling_params` is not defined in this cell; it is whatever an
# earlier cell last bound in the kernel - confirm that is the intended setting.
outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
print_outputs(outputs)

# Testing Shanmukh's Hugging Face script

In [None]:
# Single-conversation generation with plain transformers: load the model in
# bfloat16 on the GPU, render the conversation with the tokenizer's chat
# template, generate, and print only the newly generated text.
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
import torch

model_path="meta-llama/Llama-3.2-3B-Instruct"
device="cuda"
model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map=device,
        torch_dtype=torch.bfloat16,
    )
tokenizer = AutoTokenizer.from_pretrained(
        model_path
)

conv = [{"role": "user", "content":"Redesign a common household item to make it more sustainable and user-friendly. Explain the changes and their benefits."}]

# Despite its name, `input_ids` is the dict-like encoding returned because of
# return_dict=True; it is unpacked into generate() and indexed by key below.
# NOTE(review): `thinking=True` is an extra keyword handed to
# apply_chat_template - confirm this model's chat template actually uses it.
input_ids = tokenizer.apply_chat_template(conv, return_tensors="pt", thinking=True, return_dict=True, add_generation_prompt=True).to(device)

set_seed(42)
output = model.generate(
    **input_ids,
    max_new_tokens=8192,
)

# Skip the first `input_ids["input_ids"].shape[1]` positions (the prompt) and
# decode the remainder of the first returned sequence.
prediction = tokenizer.decode(output[0, input_ids["input_ids"].shape[1]:], skip_special_tokens=True)
print(prediction)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
import torch
from tqdm import tqdm

# Configuration
model_path = "meta-llama/Llama-3.2-3B-Instruct"
device = "cuda"
dtype = torch.bfloat16
batch_size = 4  # You can adjust this

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    torch_dtype=dtype
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# The EOS token doubles as the padding token for batching.
tokenizer.pad_token = tokenizer.eos_token
# Decoder-only models continue from the last position of each row, so batches
# must be padded on the LEFT; with right padding the shorter prompts would be
# continued from padding tokens.
tokenizer.padding_side = "left"

# List of conversations (each a list of chat messages)
conversations = [
    [{"role": "user", "content": "Redesign a common household item to make it more sustainable and user-friendly. Explain the changes and their benefits."}],
    [{"role": "user", "content": "What are the economic benefits of electric vehicles?"}],
    [{"role": "user", "content": "Explain quantum computing in simple terms."}],
    [{"role": "user", "content": "How does photosynthesis work in plants?"}]
]

# Set seed for reproducibility
set_seed(42)

# Collect predictions
all_predictions = []

# Process in batches
for i in tqdm(range(0, len(conversations), batch_size), desc="Generating"):
    batch = conversations[i:i + batch_size]

    # Render and tokenize all conversations in the batch, padded to equal length
    inputs = tokenizer.apply_chat_template(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        return_dict=True,
        add_generation_prompt=True
    ).to(device)

    # Generate responses
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,  # Adjust as needed
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id
    )

    # With left padding every row's prompt fills the first `input_len` columns,
    # so everything after that offset is newly generated text.
    input_len = inputs["input_ids"].shape[1]
    for j in range(len(batch)):
        generated_tokens = outputs[j][input_len:]
        prediction = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        all_predictions.append(prediction)

# Print all predictions
for i, pred in enumerate(all_predictions):
    print(f"\n--- Response {i+1} ---\n{pred}")


In [None]:
# -------- HuggingFace Client (Batched) --------
class HuggingFaceClient:
    """Batched chat generation with a local Hugging Face causal language model.

    The constructor loads the tokenizer and the model (bfloat16, placed on
    ``device``), puts the model in eval mode and seeds the random generators
    with 42.
    """

    def __init__(self, model_name, hf_token, device="cuda"):
        """Load tokenizer and model.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.
            hf_token: Access token passed as ``token=`` to ``from_pretrained``.
            device: Value used for ``device_map`` and for moving the inputs.
        """
        import torch
        from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed, logging
        logging.set_verbosity_error()
        self.torch = torch
        self.set_seed = set_seed
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        # The EOS token doubles as the padding token for batching.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        # Decoder-only generation continues from the last position of each row,
        # so padding has to go on the left; it also keeps the prompt/generation
        # boundary at the same column for every row of a batch.
        self.tokenizer.padding_side = "left"
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map=self.device,
            torch_dtype=torch.bfloat16,
            token=hf_token
        )
        self.model.eval()
        self.set_seed(42)

    def generate_responses(self, prompts, **sampling_kwargs):
        """Generate one response per prompt, processing the prompts in batches.

        Args:
            prompts: Iterable of user-prompt strings; each one becomes a
                single-message conversation.
            **sampling_kwargs: Optional settings read by this method:
                ``batch_size`` (default 4), ``max_tokens`` (default 1024, used
                as ``max_new_tokens``), ``temperature`` (default 0.7) and
                ``top_p`` (default 0.9). A temperature of 0 (or ``None``)
                switches to greedy decoding.

        Returns:
            list[str]: Decoded generated text (prompt tokens excluded), one
            entry per prompt, in input order.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        batch_size = sampling_kwargs.get("batch_size", 4)
        max_new_tokens = sampling_kwargs.get("max_tokens", 1024)
        temperature = sampling_kwargs.get("temperature", 0.7)
        top_p = sampling_kwargs.get("top_p", 0.9)

        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        prompts = list(prompts)
        if not prompts:
            # Nothing to generate.
            return []

        # Imported here so the class does not rely on a notebook-global `tqdm`.
        from tqdm import tqdm

        generation_kwargs = {"max_new_tokens": max_new_tokens}
        if temperature is not None and temperature > 0:
            generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
        else:
            # Sampling requires a strictly positive temperature; fall back to greedy.
            generation_kwargs["do_sample"] = False

        all_predictions = []
        conversations = [[{"role": "user", "content": prompt}] for prompt in prompts]

        for i in tqdm(range(0, len(conversations), batch_size), desc="HF Generating"):
            batch = conversations[i:i + batch_size]

            inputs = self.tokenizer.apply_chat_template(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                return_dict=True,
                add_generation_prompt=True
            ).to(self.device)

            with self.torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    pad_token_id=self.tokenizer.pad_token_id,
                    **generation_kwargs
                )

            # Same prompt width for every row of the (left-padded) batch.
            input_len = inputs["input_ids"].shape[1]
            for j in range(len(batch)):
                generated_tokens = outputs[j][input_len:]
                prediction = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
                all_predictions.append(prediction)

        return all_predictions

In [None]:

import os

# Read the Hub token from the environment instead of hard-coding it in the
# notebook. An unset or empty variable becomes None so that no explicit
# (empty) token is passed to `from_pretrained`.
hf_token = os.environ.get("HF_TOKEN") or None
model_name = "meta-llama/Llama-3.2-3B-Instruct"
batch_size = 8
all_outputs = []
client = HuggingFaceClient(model_name=model_name, hf_token=hf_token)
# `prompts` must already exist in the kernel (it is defined by an earlier cell).
for i in tqdm(range(0, len(prompts), batch_size), desc="Batching"):
    batch_prompts = prompts[i:i + batch_size]
    responses = client.generate_responses(
        batch_prompts,
        temperature=0.7,
        max_tokens=1024,
        batch_size=batch_size
    )
    all_outputs.extend(responses)
for output in all_outputs:
    print("Output :\n")
    print(output)

In [None]:
# Number of responses currently held in `all_outputs`.
print(len(all_outputs))

## OpenAI testing

In [5]:
# -------- OpenAI Client (Concurrent Inference) --------
class OpenAIClient:
    """Chat-completions client that fans requests out over a thread pool.

    Every prompt is sent as its own request consisting of the fixed system
    prompt followed by the prompt as the user message.
    """

    def __init__(self, model_name, openai_key, max_workers=100):
        """Create the API client.

        Args:
            model_name: Model identifier sent with every request.
            openai_key: API key handed to the ``OpenAI`` constructor.
            max_workers: Size of the thread pool used by ``generate_responses``.
        """
        from openai import OpenAI
        import time
        self.client = OpenAI(api_key=openai_key)
        self.model_name = model_name
        self.time = time
        self.max_workers = max_workers
        self.SYSTEM_PROMPT = "You are a helpful assistant."

    def get_response_v2(self, messages, max_retries=1, temperature=0, max_tokens=None):
        """Send one chat request and return the text of the first choice.

        Args:
            messages: Chat messages for the request.
            max_retries: Total number of attempts (1 means no retry).
            temperature: Sampling temperature sent with the request.
            max_tokens: Optional completion length limit; omitted from the
                request when ``None``.

        Returns:
            The message content of the first choice, or the sentinel string
            ``"[Error]"`` when every attempt raised. Failures are printed, not
            re-raised, so one bad request does not abort a whole batch.
        """
        request = {
            "model": self.model_name,
            "messages": messages,
            "temperature": temperature,
        }
        if max_tokens is not None:
            request["max_tokens"] = max_tokens

        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(**request)
                return response.choices[0].message.content
            except Exception as e:
                print(f"Error on attempt {attempt + 1}: {e}")
                # Back off only when another attempt will follow.
                if attempt + 1 < max_retries:
                    self.time.sleep(2)
        return "[Error]"

    def generate_responses(self, prompts, **kwargs):
        """Generate one response per prompt concurrently.

        Args:
            prompts: Iterable of user-prompt strings.
            **kwargs: ``temperature`` (default 0), ``max_tokens`` (default: not
                sent) and ``max_retries`` (default 1) are forwarded to
                ``get_response_v2``; any other keyword (for example
                ``batch_size``) is accepted and ignored.

        Returns:
            list: One result per prompt, in input order (``"[Error]"`` for
            requests that failed).
        """
        from concurrent.futures import ThreadPoolExecutor
        from tqdm import tqdm

        temperature = kwargs.get("temperature", 0)
        max_tokens = kwargs.get("max_tokens")
        max_retries = kwargs.get("max_retries", 1)

        # Prepare messages with system prompt
        messages_list = [[
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {"role": "user", "content": prompt}
        ] for prompt in prompts]

        def _ask(messages):
            return self.get_response_v2(
                messages,
                max_retries=max_retries,
                temperature=temperature,
                max_tokens=max_tokens,
            )

        # Run inference concurrently; executor.map preserves input order.
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(tqdm(
                executor.map(_ask, messages_list),
                total=len(messages_list),
                desc="OpenAI Generating"
            ))

        return results


In [None]:
# NOTE(review): `openai_key` is not assigned anywhere in the visible notebook,
# and `model_name` is whatever an earlier cell left in the kernel; the recorded
# run further down reports "invalid model ID" - confirm both before running.
client = OpenAIClient(model_name=model_name, openai_key=openai_key)


In [8]:
# Send the `combined_instruction` prompts of `df` through `client` in slices of
# `batch_size` and print every collected response.
from tqdm import tqdm
prompts = df["combined_instruction"].tolist()
# Rebinding `model_name` here does not affect `client`, which stored its model
# name when it was constructed.
model_name = "meta-llama/Llama-3.2-3B-Instruct"
batch_size = 8
all_outputs=[]
for i in tqdm(range(0, len(prompts), batch_size), desc="Batching"):
        batch_prompts = prompts[i:i+batch_size]
        responses = client.generate_responses(
            batch_prompts,
            temperature=0.7,
            max_tokens=1024,
            batch_size=8
        )
        all_outputs.extend(responses)
for output in all_outputs:
    print("Output :\n")
    print(output)

Batching:   0%|                                                                                    | 0/2 [00:00<?, ?it/s]
OpenAI Generating:   0%|                                                                           | 0/8 [00:00<?, ?it/s][A

Error on attempt 1: Error code: 400 - {'error': {'message': 'invalid model ID', 'type': 'invalid_request_error', 'param': None, 'code': None}}
Error on attempt 1: Error code: 400 - {'error': {'message': 'invalid model ID', 'type': 'invalid_request_error', 'param': None, 'code': None}}
Error on attempt 1: Error code: 400 - {'error': {'message': 'invalid model ID', 'type': 'invalid_request_error', 'param': None, 'code': None}}
Error on attempt 1: Error code: 400 - {'error': {'message': 'invalid model ID', 'type': 'invalid_request_error', 'param': None, 'code': None}}
Error on attempt 1: Error code: 400 - {'error': {'message': 'invalid model ID', 'type': 'invalid_request_error', 'param': None, 'code': None}}
Error on attempt 1: Error code: 400 - {'error': {'message': 'invalid model ID', 'type': 'invalid_request_error', 'param': None, 'code': None}}
Error on attempt 1: Error code: 400 - {'error': {'message': 'invalid model ID', 'type': 'invalid_request_error', 'param': None, 'code': None}}


OpenAI Generating:  12%|████████▍                                                          | 1/8 [00:06<00:42,  6.08s/it][A
OpenAI Generating: 100%|███████████████████████████████████████████████████████████████████| 8/8 [00:06<00:00,  1.21it/s][A
Batching:  50%|██████████████████████████████████████                                      | 1/2 [00:06<00:06,  6.64s/it]
OpenAI Generating:   0%|                                                                           | 0/2 [00:00<?, ?it/s][A

Error on attempt 1: Error code: 400 - {'error': {'message': 'invalid model ID', 'type': 'invalid_request_error', 'param': None, 'code': None}}
Error on attempt 1: Error code: 400 - {'error': {'message': 'invalid model ID', 'type': 'invalid_request_error', 'param': None, 'code': None}}



OpenAI Generating: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.17s/it][A
Batching: 100%|████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.50s/it]

Output :

[Error]
Output :

[Error]
Output :

[Error]
Output :

[Error]
Output :

[Error]
Output :

[Error]
Output :

[Error]
Output :

[Error]
Output :

[Error]
Output :

[Error]





# RITS testing


In [2]:
from openai import OpenAI

# from transformers import AutoTokenizer
from ratelimit import limits, sleep_and_retry
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

NUM_CALLS_PER_MIN = 1500


class LLMClient:
    """Chat-completions client built on the ``OpenAI`` SDK class.

    ``client_type="rits"`` targets ``base_url`` and additionally sends the key
    in a ``RITS_API_KEY`` header; ``client_type="GPT"`` uses the SDK defaults.
    """

    def __init__(self, api_key,model_id,client_type="rits", base_url=None):
        """Create the underlying API client.

        Args:
            api_key: Key passed to ``OpenAI`` (and, for "rits", also sent as
                the ``RITS_API_KEY`` default header).
            model_id: Model identifier sent with every request.
            client_type: Either ``"rits"`` or ``"GPT"``.
            base_url: Endpoint URL; only used when ``client_type`` is "rits".

        Raises:
            ValueError: If ``client_type`` is not one of the supported values.
        """
        if client_type == "rits":
            llm = OpenAI(
                api_key=api_key,
                base_url=base_url,
                default_headers={"RITS_API_KEY": api_key},
            )
        elif client_type == "GPT":
            llm = OpenAI(
                api_key=api_key)
        else:
            raise ValueError(
                f"Unsupported client_type {client_type!r}; expected 'rits' or 'GPT'."
            )
        self.llm = llm
        self.model_id = model_id
        # Tokenizer loading is disabled in this notebook:
        # self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Assign a tokenizer to this attribute before using apply_chat_template().
        self.tokenizer = None

    def get_model_response(
        self,
        messages=None,
        system_prompt=None,
        user_prompt=None,
        max_new_tokens=1024,
        temperature=0.1
    ):
        """Run one chat completion and return the stripped response text.

        Args:
            messages: Ready-made chat messages. When ``None`` they are built
                from ``system_prompt`` (optional) and ``user_prompt``.
            system_prompt: Optional system message, used only when
                ``messages`` is ``None``.
            user_prompt: User message, required when ``messages`` is ``None``.
            max_new_tokens: Sent as ``max_tokens``.
            temperature: Sampling temperature.

        Returns:
            str: Content of the first choice with surrounding whitespace
            removed; ``""`` when the response carries no content.

        Raises:
            ValueError: If neither ``messages`` nor ``user_prompt`` is given.
        """
        if messages is None:
            if user_prompt is None:
                raise ValueError("Provide either `messages` or `user_prompt`.")
            if system_prompt:
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ]
            else:
                messages = [{"role": "user", "content": user_prompt}]

        response = self.llm.chat.completions.create(
            model=self.model_id,
            messages=messages,
            max_tokens=max_new_tokens,
            temperature=temperature,
        )

        content = response.choices[0].message.content
        # The content field may be None; avoid calling .strip() on it.
        return content.strip() if content is not None else ""

    def apply_chat_template(self, messages_list):
        """Render each conversation to a prompt string with ``self.tokenizer``.

        Args:
            messages_list: Iterable of conversations (lists of chat messages).

        Returns:
            list: One rendered prompt per conversation.

        Raises:
            RuntimeError: If no tokenizer has been assigned to the client.
        """
        if self.tokenizer is None:
            raise RuntimeError(
                "No tokenizer is loaded; assign `self.tokenizer` before calling "
                "apply_chat_template()."
            )
        prompts = [
            self.tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            for messages in messages_list
        ]
        return prompts

    def get_model_response_batch(
        self, system_prompt=None, user_prompts=None, max_new_tokens=1024, temperature=0.1
    ):
        """Answer many prompts concurrently, keeping positions aligned.

        Args:
            system_prompt: Optional system message prepended to every request.
            user_prompts: Iterable of prompt strings; ``None`` entries are
                skipped. ``None`` for the whole argument is treated as empty.
            max_new_tokens: Forwarded to ``call_api``.
            temperature: Forwarded to ``call_api``.

        Returns:
            list: Same length as ``user_prompts``; the response text where the
            prompt was given and ``None`` where the prompt was ``None``.
        """
        user_prompts = [] if user_prompts is None else list(user_prompts)
        non_none_user_prompts = [ele for ele in user_prompts if ele is not None]
        if system_prompt:
            messages_list = [
                [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ]
                for user_prompt in non_none_user_prompts
            ]
        else:
            messages_list = [
                [{"role": "user", "content": user_prompt}]
                for user_prompt in non_none_user_prompts
            ]
        with ThreadPoolExecutor(max_workers=NUM_CALLS_PER_MIN) as executor:
            # executor.map yields results in submission order.
            response_texts = list(
                tqdm(
                    executor.map(
                        lambda messages: self.call_api(
                            messages, max_new_tokens, temperature
                        ),
                        messages_list,
                    ),
                    total=len(messages_list),
                    desc="Processing",
                )
            )
        # Put the responses back at the positions of the non-None prompts.
        response_iter = iter(response_texts)
        all_response_texts = [
            next(response_iter) if ele is not None else None for ele in user_prompts
        ]
        return all_response_texts

    @sleep_and_retry
    @limits(calls=NUM_CALLS_PER_MIN, period=60)
    def call_api(self, messages, max_new_tokens, temperature):
        """Rate-limited wrapper around ``get_model_response``.

        At most ``NUM_CALLS_PER_MIN`` calls are allowed per 60 seconds; the
        ``sleep_and_retry`` decorator is applied on top of the limit.
        """
        response = self.get_model_response(
            messages=messages, max_new_tokens=max_new_tokens, temperature=temperature
        )
        return response


In [5]:
import argparse
import pandas as pd
import json
from tqdm import tqdm
import os
import dotenv
dotenv.load_dotenv()
random_state = 42

def _json_safe(value):
    """Return ``None`` for a float NaN (missing CSV cell), else ``value`` unchanged."""
    # NaN is the only float that is not equal to itself.
    if isinstance(value, float) and value != value:
        return None
    return value


def main(input_csv, output_dir, api_key, model_id, base_url, temperature=0.1, system_prompt=None, sample_size=10):
    """Sample prompts from a CSV, query the model and write the results as JSONL.

    Args:
        input_csv: Path of a CSV file with a ``combined_instruction`` column.
        output_dir: Directory for the result file
            ``<last segment of model_id>_results.jsonl``.
        api_key: Key handed to ``LLMClient``.
        model_id: Model identifier handed to ``LLMClient``.
        base_url: Endpoint URL handed to ``LLMClient``.
        temperature: Sampling temperature for every request.
        system_prompt: Optional system message for every request.
        sample_size: Number of rows to sample (capped at the number of rows in
            the CSV).

    Side effects:
        Prints every response and writes one JSON object per sampled row (the
        row's columns plus a ``response`` field) to the output file. Missing
        cells are written as ``null`` so that each line is valid JSON.
    """
    df_ini = pd.read_csv(input_csv)
    # Random (seeded) sample for testing; never ask for more rows than exist.
    n_rows = min(sample_size, len(df_ini))
    df = df_ini.sample(n_rows, random_state=random_state).copy()

    user_prompts = df["combined_instruction"].tolist()
    model_name = model_id.split("/")[-1]
    output_path = os.path.join(output_dir, f"{model_name}_results.jsonl")
    client = LLMClient(
        api_key=api_key,
        model_id=model_id,
        client_type="rits",
        base_url=base_url
    )

    print(f"Generating responses for {len(user_prompts)} prompts.")
    responses = client.get_model_response_batch(
        system_prompt=system_prompt,
        user_prompts=user_prompts,
        temperature=temperature
    )

    output_data = []
    for row, response in zip(df.to_dict(orient="records"), responses):
        entry = {key: _json_safe(value) for key, value in row.items()}
        entry["response"] = response
        print(response)
        output_data.append(entry)

    # dirname is "" when output_dir is empty; os.makedirs("") would raise.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for entry in output_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"Saved output to: {output_path}")

if __name__ == "__main__":
    input_csv = "benchmark_v4.csv"
    output_dir = "."
    # The key comes from the environment (a .env file is loaded at the top of
    # this cell). It is deliberately never printed: cell output is saved with
    # the notebook and would leak the secret.
    api_key = os.getenv("RITS_API_KEY")
    if not api_key:
        raise RuntimeError(
            "RITS_API_KEY is not set; export it or add it to the .env file."
        )
    model_id = "meta-llama/Llama-3.1-8B-Instruct"
    base_url = "https://inference-3scale-apicast-production.apps.rits.fmaas.res.ibm.com/llama-3-1-8b-instruct/v1"
    temperature = 0.1

    main(
        input_csv=input_csv,
        output_dir=output_dir,
        api_key=api_key,
        model_id=model_id,
        base_url=base_url,
        temperature=temperature
    )


[REDACTED API KEY]
Generating responses for 10 prompts.


Processing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:13<00:00,  1.31s/it]

```python
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.signal import get_window

def task_func(amplitude, frequency, time):
    """
    Generates and plots a complex wave with a specified amplitude and frequency over given time points,
    applying a Hann window to reduce edge effects.

    Parameters:
    amplitude (float): The amplitude of the complex wave.
    frequency (float): The frequency of the complex wave.
    time (numpy.ndarray): The time points at which the complex wave is generated.

    Returns:
    numpy.ndarray: The generated complex wave as a numpy array of complex numbers.
    matplotlib.figure.Figure: The figure object of the plot.
    matplotlib.axes.Axes: The axes object of the plot.
    """

    # Generate the complex wave using the formula wave = amplitude * np.exp(1j * 2 * math.pi * frequency * time)
    wave = amplitude * np.exp(1j * 2 * math.pi * frequency * time)

    # Apply a Hann window to the wave to reduce edge effects
    h


