In [1]:
import os
import time

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Number of test documents used in every benchmark below.
ndocs = 5000

# Pull the Pol_NLI dataset and keep a fixed-seed random sample of the test split
# so every model is timed on the same documents.
ds = load_dataset('mlburnham/Pol_NLI')
test = ds['test'].to_pandas().sample(n=ndocs, random_state=1)

# One result record is appended here per benchmarked model/hardware pair.
timings = []

In [None]:
def time_it(pipe, docs, n=1, model_name=None):
    """
    Benchmark the time taken by a zero-shot classification pipeline.

    Every run passes all of ``docs`` to ``pipe`` with the single candidate
    label 'This text is about politics.' and measures the wall-clock time of
    that call.

    Args:
        pipe (callable): The model pipeline to benchmark. It must expose
            ``pipe.device.type`` and, unless ``model_name`` is given,
            ``pipe.model.name_or_path``.
        docs (list): A list of documents that will be passed to the pipe.
            Throughput is computed from ``len(docs)``.
        n (int): Number of times to run the benchmark. Defaults to 1.
        model_name (str, optional): Name to report for the model. Defaults to
            the identifier stored on the pipeline's model, so the function no
            longer depends on a notebook-level ``model`` variable.

    Returns:
        dict: A dictionary with the keys 'Model', 'Hardware', 'Time' (average
              seconds per run), 'Time_SE', 'DPS' (documents per second) and
              'DPS_SE'. Both standard errors are None when n == 1.

    Raises:
        ValueError: If n is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # Use the documents actually processed rather than a global document count.
    n_docs = len(docs)
    device_type = pipe.device.type
    times = []

    for i in range(n):
        # perf_counter is monotonic and intended for measuring intervals
        start_time = time.perf_counter()
        pipe(docs, 'This text is about politics.', hypothesis_template='{}')
        elapsed_time = time.perf_counter() - start_time
        times.append(elapsed_time)

        print(f"Run {i + 1}/{n} - Elapsed time: {elapsed_time:.2f} seconds")

    # Calculate the average time and DPS
    avg_time = np.mean(times)
    dps = n_docs / avg_time

    # Standard errors need at least two runs (sample std with ddof=1)
    if n > 1:
        time_se = np.std(times, ddof=1) / np.sqrt(n)
        dps_se = np.std([n_docs / t for t in times], ddof=1) / np.sqrt(n)
    else:
        time_se = None
        dps_se = None

    print(f"Average elapsed time: {avg_time:.2f} seconds")
    print(f"Average DPS: {dps}")

    if n > 1:
        print(f"Standard error (Time): {time_se:.4f} seconds")
        print(f"Standard error (DPS): {dps_se:.4f}")

    # Release cached accelerator memory on the backend the pipeline ran on;
    # nothing to release for CPU runs.
    if device_type == 'cuda':
        torch.cuda.empty_cache()
    elif device_type == 'mps':
        torch.mps.empty_cache()

    if model_name is None:
        model_name = pipe.model.name_or_path

    res = {
        'Model': model_name.split('/')[-1],
        'Hardware': device_type,
        'Time': avg_time,
        'Time_SE': time_se,
        'DPS': dps,
        'DPS_SE': dps_se
    }
    return res

# DEBATE Base DeBERTa RTX 3090

In [6]:
# Base DeBERTa DEBATE checkpoint: zero-shot classification on the GPU, 32 documents per batch.
model = "mlburnham/Political_DEBATE_base_v1.0"
pipe = pipeline(
    task="zero-shot-classification",
    model=model,
    device='cuda',
    batch_size=32,
)

In [None]:
# time_it requires the documents to classify; pass the sampled premises as a plain list
docs = test['premise'].tolist()
# once to compile (warm-up run, result discarded)
time_it(pipe, docs, n = 1)
# benchmark
res = time_it(pipe, docs, n = 10)
timings.append(res)

# DEBATE Base DeBERTa Ryzen 9900x

In [32]:
# This section benchmarks the *base* DeBERTa checkpoint on the CPU; the large
# checkpoint has its own CPU section further down.
model = "mlburnham/Political_DEBATE_base_v1.0"
pipe = pipeline("zero-shot-classification", model = model, device = 'cpu', batch_size = 32)

In [None]:
# time_it requires the documents to classify; pass the sampled premises as a plain list
docs = test['premise'].tolist()
# once to compile (warm-up run, result discarded)
time_it(pipe, docs, n = 1)
# benchmark
res = time_it(pipe, docs, n = 10)
timings.append(res)

# DEBATE Large DeBERTa RTX 3090

In [8]:
# Large DeBERTa DEBATE checkpoint: zero-shot classification on the GPU, 32 documents per batch.
model = "mlburnham/Political_DEBATE_large_v1.0"
pipe = pipeline(
    task="zero-shot-classification",
    model=model,
    device='cuda',
    batch_size=32,
)

In [None]:
# time_it requires the documents to classify; pass the sampled premises as a plain list
docs = test['premise'].tolist()
# once to compile (warm-up run, result discarded)
time_it(pipe, docs, n = 1)
# benchmark
res = time_it(pipe, docs, n = 10)
timings.append(res)

# DEBATE Large DeBERTa Ryzen 9900x

In [None]:
# Large DeBERTa DEBATE checkpoint: zero-shot classification on the CPU, 32 documents per batch.
model = "mlburnham/Political_DEBATE_large_v1.0"
pipe = pipeline(
    task="zero-shot-classification",
    model=model,
    device='cpu',
    batch_size=32,
)

In [None]:
# time_it requires the documents to classify; pass the sampled premises as a plain list
docs = test['premise'].tolist()
# once to compile (warm-up run, result discarded)
time_it(pipe, docs, n = 1)
# benchmark
res = time_it(pipe, docs, n = 10)
timings.append(res)

# DEBATE Large Modern BERT RTX 3090

In [None]:
# Large ModernBERT DEBATE checkpoint on the GPU: batches of 8, torch_dtype set to bfloat16.
model = "mlburnham/Political_DEBATE_ModernBERT_large_v1.0"
pipe = pipeline(
    task="zero-shot-classification",
    model=model,
    device='cuda',
    batch_size=8,
    torch_dtype=torch.bfloat16,
)

In [None]:
# time_it requires the documents to classify; pass the sampled premises as a plain list
docs = test['premise'].tolist()
# once to compile (warm-up run, result discarded)
time_it(pipe, docs, n = 1)
# benchmark
res = time_it(pipe, docs, n = 10)
timings.append(res)

# DEBATE Large Modern BERT Ryzen 9900x

In [None]:
# Large ModernBERT DEBATE checkpoint on the CPU: batches of 8, torch_dtype set to bfloat16.
model = "mlburnham/Political_DEBATE_ModernBERT_large_v1.0"
pipe = pipeline(
    task="zero-shot-classification",
    model=model,
    device='cpu',
    batch_size=8,
    torch_dtype=torch.bfloat16,
)

In [None]:
# time_it requires the documents to classify; pass the sampled premises as a plain list
docs = test['premise'].tolist()
# once to compile (warm-up run, result discarded)
time_it(pipe, docs, n = 1)
# benchmark
res = time_it(pipe, docs, n = 10)
timings.append(res)

# DEBATE Base Modern BERT RTX 3090

In [None]:
# Base ModernBERT DEBATE checkpoint on the GPU: batches of 8, torch_dtype set to bfloat16.
model = "mlburnham/Political_DEBATE_ModernBERT_base_v1.0"
pipe = pipeline(
    task="zero-shot-classification",
    model=model,
    device='cuda',
    batch_size=8,
    torch_dtype=torch.bfloat16,
)

In [None]:
# time_it requires the documents to classify; pass the sampled premises as a plain list
docs = test['premise'].tolist()
# once to compile (warm-up run, result discarded)
time_it(pipe, docs, n = 1)
# benchmark
res = time_it(pipe, docs, n = 10)
timings.append(res)

# DEBATE Base Modern BERT Ryzen 9900x

In [None]:
# Base ModernBERT DEBATE checkpoint on the CPU: batches of 8, torch_dtype set to bfloat16.
model = "mlburnham/Political_DEBATE_ModernBERT_base_v1.0"
pipe = pipeline(
    task="zero-shot-classification",
    model=model,
    device='cpu',
    batch_size=8,
    torch_dtype=torch.bfloat16,
)

In [None]:
# time_it requires the documents to classify; pass the sampled premises as a plain list
docs = test['premise'].tolist()
# once to compile (warm-up run, result discarded)
time_it(pipe, docs, n = 1)
# benchmark
res = time_it(pipe, docs, n = 10)
timings.append(res)

# Llama-3.1

In [32]:
model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map='cuda',
    batch_size = 1,
    # Never hardcode the access token in the notebook; read it from the
    # environment (None falls back to any locally stored Hub login).
    token = os.environ.get("HF_TOKEN"),
)
# Reuse an end-of-sequence id as the padding id. The config may store either a
# single id or a list of ids, so only index into it when it is a sequence.
eos_token_id = pipe.model.config.eos_token_id
if isinstance(eos_token_id, (list, tuple)):
    eos_token_id = eos_token_id[0]
pipe.tokenizer.pad_token_id = eos_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use mps


In [33]:
# Prompt template for the generative baseline; `{doc}` is filled in with one
# document via str.format. The model is asked to answer with a bare 1 or 0.
user_message = (
    "You are a classifier that can only respond with 1 or 0. "
    "I'm going to show you a short text sample and I want you to determine if this text is about politics. "
    "Here is the text:\n"
    "{doc}\n"
    "\n"
    "If it is true that this text is about politics, return 1. "
    "If it is not true that this text is about politics, return 0.\n"
    "Do not explain your answer, and only return 1 or 0.\n"
)

In [34]:
# Wrap every sampled premise in a single user-role chat message built from the prompt template.
messages = list(
    map(
        lambda text: {"role": "user", "content": user_message.format(doc=text)},
        test['premise'],
    )
)

In [35]:
# Render each one-message conversation to a prompt string (not token ids),
# with the generation prompt appended.
prompt = [
    pipe.tokenizer.apply_chat_template(
        [chat_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    for chat_turn in messages
]

In [36]:
# Record the device the pipeline actually runs on instead of hardcoding it,
# and clear the cache of that backend (the MPS call is only valid on MPS).
device_type = pipe.device.type


def _release_cached_memory():
    """Free cached accelerator memory for the pipeline's backend, if any."""
    if device_type == 'cuda':
        torch.cuda.empty_cache()
    elif device_type == 'mps':
        torch.mps.empty_cache()


_release_cached_memory()
# Start the timer (perf_counter is monotonic and intended for intervals)
start_time = time.perf_counter()
results = pipe(prompt, max_new_tokens=2, do_sample=False, return_full_text = False, pad_token_id=pipe.tokenizer.pad_token_id, temperature = 0)
# Stop timer
end_time = time.perf_counter()
# Calculate the elapsed time and throughput from the prompts actually processed
elapsed_time = end_time - start_time
dps = len(prompt) / elapsed_time

print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"DPS: {dps}")
_release_cached_memory()

# Single run only, so no standard-error fields are recorded for this model.
timings.append({
    'Model': model.split('/')[-1],
    'Hardware': device_type,
    'Time': elapsed_time,
    'DPS': dps
})

Elapsed time: 2219.90 seconds
DPS: 2.252349324300326


# Export

In [46]:
# Collect the per-model timing records into one table and write it to disk
# without the row index.
mps = pd.DataFrame(data=timings)
mps.to_csv(path_or_buf='../data/cuda_timing.csv', index=False)