In [19]:
import pandas as pd
from transformers import pipeline
from datasets import load_dataset
import time
import numpy as np
import torch # pipeline will claim to be using mps w/o this but torch must be imported otherwise it falls back to cpu
# Make sure the accelerate library is installed as well.

In [2]:
# Environment check: report whether the MPS backend is usable in this
# PyTorch build, followed by the installed PyTorch version.
mps_backend = torch.backends.mps
print(mps_backend.is_available())
print(mps_backend.is_built())
print(torch.__version__)

True
True
2.5.1


In [3]:
# Number of documents drawn for every benchmark in this notebook.
ndocs = 5000

# Load the dataset and keep a fixed-seed random sample of its test split.
ds = load_dataset('mlburnham/Pol_NLI')
test = ds['test'].to_pandas().sample(n=ndocs, random_state=1)

# One result record per benchmarked model is appended to this list.
timings = []

In [None]:
def time_it(pipe, docs=None, n=1):
    """
    Benchmark the time taken by a zero-shot classification pipeline.

    Every run passes all of ``docs`` to ``pipe`` with the single hypothesis
    'This text is about politics.' and measures the wall-clock duration.

    Args:
        pipe (callable): The model pipeline to benchmark. It must expose
            ``pipe.device.type``.
        docs (list, optional): Documents that will be passed to the pipe. When
            omitted, the notebook-level ``test['premise']`` column is used, so
            calls of the form ``time_it(pipe, n=10)`` keep working.
        n (int): Number of times to run the benchmark. Defaults to 1.

    Returns:
        dict: A dictionary with the keys 'Model', 'Hardware', 'Time' (mean
              seconds per run), 'Time_SE', 'DPS' (documents per second) and
              'DPS_SE'. Both standard errors are None when n == 1.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    if docs is None:
        # Fall back to the sampled documents defined at notebook level.
        docs = test['premise'].tolist()
    elif not isinstance(docs, (list, str)):
        # Materialise other iterables so they can be counted and re-used on every run.
        docs = list(docs)
    # Throughput is based on what is actually classified, not on a global counter.
    n_docs = 1 if isinstance(docs, str) else len(docs)

    times = []

    for i in range(n):
        # perf_counter is monotonic and high resolution, which suits interval timing.
        start_time = time.perf_counter()
        pipe(docs, 'This text is about politics.', hypothesis_template='{}')
        elapsed_time = time.perf_counter() - start_time
        times.append(elapsed_time)

        print(f"Run {i + 1}/{n} - Elapsed time: {elapsed_time:.2f} seconds")

    # Calculate the average time and DPS
    avg_time = np.mean(times)
    dps = n_docs / avg_time

    # Standard errors need at least two runs (sample standard deviation, ddof=1).
    if n > 1:
        time_se = np.std(times, ddof=1) / np.sqrt(n)
        dps_se = np.std([n_docs / t for t in times], ddof=1) / np.sqrt(n)
    else:
        time_se = None
        dps_se = None

    print(f"Average elapsed time: {avg_time:.2f} seconds")
    print(f"Average DPS: {dps}")

    if n > 1:
        print(f"Standard error (Time): {time_se:.4f} seconds")
        print(f"Standard error (DPS): {dps_se:.4f}")

    hardware = pipe.device.type
    # Only touch the MPS allocator when the pipeline really runs on MPS.
    if hardware == 'mps':
        torch.mps.empty_cache()

    # Prefer the identifier stored on the pipeline's own model so the record
    # cannot pick up a stale notebook-level `model` string; fall back to that
    # global only when the pipeline does not expose a name.
    model_name = getattr(getattr(pipe, 'model', None), 'name_or_path', None) or model

    res = {
        'Model': str(model_name).split('/')[-1],
        'Hardware': hardware,
        'Time': avg_time,
        'Time_SE': time_se,
        'DPS': dps,
        'DPS_SE': dps_se
    }
    return res

# M3 Max Base DeBERTa

In [16]:
# Zero-shot classification pipeline for the v1.1 DeBERTa base checkpoint,
# placed on the MPS device in bfloat16 and fed in batches of 8.
model = "mlburnham/Political_DEBATE_DeBERTa_base_v1.1"
pipe = pipeline(
    "zero-shot-classification",
    model=model,
    device=torch.device("mps"),
    batch_size=8,
    torch_dtype=torch.bfloat16,
)

Device set to use mps


In [20]:
# Documents to classify: the sampled premises, as a plain list.
# time_it requires them as its second argument.
docs = test['premise'].tolist()
# Warm-up pass; its result is discarded so one-time start-up cost stays out of the benchmark.
time_it(pipe, docs, n = 1)
# Timed benchmark: 10 repetitions over the same documents.
res = time_it(pipe, docs, n = 10)
timings.append(res)

Run 1/1 - Elapsed time: 32.74 seconds
Average elapsed time: 32.74 seconds
Average DPS: 152.71852848907307
Run 1/10 - Elapsed time: 32.31 seconds
Run 2/10 - Elapsed time: 32.26 seconds
Run 3/10 - Elapsed time: 32.44 seconds
Run 4/10 - Elapsed time: 32.51 seconds
Run 5/10 - Elapsed time: 32.26 seconds
Run 6/10 - Elapsed time: 32.58 seconds
Run 7/10 - Elapsed time: 32.84 seconds
Run 8/10 - Elapsed time: 32.26 seconds
Run 9/10 - Elapsed time: 32.17 seconds
Run 10/10 - Elapsed time: 32.61 seconds
Average elapsed time: 32.42 seconds
Average DPS: 154.20780747542545
Standard error (Time): 0.0658 seconds
Standard error (DPS): 0.3118


# M3 Max Large DeBERTa

In [25]:
# Zero-shot classification pipeline for the large v1.0 checkpoint,
# placed on the MPS device in bfloat16 and fed in batches of 8.
model = "mlburnham/Political_DEBATE_large_v1.0"
pipe = pipeline(
    "zero-shot-classification",
    model=model,
    device=torch.device("mps"),
    batch_size=8,
    torch_dtype=torch.bfloat16,
)

Device set to use mps


In [26]:
# Documents to classify: the sampled premises, as a plain list.
# time_it requires them as its second argument.
docs = test['premise'].tolist()
# Warm-up pass; its result is discarded because the first run is much slower than later ones.
time_it(pipe, docs, n = 1)
# Timed benchmark: 10 repetitions over the same documents.
res = time_it(pipe, docs, n = 10)
timings.append(res)

Run 1/1 - Elapsed time: 176.31 seconds
Average elapsed time: 176.31 seconds
Average DPS: 28.359860299473564
Run 1/10 - Elapsed time: 81.72 seconds
Run 2/10 - Elapsed time: 82.07 seconds
Run 3/10 - Elapsed time: 83.57 seconds
Run 4/10 - Elapsed time: 82.88 seconds
Run 5/10 - Elapsed time: 82.90 seconds
Run 6/10 - Elapsed time: 83.50 seconds
Run 7/10 - Elapsed time: 83.54 seconds
Run 8/10 - Elapsed time: 82.11 seconds
Run 9/10 - Elapsed time: 82.02 seconds
Run 10/10 - Elapsed time: 82.37 seconds
Average elapsed time: 82.67 seconds
Average DPS: 60.48218526077912
Standard error (Time): 0.2220 seconds
Standard error (DPS): 0.1622


# M3 Max Base ModernBERT

In [22]:
# Zero-shot classification pipeline for the ModernBERT base v1.0 checkpoint,
# placed on the MPS device in bfloat16 and fed in batches of 8.
model = "mlburnham/Political_DEBATE_ModernBERT_base_v1.0"
pipe = pipeline(
    "zero-shot-classification",
    model=model,
    device=torch.device("mps"),
    batch_size=8,
    torch_dtype=torch.bfloat16,
)

Device set to use mps


In [23]:
# Documents to classify: the sampled premises, as a plain list.
# time_it requires them as its second argument.
docs = test['premise'].tolist()
# Warm-up pass; its result is discarded because the first run is much slower than later ones.
time_it(pipe, docs, n = 1)
# Timed benchmark: 10 repetitions over the same documents.
res = time_it(pipe, docs, n = 10)
timings.append(res)

Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.


Run 1/1 - Elapsed time: 170.65 seconds
Average elapsed time: 170.65 seconds
Average DPS: 29.299827449098625
Run 1/10 - Elapsed time: 41.60 seconds
Run 2/10 - Elapsed time: 42.50 seconds
Run 3/10 - Elapsed time: 42.04 seconds
Run 4/10 - Elapsed time: 42.42 seconds
Run 5/10 - Elapsed time: 42.71 seconds
Run 6/10 - Elapsed time: 42.37 seconds
Run 7/10 - Elapsed time: 42.45 seconds
Run 8/10 - Elapsed time: 42.35 seconds
Run 9/10 - Elapsed time: 42.32 seconds
Run 10/10 - Elapsed time: 41.62 seconds
Average elapsed time: 42.24 seconds
Average DPS: 118.37369231929557
Standard error (Time): 0.1170 seconds
Standard error (DPS): 0.3303


# M3 Max Large ModernBERT

In [27]:
# Zero-shot classification pipeline for the ModernBERT large v1.0 checkpoint,
# placed on the MPS device in bfloat16 and fed in batches of 8.
model = "mlburnham/Political_DEBATE_ModernBERT_large_v1.0"
pipe = pipeline(
    "zero-shot-classification",
    model=model,
    device=torch.device("mps"),
    batch_size=8,
    torch_dtype=torch.bfloat16,
)

Device set to use mps


In [29]:
# Documents to classify: the sampled premises, as a plain list.
# time_it requires them as its second argument.
docs = test['premise'].tolist()
# Warm-up pass; its result is discarded and only the repeated runs below are recorded.
time_it(pipe, docs, n = 1)
# Timed benchmark: 10 repetitions over the same documents.
res = time_it(pipe, docs, n = 10)
timings.append(res)

Run 1/1 - Elapsed time: 82.01 seconds
Average elapsed time: 82.01 seconds
Average DPS: 60.968216981768585
Run 1/10 - Elapsed time: 82.49 seconds
Run 2/10 - Elapsed time: 83.51 seconds
Run 3/10 - Elapsed time: 85.91 seconds
Run 4/10 - Elapsed time: 86.97 seconds
Run 5/10 - Elapsed time: 86.82 seconds
Run 6/10 - Elapsed time: 86.67 seconds
Run 7/10 - Elapsed time: 86.37 seconds
Run 8/10 - Elapsed time: 86.32 seconds
Run 9/10 - Elapsed time: 86.33 seconds
Run 10/10 - Elapsed time: 86.49 seconds
Average elapsed time: 85.79 seconds
Average DPS: 58.28415983234997
Standard error (Time): 0.4798 seconds
Standard error (DPS): 0.3342


# M3 Max Base DeBERTa (v1.0, float16, batch size 32)

In [10]:
# Zero-shot classification pipeline for the base v1.0 checkpoint. Unlike the
# other configurations in this notebook it uses float16 and batches of 32.
model = "mlburnham/Political_DEBATE_base_v1.0"
pipe = pipeline(
    "zero-shot-classification",
    model=model,
    device=torch.device("mps"),
    batch_size=32,
    torch_dtype=torch.float16,
)

In [20]:
# Documents to classify: the sampled premises, as a plain list.
# time_it requires them as its second argument.
docs = test['premise'].tolist()
# Warm-up pass; its result is discarded and only the repeated runs below are recorded.
time_it(pipe, docs, n = 1)
# Timed benchmark: 10 repetitions over the same documents.
res = time_it(pipe, docs, n = 10)
timings.append(res)

Run 1/1 - Elapsed time: 32.74 seconds
Average elapsed time: 32.74 seconds
Average DPS: 152.71852848907307
Run 1/10 - Elapsed time: 32.31 seconds
Run 2/10 - Elapsed time: 32.26 seconds
Run 3/10 - Elapsed time: 32.44 seconds
Run 4/10 - Elapsed time: 32.51 seconds
Run 5/10 - Elapsed time: 32.26 seconds
Run 6/10 - Elapsed time: 32.58 seconds
Run 7/10 - Elapsed time: 32.84 seconds
Run 8/10 - Elapsed time: 32.26 seconds
Run 9/10 - Elapsed time: 32.17 seconds
Run 10/10 - Elapsed time: 32.61 seconds
Average elapsed time: 32.42 seconds
Average DPS: 154.20780747542545
Standard error (Time): 0.0658 seconds
Standard error (DPS): 0.3118


# Llama 3.1

In [32]:
# Text-generation pipeline for Llama 3.1 8B Instruct on the MPS device, in bfloat16,
# processing one prompt at a time.
model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map='mps',
    batch_size = 1,
    # Never hard-code an access token in the notebook. token=True makes the
    # library use the locally stored Hugging Face credential
    # (e.g. from `huggingface-cli login`).
    token = True,
)
# Reuse an end-of-sequence id as the padding id. The config may hold either a
# single id or a list of ids, so take the first entry only when it is a list.
eos_token_id = pipe.model.config.eos_token_id
pipe.tokenizer.pad_token_id = eos_token_id[0] if isinstance(eos_token_id, (list, tuple)) else eos_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use mps


In [33]:
# Prompt template for the Llama classifier. `{doc}` is replaced with one text
# sample via str.format; the instructions ask for a bare 1 (about politics)
# or 0 (not about politics) with no explanation.
user_message = """You are a classifier that can only respond with 1 or 0. I'm going to show you a short text sample and I want you to determine if this text is about politics. Here is the text:
{doc}

If it is true that this text is about politics, return 1. If it is not true that this text is about politics, return 0.
Do not explain your answer, and only return 1 or 0.
"""

In [34]:
# One single-turn user message per sampled premise, with the text substituted
# into the prompt template.
messages = [
    dict(role="user", content=user_message.format(doc=text))
    for text in test['premise']
]

In [35]:
# Render every message through the tokenizer's chat template as plain text
# (not token ids), with the generation prompt appended.
prompt = [
    pipe.tokenizer.apply_chat_template(
        [chat_message],
        tokenize=False,
        add_generation_prompt=True,
    )
    for chat_message in messages
]

In [36]:
torch.mps.empty_cache()
# Time a single pass over all prompts. perf_counter is monotonic and high
# resolution, which suits interval timing.
start_time = time.perf_counter()
# Greedy decoding (do_sample=False) capped at 2 new tokens. No temperature is
# passed: it only applies when sampling and merely triggers a warning here.
results = pipe(prompt, max_new_tokens=2, do_sample=False, return_full_text = False, pad_token_id=pipe.tokenizer.pad_token_id)
end_time = time.perf_counter()
# Calculate the elapsed time
elapsed_time = end_time - start_time
# Throughput is based on the prompts actually processed.
dps = len(prompt) / elapsed_time

print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"DPS: {dps}")
torch.mps.empty_cache()

# Same keys as the records returned by time_it. There is only one run here,
# so no standard errors are available.
timings.append({
                'Model': model.split('/')[-1],
                'Hardware': pipe.device.type,
                'Time': elapsed_time,
                'Time_SE': None,
                'DPS': dps,
                'DPS_SE': None
            })

Elapsed time: 2219.90 seconds
DPS: 2.252349324300326


# Export

In [52]:
# Sanity check: display the device type reported by the currently loaded pipeline.
pipe.device.type

'mps'

In [46]:
# Collect the per-model timing records into one table and write it out as CSV
# without the row index.
mps = pd.DataFrame(data=timings)
mps.to_csv(path_or_buf='../data/mps_timing.csv', index=False)