In [1]:
import pandas as pd
from transformers import pipeline
from datasets import load_dataset
import time
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset('mlburnham/Pol_NLI')
test = ds['test'].to_pandas()
ndocs = 5000
test = test.sample(ndocs, random_state = 1)

# NLI Base RTX 3090

In [6]:
model = "mlburnham/Political_DEBATE_base_v1.0"
pipe = pipeline("zero-shot-classification", model = model, device = 0, batch_size = 32)

In [7]:
start_time = time.time()
res = pipe(list(test['premise']), 'This text is about politics.', hypothesis_template='{}', multi_label=False)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"DPS: {ndocs/elapsed_time}")

Elapsed time: 62.97 seconds


In [23]:
ndocs/elapsed_time

55.2791597567717

# NLI Base Ryzen 5900x

In [32]:
model = "mlburnham/Political_DEBATE_large_v1.0"
pipe = pipeline("zero-shot-classification", model = model, device = 'cpu', batch_size = 32)

In [None]:
start_time = time.time()
res = pipe(list(test['premise']), 'This text is about politics.', hypothesis_template='{}', multi_label=False)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"DPS: {ndocs/elapsed_time}")

In [35]:
ndocs/elapsed_time

8.845312497152344

# NLI Large RTX 3090

In [8]:
model = "mlburnham/Political_DEBATE_large_v1.0"
pipe = pipeline("zero-shot-classification", model = model, device = 0, batch_size = 32)

In [9]:
start_time = time.time()
res = pipe(list(test['premise']), 'This text is about politics.', hypothesis_template='{}', multi_label=False)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"DPS: {ndocs/elapsed_time}")

Elapsed time: 1767.01 seconds


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[colname].replace({0: 1, 1: 0}, inplace=True)


In [10]:
elapsed_time/60

29.450202425320942

In [12]:
ndocs/elapsed_time

2.8296353325464514

# NLI Large Ryzen 5900x

In [None]:
model = "mlburnham/Political_DEBATE_large_v1.0"
pipe = pipeline("zero-shot-classification", model = model, device = 'cpu', batch_size = 32)

In [12]:
start_time = time.time()
res = pipe(list(test['premise']), 'This text is about politics.', hypothesis_template='{}', multi_label=False)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"DPS: {ndocs/elapsed_time}")

Elapsed time: 17.78 seconds


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[colname].replace({0: 1, 1: 0}, inplace=True)


# Llama-3.1

In [3]:
model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
pipe = pipeline("text-generation", model=model, model_kwargs={"torch_dtype": torch.bfloat16}, device_map=0, batch_size = 1)

Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.28s/it]


In [4]:
user_message = """You are a classifier that can only respond with 1 or 0. I'm going to show you a short text sample and I want you to determine if this text is about politics. Here is the text:
{doc}

If it is true that this text is about politics, return 1. If it is not true that this text is about politics, return 0.
Do not explain your answer, and only return 1 or 0.
"""

In [5]:
messages = [{"role": "user", "content": user_message.format(doc = doc)} for doc in test['premise']]

In [6]:
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [7]:
start_time = time.time()
outputs = pipe(prompt, max_new_tokens=2, do_sample=False, return_full_text = False, pad_token_id=pipe.tokenizer.eos_token_id, temperature = 0)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")

Token indices sequence length is longer than the specified maximum sequence length for this model (732887 > 131072). Running this sequence through the model will result in indexing errors


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.59 GiB. GPU 0 has a total capacity of 24.00 GiB of which 0 bytes is free. Of the allocated memory 46.10 GiB is allocated by PyTorch, and 2.94 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [16]:
data = test
res = []

start_time = time.time()
for doc in data['premise']:
    messages = [
        {"role": "user", "content": user_message.format(doc = doc, hypothesis = hypothesis)},
    ]
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=2, do_sample=False, return_full_text = False, pad_token_id=pipe.tokenizer.eos_token_id, temperature = 0)
    res.extend(outputs)
end_time = time.time()

elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")

res = [text['generated_text'] for text in res]
# return a list of unique responses from the model
print(set(res))

  attn_output = torch.nn.functional.scaled_dot_product_attention(
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Elapsed time: 517.61 seconds
{'0', '1'}
