1. Libraries needed for the model and datasets

In [None]:
import os

import torch
import transformers
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
# Never commit an access token to a notebook: read it from the HF_TOKEN
# environment variable instead. Any token that was previously hardcoded here
# must be treated as leaked and revoked in the Hugging Face account settings.
# When HF_TOKEN is unset, TOKEN is None and `login` falls back to its
# interactive prompt.
TOKEN = os.environ.get("HF_TOKEN")
login(token=TOKEN)

2. get the pretrained model and tokenizer

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single source of truth for the checkpoint so model and tokenizer cannot drift apart.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

# NOTE(review): no torch_dtype is passed, so the weights are presumably loaded in
# float32; for a 7B model consider half precision on GPU — confirm memory budget.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=TOKEN).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=TOKEN)

if tokenizer.pad_token is None:
    # Reuse EOS as the padding token instead of adding a brand-new '[PAD]' token.
    # Adding a token forces an embedding resize, which gives this inference-only
    # model an untrained vocabulary row that argmax over the logits could select.
    # Padded positions are excluded through the attention mask either way.
    tokenizer.pad_token = tokenizer.eos_token

3. load the dataset

In [40]:
from datasets import load_dataset

# Gated or private datasets require Hugging Face credentials; run
# `huggingface-cli login` beforehand if access is denied.
dataset = load_dataset(
    "TheFinAI/flare-cd",
    token=TOKEN,
)

4. preprocess datasets

In [41]:
# Show which columns each split exposes before building model inputs from them.
print(dataset.column_names)
def preprocess_function(examples):
    """Tokenize a batch of examples for the benchmark.

    Each example's 'query' and 'text' fields are joined with a literal
    " [SEP] " separator, then tokenized with truncation and padding to a
    fixed length of 512 tokens.

    Args:
        examples: a batch mapping with parallel 'query' and 'text' sequences.

    Returns:
        Whatever the module-level `tokenizer` returns for the joined texts.
    """
    combined_texts = []
    for query, text in zip(examples['query'], examples['text']):
        combined_texts.append(" [SEP] ".join((query, text)))
    return tokenizer(
        combined_texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

{'test': ['id', 'query', 'answer', 'text', 'label', 'token']}


In [42]:
# Tokenize the dataset in batches using the preprocessing function.
processed_dataset = dataset.map(preprocess_function, batched=True)

# Preview the first two processed test examples.
tokenized_test = processed_dataset['test']
print(tokenized_test[:2])

# Keep the tokenizer outputs as plain columns for the benchmark cells.
input_ids = tokenized_test['input_ids']
attention_mask = tokenized_test['attention_mask']

{'id': ['cd0', 'cd1'], 'query': ["Your job in this task is to perform sequence labeling on a provided text section, marking the chunks that represent the cause of an event and the effects that result from it. For each token in the text, assign a label to indicate its role in representing cause or effect. The labels you should use are 'B-CAUSE', 'I-CAUSE', 'B-EFFECT', 'I-EFFECT', and 'O'. A 'B-' prefix is used to denote the beginning of a cause or effect sequence, while an 'I-' prefix is used for continuation of a cause or effect sequence. If a token is not part of either a cause or effect sequence, label it as 'O'. Provide your answer as a sequence of 'token:label' pairs, with each pair on a new line.\nText: Around 21,000 employees , 9,000 of whom are employed in the UK , are to be made redundant after the 178-year-old company ceased trading and went into compulsory liquidation this morning.\nAnswer:", "Your job in this task is to perform sequence labeling on a provided text section, m

5. define the evaluation metric

In [43]:

from sklearn.metrics import f1_score

def compute_f1(predictions, references):
    """Return the weighted-average F1 score of `predictions` against `references`.

    Args:
        predictions: predicted labels, passed to scikit-learn as `y_pred`.
        references: ground-truth labels, passed to scikit-learn as `y_true`.

    Returns:
        The value computed by `sklearn.metrics.f1_score` with average='weighted'.
    """
    return f1_score(y_true=references, y_pred=predictions, average='weighted')

6. run the benchmark

In [46]:
import torch, time

# Materialize the whole tokenized test split as two tensors.
# NOTE(review): the inference loop further down rebinds `inputs` and `masks`
# per batch, so these full-size tensors look unused — confirm before removing.
inputs, masks = torch.tensor(input_ids), torch.tensor(attention_mask)

def collate_fn(batch, pad_token_id=None):
    """Collate tokenized examples into right-padded batch tensors.

    Args:
        batch: list of dicts, each with 'input_ids' and 'attention_mask'
            sequences of integers (lists or tensors).
        pad_token_id: value used to pad 'input_ids'. Defaults to the
            module-level tokenizer's `pad_token_id`, looked up at call time,
            so existing `collate_fn(batch)` callers behave as before.

    Returns:
        Dict with 'input_ids' and 'attention_mask' tensors of shape
        (len(batch), longest sequence in the batch). Attention masks are
        padded with 0.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    def _pad_column(key, fill_value):
        # An explicit integer dtype keeps ids usable as embedding indices even
        # for an empty sequence, which torch.tensor([]) would make float32.
        rows = [torch.as_tensor(item[key], dtype=torch.long) for item in batch]
        return torch.nn.utils.rnn.pad_sequence(rows, batch_first=True, padding_value=fill_value)

    return {
        'input_ids': _pad_column('input_ids', pad_token_id),
        'attention_mask': _pad_column('attention_mask', 0)
    }

In [47]:

# Pair each example's token ids with its attention mask so the DataLoader can batch them.
data = [{'input_ids': ids, 'attention_mask': mask} for ids, mask in zip(input_ids, attention_mask)]
loader = torch.utils.data.DataLoader(data, batch_size=10, collate_fn=collate_fn, shuffle=False)

batch_predictions = []  # one CPU tensor per batch, concatenated after the loop
start_time = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
for batch_index, batch in enumerate(loader):
    # The model was moved to `device`; its inputs must live on the same device,
    # otherwise a CUDA model raises a device-mismatch error on CPU tensors.
    inputs = batch['input_ids'].to(device)
    masks = batch['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(inputs, attention_mask=masks)
    # Keep every batch's result and move it to the CPU straight away so GPU
    # memory is not held across iterations.
    batch_predictions.append(torch.argmax(outputs.logits, dim=-1).cpu())
    # Report progress compactly instead of dumping whole tensors.
    print(f"batch {batch_index}: predictions shape {tuple(batch_predictions[-1].shape)}")

end_time = time.perf_counter()
elapsed_time = end_time - start_time

# Expose predictions for the whole split, not just the last batch.
# Concatenation relies on every batch having the same sequence length, which
# holds while preprocessing pads to a fixed max_length.
predictions = (
    torch.cat(batch_predictions, dim=0)
    if batch_predictions
    else torch.empty(0, dtype=torch.long)
)


KeyboardInterrupt: 

7. analyze and report results

In [None]:
print(f"Elapsed time for inference: {elapsed_time} seconds")

# The loaded dataset exposes only a 'test' split with the columns
# id/query/answer/text/label/token (see the column listing printed during
# preprocessing), so dataset['train']['gold'] raises KeyError.
# NOTE(review): 'label' is presumed to hold the gold tags — confirm against
# the dataset card ('answer' is the other candidate).
actual_labels = dataset['test']['label']

# .detach().cpu() makes the conversion work when `predictions` lives on a GPU;
# calling .numpy() directly on a CUDA tensor raises an error.
# NOTE(review): `predictions` holds argmax vocabulary ids per position, which
# are not directly comparable to the gold labels; they presumably need to be
# decoded and aligned per example before this F1 is meaningful — TODO confirm.
f1 = compute_f1(predictions.detach().cpu().numpy(), actual_labels)

print(f"F1 Score: {f1}")