In [1]:
import os
import time
from typing import Tuple

import torch
import transformers
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score

from utils import html_parsing_ncbi, html_parsing_n2c2, get_classification_report, get_digit, get_macro_average_f1

model_id = "meta-llama/Meta-Llama-3-70B-Instruct"
# This will take 45GB of GPU memory loading in 4-bit precision
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.float16, 
                  "attn_implementation": "flash_attention_2",
                  "quantization_config": {"load_in_4bit": True, "bnb_4bit_compute_dtype": torch.float16},
                  "low_cpu_mem_usage": True},
    # Never commit an access token to the notebook: read it from the HF_TOKEN
    # environment variable. When the variable is unset this passes None, which is
    # the library default for `token`.
    token=os.environ.get("HF_TOKEN"),
)

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 1. NER (Named Entity Recognition)

## 1.1 NCBI-Disease Dataset

### 1.1.1 Inference

In [2]:
# test_200.csv is the frame the inference loop iterates over; examples.csv
# supplies the (text, label_text) pairs used as few-shot demonstrations.
ncbi_df, ncbi_example_df = (
    pd.read_csv('data/NER/NCBI-disease/test_200.csv'),
    pd.read_csv('data/NER/NCBI-disease/examples.csv'),
)

In [3]:
# Task-specific prompt stored under its own name: other cells in this notebook
# rebind the generic `system_message`, and the function below must keep using
# the NCBI prompt regardless of which cell ran last.
NCBI_NER_SYSTEM_MESSAGE = """You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence. The highlighting should only use HTML tags <span style=\"background-color: #FFFF00\"> and </span> and no other tags."
"""
# The generic module-level name this cell has always defined is still provided.
system_message = NCBI_NER_SYSTEM_MESSAGE


def get_ner_ncbi_disease(sentence: str, shot: int = 0) -> Tuple[str, float]:
    """
    Run few-shot NER prompting for one NCBI-disease sentence.

    Args:
        sentence: the input sentence
        shot: the number of few-shot examples; the first `shot` rows of
            `ncbi_example_df` are inserted as user/assistant turns
    Returns:
        A `(generated_text, seconds)` tuple: the text produced after the prompt
        and the time spent inside the pipeline call.
    """
    messages = [{"role": "system", "content": NCBI_NER_SYSTEM_MESSAGE}]
    for i in range(shot):
        example = ncbi_example_df.iloc[i]
        messages.append({"role": "user", "content": example['text']})
        messages.append({"role": "assistant", "content": example['label_text']})
    messages.append({"role": "user", "content": sentence})

    tokenizer = pipeline.tokenizer
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Stop on either the tokenizer's EOS token or the end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    time_start = time.perf_counter()
    outputs = pipeline(
        prompt,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    elapsed = time.perf_counter() - time_start

    # The returned text starts with the prompt; slice it off to keep only the answer.
    return outputs[0]["generated_text"][len(prompt):], elapsed

In [4]:
# For every test sentence, query the model once per few-shot setting and store
# the generated HTML together with the time the call took.
for row in tqdm(range(len(ncbi_df))):
    sentence = ncbi_df.loc[row, 'text']
    for shot_name, n_shot in (('one', 1), ('five', 5), ('ten', 10), ('twenty', 20)):
        html_col = f'html_llama3_70b_instruct_{shot_name}_shot'
        time_col = f'llama3_70b_instruct_{shot_name}_shot_time'
        ncbi_df.loc[row, html_col], ncbi_df.loc[row, time_col] = get_ner_ncbi_disease(sentence, n_shot)

  1%|          | 2/200 [00:43<1:07:49, 20.55s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 200/200 [1:29:00<00:00, 26.70s/it]


### 1.1.2 Evaluation

In [48]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# ncbi_df = pd.read_csv("data/NER/NCBI-disease/test_200_llama3_70b_instruct_results.csv")

In [9]:
# html_parsing_ncbi returns a pair; the first element is stored as the gold
# labels once (from the one-shot call), the second as that setting's predictions.
ncbi_df['gt_labels'], ncbi_df['llama3_70b_instruct_one_shot_labels'] = html_parsing_ncbi(
    ncbi_df, 'html_llama3_70b_instruct_one_shot'
)
for shot_name in ('five', 'ten', 'twenty'):
    _, ncbi_df[f'llama3_70b_instruct_{shot_name}_shot_labels'] = html_parsing_ncbi(
        ncbi_df, f'html_llama3_70b_instruct_{shot_name}_shot'
    )

In [10]:
# Strict-matching F1 for each few-shot setting.
for shot_title in ('One', 'Five', 'Ten', 'Twenty'):
    report = get_classification_report(
        ncbi_df, 'gt_labels', f'llama3_70b_instruct_{shot_title.lower()}_shot_labels', 'strict'
    )
    print(f"F1-Score {shot_title} Shot (Strict): {report['default']['f1-score']}")

F1-Score One Shot (Strict): 0.6214953271028039
F1-Score Five Shot (Strict): 0.6373333333333333
F1-Score Ten Shot (Strict): 0.6628895184135978
F1-Score Twenty Shot (Strict): 0.685459940652819


In [11]:
# Lenient-matching F1 for each few-shot setting.
for shot_title in ('One', 'Five', 'Ten', 'Twenty'):
    report = get_classification_report(
        ncbi_df, 'gt_labels', f'llama3_70b_instruct_{shot_title.lower()}_shot_labels', 'lenient'
    )
    print(f"F1-Score {shot_title} Shot (Lenient): {report['default']['f1-score']}")

F1-Score One Shot (Lenient): 0.7429906542056075
F1-Score Five Shot (Lenient): 0.7573333333333333
F1-Score Ten Shot (Lenient): 0.7790368271954673
F1-Score Twenty Shot (Lenient): 0.7863501483679526


In [12]:
# Mean per-sentence generation time for each few-shot setting.
for shot_name in ('one', 'five', 'ten', 'twenty'):
    mean_seconds = ncbi_df[f'llama3_70b_instruct_{shot_name}_shot_time'].mean()
    print(f"Average Llama-3-70B-Instruct {shot_name}-shot prediction time: {mean_seconds:.2f} seconds")

Average Llama-3-70B-Instruct one-shot prediction time: 7.40 seconds
Average Llama-3-70B-Instruct five-shot prediction time: 6.59 seconds
Average Llama-3-70B-Instruct ten-shot prediction time: 6.41 seconds
Average Llama-3-70B-Instruct twenty-shot prediction time: 6.29 seconds


In [13]:
# Write the frame, including the columns added by the cells above, to the results CSV.
ncbi_df.to_csv(
    path_or_buf='data/NER/NCBI-disease/test_200_llama3_70b_instruct_results.csv',
    index=False,
)

## 1.2 2018 n2c2 Dataset

### 1.2.1 Inference

In [14]:
# test_200.csv is the frame the inference loop iterates over; examples.csv
# supplies the (text, label_text) pairs used as few-shot demonstrations.
n2c2_df, n2c2_example_df = (
    pd.read_csv('data/NER/2018_n2c2/test_200.csv'),
    pd.read_csv('data/NER/2018_n2c2/examples.csv'),
)

In [15]:
# Task-specific prompt stored under its own name: other cells in this notebook
# rebind the generic `system_message`, and the function below must keep using
# the n2c2 NER prompt regardless of which cell ran last.
# NOTE(review): the prompt says "disease entities" although the entity types it
# lists are Form/Route/.../Drug. The wording is left untouched because changing
# it would change the model outputs; confirm whether it is intentional.
N2C2_NER_SYSTEM_MESSAGE = """You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence. The entity type includes Form, Route, Frequency, Dosage, Strength, Duration, Reason, Ade, Drug."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence in different colors: Form(#FF0000), Route(#FFA500), Frequency(#FFFF00), Dosage(#00FF00), Strength(#0000FF), Duration(#800080), Reason(#FFC0CB), Ade(#964B00), Drug(#808080) in hex code. The highlighting should only use HTML tags <span style=\"background-color: #XXXXXX\"> and </span> and no other tags."
"""
# The generic module-level name this cell has always defined is still provided.
system_message = N2C2_NER_SYSTEM_MESSAGE


def get_ner_2018_n2c2(sentence: str, shot: int = 0) -> Tuple[str, float]:
    """
    Run few-shot NER prompting for one 2018 n2c2 sentence.

    Args:
        sentence: the input sentence
        shot: the number of few-shot examples; the first `shot` rows of
            `n2c2_example_df` are inserted as user/assistant turns
    Returns:
        A `(generated_text, seconds)` tuple: the text produced after the prompt
        and the time spent inside the pipeline call.
    """
    messages = [{"role": "system", "content": N2C2_NER_SYSTEM_MESSAGE}]
    for i in range(shot):
        example = n2c2_example_df.iloc[i]
        messages.append({"role": "user", "content": example['text']})
        messages.append({"role": "assistant", "content": example['label_text']})
    messages.append({"role": "user", "content": sentence})

    tokenizer = pipeline.tokenizer
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Stop on either the tokenizer's EOS token or the end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    time_start = time.perf_counter()
    outputs = pipeline(
        prompt,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    elapsed = time.perf_counter() - time_start

    # The returned text starts with the prompt; slice it off to keep only the answer.
    return outputs[0]["generated_text"][len(prompt):], elapsed

In [16]:
# For every test sentence, query the model once per few-shot setting and store
# the generated HTML together with the time the call took.
for row in tqdm(range(len(n2c2_df))):
    sentence = n2c2_df.loc[row, 'text']
    for shot_name, n_shot in (('one', 1), ('five', 5), ('ten', 10), ('twenty', 20)):
        html_col = f'html_llama3_70b_instruct_{shot_name}_shot'
        time_col = f'llama3_70b_instruct_{shot_name}_shot_time'
        n2c2_df.loc[row, html_col], n2c2_df.loc[row, time_col] = get_ner_2018_n2c2(sentence, n_shot)

100%|██████████| 200/200 [2:29:40<00:00, 44.90s/it]  


### 1.2.2 Evaluation

In [26]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# Left commented out, like the NCBI and RE n2c2 optional loads, so that a top-to-bottom run
# evaluates the fresh inference results instead of requiring (or replacing them with) a
# results CSV that is only written by the save cell at the end of this section.
# n2c2_df = pd.read_csv("data/NER/2018_n2c2/test_200_llama3_70b_instruct_results.csv")

In [28]:
# html_parsing_n2c2 returns a pair; the first element is stored as the gold
# labels once (from the one-shot call), the second as that setting's predictions.
n2c2_df['gt_labels'], n2c2_df['llama3_70b_instruct_one_shot_labels'] = html_parsing_n2c2(
    n2c2_df, 'html_llama3_70b_instruct_one_shot'
)
for shot_name in ('five', 'ten', 'twenty'):
    _, n2c2_df[f'llama3_70b_instruct_{shot_name}_shot_labels'] = html_parsing_n2c2(
        n2c2_df, f'html_llama3_70b_instruct_{shot_name}_shot'
    )

In [29]:
# Strict-matching macro-averaged F1 for each few-shot setting.
for shot_title in ('One', 'Five', 'Ten', 'Twenty'):
    report = get_classification_report(
        n2c2_df, 'gt_labels', f'llama3_70b_instruct_{shot_title.lower()}_shot_labels', 'strict'
    )
    print(f"F1 Score {shot_title} Shot (Strict): {get_macro_average_f1(report)}")

F1 Score One Shot (Strict): 0.1814137367727111
F1 Score Five Shot (Strict): 0.46791731342399345
F1 Score Ten Shot (Strict): 0.4989763604796711
F1 Score Twenty Shot (Strict): 0.5505756039667599


In [30]:
# Lenient-matching macro-averaged F1 for each few-shot setting.
for shot_title in ('One', 'Five', 'Ten', 'Twenty'):
    report = get_classification_report(
        n2c2_df, 'gt_labels', f'llama3_70b_instruct_{shot_title.lower()}_shot_labels', 'lenient'
    )
    print(f"F1 Score {shot_title} Shot (Lenient): {get_macro_average_f1(report)}")

F1 Score One Shot (Lenient): 0.2736644463557607
F1 Score Five Shot (Lenient): 0.5903324689808858
F1 Score Ten Shot (Lenient): 0.6261996528827135
F1 Score Twenty Shot (Lenient): 0.6952576079043253


In [31]:
# Mean per-sentence generation time for each few-shot setting.
for shot_name in ('one', 'five', 'ten', 'twenty'):
    mean_seconds = n2c2_df[f'llama3_70b_instruct_{shot_name}_shot_time'].mean()
    print(f"Average Llama-3-70B-Instruct {shot_name}-shot prediction time: {mean_seconds:.2f} seconds")

Average Llama-3-70B-Instruct one-shot prediction time: 9.75 seconds
Average Llama-3-70B-Instruct five-shot prediction time: 11.77 seconds
Average Llama-3-70B-Instruct ten-shot prediction time: 11.09 seconds
Average Llama-3-70B-Instruct twenty-shot prediction time: 12.29 seconds


In [33]:
# Write the frame, including the columns added by the cells above, to the results CSV.
n2c2_df.to_csv(
    path_or_buf='data/NER/2018_n2c2/test_200_llama3_70b_instruct_results.csv',
    index=False,
)

# 2. RE (Relation Extraction)

## 2.1 2018 n2c2 Dataset

### 2.1.1 Inference

In [18]:
# test_200.csv is the frame the inference loop iterates over; examples.csv
# supplies the (text, labels) pairs used as few-shot demonstrations.
# NOTE: this rebinds n2c2_df / n2c2_example_df, the same names the NER section
# uses for its own data, so re-run the NER load cell before re-running NER cells.
n2c2_df, n2c2_example_df = (
    pd.read_csv('data/RE/2018_n2c2/test_200.csv'),
    pd.read_csv('data/RE/2018_n2c2/examples.csv'),
)

In [19]:
# Task-specific prompt stored under its own name: other cells in this notebook
# rebind the generic `system_message`, and the function below must keep using
# the n2c2 RE prompt regardless of which cell ran last.
N2C2_RE_SYSTEM_MESSAGE = """You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations for a sentence."
"INPUT: the input is a sentence where the entities are labeled within [E${X}] and [E${X}/] in a sentence, where X is an integer representing an unique entity."
"OUTPUT: your task is to select one out of the nine types of relations ('STRENGTH-DRUG', 'ROUTE-DRUG', 'FREQUENCY-DRUG', 'FORM-DRUG', 'DOSAGE-DRUG', 'REASON-DRUG', 'DURATION-DRUG', 'ADE-DRUG', and 'No relation')."
"""
# The generic module-level name this cell has always defined is still provided.
system_message = N2C2_RE_SYSTEM_MESSAGE


def get_re_2018_n2c2(sentence: str, shot: int = 0) -> Tuple[str, float]:
    """
    Run few-shot relation-extraction prompting for one 2018 n2c2 sentence.

    Args:
        sentence: the input sentence
        shot: the number of few-shot examples; the first `shot` rows of
            `n2c2_example_df` are inserted as user/assistant turns
    Returns:
        A `(generated_text, seconds)` tuple: the text produced after the prompt
        and the time spent inside the pipeline call.
    """
    messages = [{"role": "system", "content": N2C2_RE_SYSTEM_MESSAGE}]
    for i in range(shot):
        example = n2c2_example_df.iloc[i]
        messages.append({"role": "user", "content": example['text']})
        messages.append({"role": "assistant", "content": example['labels']})
    messages.append({"role": "user", "content": sentence})

    tokenizer = pipeline.tokenizer
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Stop on either the tokenizer's EOS token or the end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    time_start = time.perf_counter()
    outputs = pipeline(
        prompt,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    elapsed = time.perf_counter() - time_start

    # The returned text starts with the prompt; slice it off to keep only the answer.
    return outputs[0]["generated_text"][len(prompt):], elapsed

In [20]:
# For every test sentence, query the model once per few-shot setting and store
# the generated answer together with the time the call took.
for row in tqdm(range(len(n2c2_df))):
    sentence = n2c2_df.loc[row, 'text']
    for shot_name, n_shot in (('one', 1), ('five', 5), ('ten', 10), ('twenty', 20)):
        answer_col = f'llama3_70b_instruct_{shot_name}_shot'
        time_col = f'llama3_70b_instruct_{shot_name}_shot_time'
        n2c2_df.loc[row, answer_col], n2c2_df.loc[row, time_col] = get_re_2018_n2c2(sentence, n_shot)

100%|██████████| 200/200 [10:44<00:00,  3.22s/it]


### 2.1.2 Evaluation

In [41]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# n2c2_df = pd.read_csv("data/RE/2018_n2c2/test_200_llama3_70b_instruct_results.csv")

In [36]:
# get rid of ' ' if any
def _strip_wrapping_quotes(raw_output):
    """Remove one pair of single quotes only when it wraps the whole answer.

    Stripping the first and last character whenever a quote occurs anywhere in
    the text would mangle answers that merely contain a quote, so the outer
    characters are dropped only when both of them are single quotes. Values
    that are not strings are returned unchanged.
    """
    if (
        isinstance(raw_output, str)
        and len(raw_output) >= 2
        and raw_output[0] == "'"
        and raw_output[-1] == "'"
    ):
        return raw_output[1:-1]
    return raw_output


for shot_name in ('one', 'five', 'ten', 'twenty'):
    answer_col = f'llama3_70b_instruct_{shot_name}_shot'
    n2c2_df[answer_col] = n2c2_df[answer_col].apply(_strip_wrapping_quotes)

In [42]:
# get digit label while considering failed LLM outputs as 'No relation'
# The gold `labels` column is overwritten in place, so convert it only while it
# is still non-numeric. Re-running this cell (or running it after loading a
# results CSV saved with already-converted labels) would otherwise feed
# converted values back through get_digit.
# NOTE(review): assumes get_digit yields numeric labels — confirm in utils.
if not pd.api.types.is_numeric_dtype(n2c2_df['labels']):
    n2c2_df['labels'] = n2c2_df['labels'].apply(get_digit)
for shot_name in ('one', 'five', 'ten', 'twenty'):
    answer_col = f'llama3_70b_instruct_{shot_name}_shot'
    n2c2_df[f'{answer_col}_labels'] = n2c2_df[answer_col].apply(get_digit)

In [43]:
# Macro-averaged F1 of each few-shot setting against the gold relation labels.
y_true = n2c2_df['labels'].tolist()
for shot_title in ('One', 'Five', 'Ten', 'Twenty'):
    y_pred = n2c2_df[f'llama3_70b_instruct_{shot_title.lower()}_shot_labels'].tolist()
    print(f"F1 Score {shot_title} Shot: {f1_score(y_true, y_pred, average='macro')}")

F1 Score One Shot: 0.18366807609619562
F1 Score Five Shot: 0.24288825512056
F1 Score Ten Shot: 0.31791798685738076
F1 Score Twenty Shot: 0.3190275703328499


In [44]:
# Mean per-sentence generation time for each few-shot setting.
for shot_name in ('one', 'five', 'ten', 'twenty'):
    mean_seconds = n2c2_df[f'llama3_70b_instruct_{shot_name}_shot_time'].mean()
    print(f"Average Llama-3-70B-Instruct {shot_name}-shot prediction time: {mean_seconds:.2f} seconds")

Average Llama-3-70B-Instruct one-shot prediction time: 0.65 seconds
Average Llama-3-70B-Instruct five-shot prediction time: 0.97 seconds
Average Llama-3-70B-Instruct ten-shot prediction time: 0.69 seconds
Average Llama-3-70B-Instruct twenty-shot prediction time: 0.91 seconds


In [45]:
# Write the frame, including the columns added by the cells above, to the results CSV.
n2c2_df.to_csv(
    path_or_buf='data/RE/2018_n2c2/test_200_llama3_70b_instruct_results.csv',
    index=False,
)

## 2.2 GAD

### 2.2.1 Inference

In [22]:
# test_200.csv is the frame the inference loop iterates over; examples.csv
# supplies the (text, labels) pairs used as few-shot demonstrations.
gad_df, gad_example_df = (
    pd.read_csv('data/RE/GAD/test_200.csv'),
    pd.read_csv('data/RE/GAD/examples.csv'),
)

In [23]:
# Task-specific prompt stored under its own name: other cells in this notebook
# rebind the generic `system_message`, and the function below must keep using
# the GAD prompt regardless of which cell ran last.
# Written line by line with explicit "\n" because several prompt lines end in a
# space that is part of the prompt text and easy to lose in a triple-quoted block.
GAD_RE_SYSTEM_MESSAGE = (
    'You are a helpful assistant to perform the following task.\n'
    '"TASK: the task is to classify relations between a disease and a gene for a sentence."\n'
    '"INPUT: the input is a sentence where the disease is labeled as @DISEASE$ and the gene is labeled as @GENE$ accordingly in a sentence. "\n'
    '"OUTPUT: your task is to select one out of the two types of relations (0 and 1) for the gene and disease without any explanation or other characters: \n'
    '0, no relations \n'
    '1, has relations"\n'
)
# The generic module-level name this cell has always defined is still provided.
system_message = GAD_RE_SYSTEM_MESSAGE


def get_re_gad(sentence: str, shot: int = 0) -> Tuple[str, float]:
    """
    Run few-shot relation-extraction prompting for one GAD sentence.

    Args:
        sentence: the input sentence
        shot: the number of few-shot examples; the first `shot` rows of
            `gad_example_df` are inserted as user/assistant turns
    Returns:
        A `(generated_text, seconds)` tuple: the text produced after the prompt
        and the time spent inside the pipeline call.
    """
    messages = [{"role": "system", "content": GAD_RE_SYSTEM_MESSAGE}]
    for i in range(shot):
        example = gad_example_df.iloc[i]
        messages.append({"role": "user", "content": example['text']})
        # The prompt asks for the answers 0/1, so the example labels are presumably
        # numeric when read from CSV; chat message content is passed as text.
        messages.append({"role": "assistant", "content": str(example['labels'])})
    messages.append({"role": "user", "content": sentence})

    tokenizer = pipeline.tokenizer
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Stop on either the tokenizer's EOS token or the end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    time_start = time.perf_counter()
    outputs = pipeline(
        prompt,
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    elapsed = time.perf_counter() - time_start

    # The returned text starts with the prompt; slice it off to keep only the answer.
    return outputs[0]["generated_text"][len(prompt):], elapsed

In [24]:
# For every test sentence, query the model once per few-shot setting and store
# the generated answer together with the time the call took.
# The sentence is read with label-based .loc, the same indexer used for the
# writes, so a row is never read by position and written by label.
for row in tqdm(range(len(gad_df))):
    sentence = gad_df.loc[row, 'text']
    for shot_name, n_shot in (('one', 1), ('five', 5), ('ten', 10), ('twenty', 20)):
        answer_col = f'llama3_70b_instruct_{shot_name}_shot'
        time_col = f'llama3_70b_instruct_{shot_name}_shot_time'
        gad_df.loc[row, answer_col], gad_df.loc[row, time_col] = get_re_gad(sentence, n_shot)

100%|██████████| 200/200 [05:04<00:00,  1.52s/it]


### 2.2.2 Evaluation

In [50]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# NOTE(review): unlike the NCBI and RE n2c2 optional loads, this line is not commented out, so
# it always runs: it replaces the in-memory gad_df produced by the inference loop and needs the
# results CSV (written by the save cell at the end of this section) to exist already.
# The CSV round-trip also appears to be what turns the generated "0"/"1" text into numbers
# for the f1_score call below — confirm before commenting it out.
gad_df = pd.read_csv("data/RE/GAD/test_200_llama3_70b_instruct_results.csv")

In [51]:
# Macro-averaged F1 of each few-shot setting against the gold labels.
# Both sides are cast to int explicitly so the comparison does not depend on the
# predictions having been through a CSV round-trip first (raw generations are
# text). A non-numeric generation raises here instead of being scored.
y_true = gad_df['labels'].astype(int).tolist()
for shot_title in ('One', 'Five', 'Ten', 'Twenty'):
    y_pred = gad_df[f'llama3_70b_instruct_{shot_title.lower()}_shot'].astype(int).tolist()
    print(f"F1 Score {shot_title} Shot: {f1_score(y_true, y_pred, average='macro')}")

F1 Score One Shot: 0.4521130881140456
F1 Score Five Shot: 0.4560100428915158
F1 Score Ten Shot: 0.4575378538512179
F1 Score Twenty Shot: 0.4522529971062422


In [53]:
# Mean per-sentence generation time for each few-shot setting.
for shot_name in ('one', 'five', 'ten', 'twenty'):
    mean_seconds = gad_df[f'llama3_70b_instruct_{shot_name}_shot_time'].mean()
    print(f"Average Llama-3-70B-Instruct {shot_name}-shot prediction time: {mean_seconds:.2f} seconds")

Average Llama-3-70B-Instruct one-shot prediction time: 0.29 seconds
Average Llama-3-70B-Instruct five-shot prediction time: 0.34 seconds
Average Llama-3-70B-Instruct ten-shot prediction time: 0.40 seconds
Average Llama-3-70B-Instruct twenty-shot prediction time: 0.50 seconds


In [54]:
# Write the current gad_df to the results CSV.
gad_df.to_csv(
    path_or_buf='data/RE/GAD/test_200_llama3_70b_instruct_results.csv',
    index=False,
)