In [88]:
import time
import torch
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score
from utils import html_parsing_ncbi, html_parsing_n2c2, get_classification_report, get_digit, get_macro_average_f1, label2digit

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# 85GB GPU memory is required to run this model in half precision
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.float16,
                                             attn_implementation="flash_attention_2",
                                             device_map="auto")

In [2]:
# load third-party chat templates
!git clone https://github.com/chujiezheng/chat_templates.git

Cloning into 'chat_templates'...
remote: Enumerating objects: 184, done.[K
remote: Counting objects: 100% (184/184), done.[K
remote: Compressing objects: 100% (125/125), done.[K
remote: Total 184 (delta 110), reused 122 (delta 56), pack-reused 0[K
Receiving objects: 100% (184/184), 30.70 KiB | 308.00 KiB/s, done.
Resolving deltas: 100% (110/110), done.


In [2]:
chat_template = open('./chat_templates/chat_templates/mistral-instruct.jinja').read()
chat_template = chat_template.replace('    ', '').replace('\n', '')
tokenizer.chat_template = chat_template

# 1. NER (Named Entity Recognition)

## 1.1 NCBI-Disease Dataset

### 1.1.1 Inference

In [3]:
ncbi_df = pd.read_csv('data/NER/NCBI-disease/test_200.csv')
ncbi_example_df = pd.read_csv('data/NER/NCBI-disease/examples.csv')

In [4]:
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence. The highlighting should only use HTML tags <span style=\"background-color: #FFFF00\"> and </span> and no other tags."
"""

def get_ner_ncbi_disease(sentence: str, shot: int = 0) -> str:
    """
    Get the NER results of NCBI-disease dataset from few-shot prompting.
    Args:
        sentence: the input sentence
        shot: the number of few-shot examples
    Returns:
        the NER results
    """
    messages=[
        {'role': 'system', 'content': system_message}
    ]
    for i in range(shot):
        messages.append({'role': 'user', 'content': ncbi_example_df. iloc[i]['text']}) 
        messages.append({'role': 'assistant', 'content': ncbi_example_df.iloc[i]['label_text']})
    messages.append({'role': 'user', 'content': sentence})
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")

    time_start = time.time()
    outputs = model.generate(input_ids, max_new_tokens=2048)
    time_end = time.time()
    return tokenizer.decode(outputs[0], skip_special_tokens=True), time_end - time_start

In [9]:
for i in tqdm(range(0, len(ncbi_df), 1)):
    ncbi_df.loc[i, 'html_mistral_8x7b_instruct_one_shot'], ncbi_df.loc[i, 'mistral_8x7b_instruct_one_shot_time'] = get_ner_ncbi_disease(ncbi_df.loc[i, 'text'], 1)
    ncbi_df.loc[i, 'html_mistral_8x7b_instruct_five_shot'], ncbi_df.loc[i, 'mistral_8x7b_instruct_five_shot_time'] = get_ner_ncbi_disease(ncbi_df.loc[i, 'text'], 5)
    ncbi_df.loc[i, 'html_mistral_8x7b_instruct_ten_shot'], ncbi_df.loc[i, 'mistral_8x7b_instruct_ten_shot_time'] = get_ner_ncbi_disease(ncbi_df.loc[i, 'text'], 10)
    ncbi_df.loc[i, 'html_mistral_8x7b_instruct_twenty_shot'], ncbi_df.loc[i, 'mistral_8x7b_instruct_twenty_shot_time'] = get_ner_ncbi_disease(ncbi_df.loc[i, 'text'], 20)

  0%|          | 0/200 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:

### 1.1.2 Evaluation

In [48]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# ncbi_df = pd.read_csv("data/NER/NCBI-disease/test_200_mistral_8x7b_instruct_results.csv")

In [49]:
def mistral_parser_ner(text: str):
    """
    Parse the text generation output.
    """
    # find the last "[/INST] " and use the text after it
    cleaned_text = text.split('[/INST] ')[-1]
    # find the first "\n" and use the text before it
    cleaned_text = cleaned_text.split('\n\n')[0]
    return cleaned_text

In [50]:
print(ncbi_df.iloc[1]['html_mistral_8x7b_instruct_one_shot'])
print("====================================")
print(mistral_parser_ner(ncbi_df.iloc[0]['html_mistral_8x7b_instruct_one_shot']))

[INST] You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence. The highlighting should only use HTML tags <span style="background-color: #FFFF00"> and </span> and no other tags."

Myotonic dystrophy ( DM ) is the most prevalent inherited neuromuscular disease in adults . [/INST] <span style="background-color: #FFFF00">Myotonic dystrophy</span> ( <span style="background-color: #FFFF00">DM</span> ) is the most prevalent <span style="background-color: #FFFF00">inherited neuromuscular disease</span> in adults .  [INST] Evidence for linkage of bipolar disorder to chromosome 18 with a parent - of - origin effect . [/INST] Evidence for linkage of <span style="background-color: #FFFF00">bipolar disorder</span> to chromosome 18 with a parent-of-origin effect . 

In this sentence, "bipolar disorder" is the onl

In [51]:
ncbi_df['mistral_8x7b_instruct_one_shot'] = ncbi_df['html_mistral_8x7b_instruct_one_shot'].apply(mistral_parser_ner)
ncbi_df['mistral_8x7b_instruct_five_shot'] = ncbi_df['html_mistral_8x7b_instruct_five_shot'].apply(mistral_parser_ner)
ncbi_df['mistral_8x7b_instruct_ten_shot'] = ncbi_df['html_mistral_8x7b_instruct_ten_shot'].apply(mistral_parser_ner)
ncbi_df['mistral_8x7b_instruct_twenty_shot'] = ncbi_df['html_mistral_8x7b_instruct_twenty_shot'].apply(mistral_parser_ner)

In [52]:
ncbi_df['gt_labels'], ncbi_df['mistral_8x7b_instruct_one_shot_labels'] = html_parsing_ncbi(ncbi_df, 'html_mistral_8x7b_instruct_one_shot')
_, ncbi_df['mistral_8x7b_instruct_five_shot_labels'] = html_parsing_ncbi(ncbi_df, 'html_mistral_8x7b_instruct_five_shot')
_, ncbi_df['mistral_8x7b_instruct_ten_shot_labels'] = html_parsing_ncbi(ncbi_df, 'html_mistral_8x7b_instruct_ten_shot')
_, ncbi_df['mistral_8x7b_instruct_twenty_shot_labels'] = html_parsing_ncbi(ncbi_df, 'html_mistral_8x7b_instruct_twenty_shot')

In [53]:
print(f"F1-Score One Shot (Strict): {get_classification_report(ncbi_df, 'gt_labels', 'mistral_8x7b_instruct_one_shot_labels', 'strict')['default']['f1-score']}")
print(f"F1-Score Five Shot (Strict): {get_classification_report(ncbi_df, 'gt_labels', 'mistral_8x7b_instruct_five_shot_labels', 'strict')['default']['f1-score']}")
print(f"F1-Score Ten Shot (Strict): {get_classification_report(ncbi_df, 'gt_labels', 'mistral_8x7b_instruct_ten_shot_labels', 'strict')['default']['f1-score']}")
print(f"F1-Score Twenty Shot (Strict): {get_classification_report(ncbi_df, 'gt_labels', 'mistral_8x7b_instruct_twenty_shot_labels', 'strict')['default']['f1-score']}")

F1-Score One Shot (Strict): 0.408695652173913
F1-Score Five Shot (Strict): 0.29549549549549553
F1-Score Ten Shot (Strict): 0.34581105169340465
F1-Score Twenty Shot (Strict): 0.3358490566037736


In [54]:
print(f"F1-Score One Shot (Lenient): {get_classification_report(ncbi_df, 'gt_labels', 'mistral_8x7b_instruct_one_shot_labels', 'lenient')['default']['f1-score']}")
print(f"F1-Score Five Shot (Lenient): {get_classification_report(ncbi_df, 'gt_labels', 'mistral_8x7b_instruct_five_shot_labels', 'lenient')['default']['f1-score']}")
print(f"F1-Score Ten Shot (Lenient): {get_classification_report(ncbi_df, 'gt_labels', 'mistral_8x7b_instruct_ten_shot_labels', 'lenient')['default']['f1-score']}")
print(f"F1-Score Twenty Shot (Lenient): {get_classification_report(ncbi_df, 'gt_labels', 'mistral_8x7b_instruct_twenty_shot_labels', 'lenient')['default']['f1-score']}")

F1-Score One Shot (Lenient): 0.5869565217391304
F1-Score Five Shot (Lenient): 0.4108108108108108
F1-Score Ten Shot (Lenient): 0.43137254901960786
F1-Score Twenty Shot (Lenient): 0.4226415094339622


In [55]:
print(f"Average Mistral-8x7B-Instruct one-shot prediction time: {ncbi_df['mistral_8x7b_instruct_one_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct five-shot prediction time: {ncbi_df['mistral_8x7b_instruct_five_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct ten-shot prediction time: {ncbi_df['mistral_8x7b_instruct_ten_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct twenty-shot prediction time: {ncbi_df['mistral_8x7b_instruct_twenty_shot_time'].mean():.2f} seconds")

Average Mistral-8x7B-Instruct one-shot prediction time: 19.20 seconds
Average Mistral-8x7B-Instruct five-shot prediction time: 11.38 seconds
Average Mistral-8x7B-Instruct ten-shot prediction time: 13.51 seconds
Average Mistral-8x7B-Instruct twenty-shot prediction time: 9.77 seconds


In [56]:
# save the inference results
ncbi_df.to_csv('data/NER/NCBI-disease/test_200_mistral_8x7b_instruct_results.csv', index=False)

# 1.2 2018 n2c2 Dataset

### 1.2.1 Inference

In [12]:
n2c2_df = pd.read_csv('data/NER/2018_n2c2/test_200.csv')
n2c2_example_df = pd.read_csv('data/NER/2018_n2c2/examples.csv')

In [13]:
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence. The entity type includes Form, Route, Frequency, Dosage, Strength, Duration, Reason, Ade, Drug."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence in different colors: Form(#FF0000), Route(#FFA500), Frequency(#FFFF00), Dosage(#00FF00), Strength(#0000FF), Duration(#800080), Reason(#FFC0CB), Ade(#964B00), Drug(#808080) in hex code. The highlighting should only use HTML tags <span style=\"background-color: #XXXXXX\"> and </span> and no other tags."
"""
def get_ner_2018_n2c2(sentence: str, shot: int = 0) -> str:
    """
    Get the NER results of 2018 n2c2 dataset from few-shot prompting.
    Args:
        sentence: the input sentence
        shot: the number of few-shot examples
    Returns:
        the NER results
    """

    messages=[
        {'role': 'system', 'content': system_message}
    ]
    for i in range(shot):
        messages.append({'role': 'user', 'content': n2c2_example_df. iloc[i]['text']}) 
        messages.append({'role': 'assistant', 'content': n2c2_example_df.iloc[i]['label_text']})
    messages.append({'role': 'user', 'content': sentence})
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")

    time_start = time.time()
    outputs = model.generate(input_ids, max_new_tokens=2048)
    time_end = time.time()

    return tokenizer.decode(outputs[0], skip_special_tokens=True), time_end - time_start

In [14]:
for i in tqdm(range(0, len(n2c2_df), 1)):
    n2c2_df.loc[i, 'html_mistral_8x7b_instruct_one_shot'], n2c2_df.loc[i, 'mistral_8x7b_instruct_one_shot_time'] = get_ner_2018_n2c2(n2c2_df.loc[i, 'text'], 1)
    n2c2_df.loc[i, 'html_mistral_8x7b_instruct_five_shot'], n2c2_df.loc[i, 'mistral_8x7b_instruct_five_shot_time'] = get_ner_2018_n2c2(n2c2_df.loc[i, 'text'], 5)
    n2c2_df.loc[i, 'html_mistral_8x7b_instruct_ten_shot'], n2c2_df.loc[i, 'mistral_8x7b_instruct_ten_shot_time'] = get_ner_2018_n2c2(n2c2_df.loc[i, 'text'], 10)
    n2c2_df.loc[i, 'html_mistral_8x7b_instruct_twenty_shot'], n2c2_df.loc[i, 'mistral_8x7b_instruct_twenty_shot_time'] = get_ner_2018_n2c2(n2c2_df.loc[i, 'text'], 20)

  0%|          | 0/200 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:

### 1.2.2 Evaluation

In [57]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# n2c2_df = pd.read_csv("data/NER/2018_n2c2/test_200_mistral_8x7b_instruct_results.csv")

In [58]:
print(n2c2_df.iloc[1]['html_mistral_8x7b_instruct_one_shot'])
print("====================================")
print(mistral_parser_ner(n2c2_df.iloc[1]['html_mistral_8x7b_instruct_one_shot']))

[INST] You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence. The entity type includes Form, Route, Frequency, Dosage, Strength, Duration, Reason, Ade, Drug."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence in different colors: Form(#FF0000), Route(#FFA500), Frequency(#FFFF00), Dosage(#00FF00), Strength(#0000FF), Duration(#800080), Reason(#FFC0CB), Ade(#964B00), Drug(#808080) in hex code. The highlighting should only use HTML tags <span style="background-color: #XXXXXX"> and </span> and no other tags."

Surveillance blood cx from [ ** 10 - 6 ** ] were negative for 48 hrs , pt was continued on cipro / flagyl but developed leukopenia and flagyl was discontinued ( bc can contribute to leukopenia ) and changed to PO vancomycin . [/INST] Surveillance blood cx from [ ** 10 - 6 ** ] were negative for 48 hrs , pt was continued on <span style="background-co

In [59]:
n2c2_df['mistral_8x7b_instruct_one_shot'] = n2c2_df['html_mistral_8x7b_instruct_one_shot'].apply(mistral_parser_ner)
n2c2_df['mistral_8x7b_instruct_five_shot'] = n2c2_df['html_mistral_8x7b_instruct_five_shot'].apply(mistral_parser_ner)
n2c2_df['mistral_8x7b_instruct_ten_shot'] = n2c2_df['html_mistral_8x7b_instruct_ten_shot'].apply(mistral_parser_ner)
n2c2_df['mistral_8x7b_instruct_twenty_shot'] = n2c2_df['html_mistral_8x7b_instruct_twenty_shot'].apply(mistral_parser_ner)

In [60]:
n2c2_df['gt_labels'], n2c2_df['mistral_8x7b_instruct_one_shot_labels'] = html_parsing_n2c2(n2c2_df, 'html_mistral_8x7b_instruct_one_shot')
_, n2c2_df['mistral_8x7b_instruct_five_shot_labels'] = html_parsing_n2c2(n2c2_df, 'html_mistral_8x7b_instruct_five_shot')
_, n2c2_df['mistral_8x7b_instruct_ten_shot_labels'] = html_parsing_n2c2(n2c2_df, 'html_mistral_8x7b_instruct_ten_shot')
_, n2c2_df['mistral_8x7b_instruct_twenty_shot_labels'] = html_parsing_n2c2(n2c2_df, 'html_mistral_8x7b_instruct_twenty_shot')

In [61]:
print(f"F1 Score One Shot (Strict): {get_macro_average_f1(get_classification_report(n2c2_df, 'gt_labels', 'mistral_8x7b_instruct_one_shot_labels', 'strict'))}")
print(f"F1 Score Five Shot (Strict): {get_macro_average_f1(get_classification_report(n2c2_df, 'gt_labels', 'mistral_8x7b_instruct_five_shot_labels', 'strict'))}")
print(f"F1 Score Ten Shot (Strict): {get_macro_average_f1(get_classification_report(n2c2_df, 'gt_labels', 'mistral_8x7b_instruct_ten_shot_labels', 'strict'))}")
print(f"F1 Score Twenty Shot (Strict): {get_macro_average_f1(get_classification_report(n2c2_df, 'gt_labels', 'mistral_8x7b_instruct_twenty_shot_labels', 'strict'))}")

F1 Score One Shot (Strict): 0.1846268860926377
F1 Score Five Shot (Strict): 0.4106159032435094
F1 Score Ten Shot (Strict): 0.4430549191789419
F1 Score Twenty Shot (Strict): 0.5139976684831742


In [62]:
print(f"F1 Score One Shot (Lenient): {get_macro_average_f1(get_classification_report(n2c2_df, 'gt_labels', 'mistral_8x7b_instruct_one_shot_labels', 'lenient'))}")
print(f"F1 Score Five Shot (Lenient): {get_macro_average_f1(get_classification_report(n2c2_df, 'gt_labels', 'mistral_8x7b_instruct_five_shot_labels', 'lenient'))}")
print(f"F1 Score Ten Shot (Lenient): {get_macro_average_f1(get_classification_report(n2c2_df, 'gt_labels', 'mistral_8x7b_instruct_ten_shot_labels', 'lenient'))}")
print(f"F1 Score Twenty Shot (Lenient): {get_macro_average_f1(get_classification_report(n2c2_df, 'gt_labels', 'mistral_8x7b_instruct_twenty_shot_labels', 'lenient'))}")

F1 Score One Shot (Lenient): 0.2628166607509294
F1 Score Five Shot (Lenient): 0.5236369118314861
F1 Score Ten Shot (Lenient): 0.5525020113944765
F1 Score Twenty Shot (Lenient): 0.6483721627699033


In [63]:
print(f"Average Mistral-8x7B-Instruct one-shot prediction time: {n2c2_df['mistral_8x7b_instruct_one_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct five-shot prediction time: {n2c2_df['mistral_8x7b_instruct_five_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct ten-shot prediction time: {n2c2_df['mistral_8x7b_instruct_ten_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct twenty-shot prediction time: {n2c2_df['mistral_8x7b_instruct_twenty_shot_time'].mean():.2f} seconds")

Average Mistral-8x7B-Instruct one-shot prediction time: 20.44 seconds
Average Mistral-8x7B-Instruct five-shot prediction time: 23.10 seconds
Average Mistral-8x7B-Instruct ten-shot prediction time: 17.44 seconds
Average Mistral-8x7B-Instruct twenty-shot prediction time: 21.32 seconds


In [64]:
# save the inference results
n2c2_df.to_csv('data/NER/2018_n2c2/test_200_mistral_8x7b_instruct_results.csv', index=False)

# 2. RE (Relation Extraction)

## 2.1 2018 n2c2 Dataset

### 2.1.1 Infernece

In [16]:
n2c2_df = pd.read_csv('data/RE/2018_n2c2/test_200.csv')
n2c2_example_df = pd.read_csv('data/RE/2018_n2c2/examples.csv')

In [17]:
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations for a sentence."
"INPUT: the input is a sentence where the entities are labeled within [E${X}] and [E${X}/] in a sentence, where X is an integer representing an unique entity."
"OUTPUT: your task is to select one out of the nine types of relations ('STRENGTH-DRUG', 'ROUTE-DRUG', 'FREQUENCY-DRUG', 'FORM-DRUG', 'DOSAGE-DRUG', 'REASON-DRUG', 'DURATION-DRUG', 'ADE-DRUG', and 'No relation')."
"""
def get_re_2018_n2c2(sentence: str, shot: int = 0) -> str:
    """
    Get the RE results of 2018 n2c2 dataset from few-shot prompting.
    Args:
        sentence: the input sentence
        shot: the number of few-shot examples
    Returns:
        the RE results
    """
    
    messages=[
        {'role': 'system', 'content': system_message}
    ]
    for i in range(shot):
        messages.append({'role': 'user', 'content': n2c2_example_df. iloc[i]['text']}) 
        messages.append({'role': 'assistant', 'content': n2c2_example_df.iloc[i]['labels']})
    messages.append({'role': 'user', 'content': sentence})
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")

    time_start = time.time()
    outputs = model.generate(input_ids, max_new_tokens=2048)
    time_end = time.time()

    return tokenizer.decode(outputs[0], skip_special_tokens=True), time_end - time_start

In [18]:
for i in tqdm(range(0, len(n2c2_df), 1)):
    n2c2_df.loc[i, 'mistral_8x7b_instruct_one_shot'], n2c2_df.loc[i, 'mistral_8x7b_instruct_one_shot_time'] = get_re_2018_n2c2(n2c2_df.loc[i, 'text'], 1)
    n2c2_df.loc[i, 'mistral_8x7b_instruct_five_shot'], n2c2_df.loc[i, 'mistral_8x7b_instruct_five_shot_time'] = get_re_2018_n2c2(n2c2_df.loc[i, 'text'], 5)
    n2c2_df.loc[i, 'mistral_8x7b_instruct_ten_shot'], n2c2_df.loc[i, 'mistral_8x7b_instruct_ten_shot_time'] = get_re_2018_n2c2(n2c2_df.loc[i, 'text'], 10)
    n2c2_df.loc[i, 'mistral_8x7b_instruct_twenty_shot'], n2c2_df.loc[i, 'mistral_8x7b_instruct_twenty_shot_time'] = get_re_2018_n2c2(n2c2_df.loc[i, 'text'], 20)

  0%|          | 0/200 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:

### 2.1.2 Evaluation

In [89]:
label_list = label2digit.keys()
def mistral_parser_re_n2c2(text: str):
    """
    Parse the text generation output.
    """
    # find the last "[/INST] " and use the text after it
    cleaned_text = text.split('[/INST] ')[-1]
    # find the first "\n" and use the text before it
    cleaned_text = cleaned_text.split('\n\n')[0]
    # find the first occurrence of anything in the label_list and remove the text after it
    if cleaned_text.split(' ')[0] in label_list:
        cleaned_text = cleaned_text.split(' ')[0]
    elif cleaned_text.split(' ')[0] == 'No':
        cleaned_text = 'No relation'
    return cleaned_text

In [91]:
print(n2c2_df.iloc[100]['mistral_8x7b_instruct_five_shot'])
print("====================================")
print(mistral_parser_re_n2c2(n2c2_df.iloc[100]['mistral_8x7b_instruct_five_shot']))

[INST] You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations for a sentence."
"INPUT: the input is a sentence where the entities are labeled within [E${X}] and [E${X}/] in a sentence, where X is an integer representing an unique entity."
"OUTPUT: your task is to select one out of the nine types of relations ('STRENGTH-DRUG', 'ROUTE-DRUG', 'FREQUENCY-DRUG', 'FORM-DRUG', 'DOSAGE-DRUG', 'REASON-DRUG', 'DURATION-DRUG', 'ADE-DRUG', and 'No relation')."

Other side effects during [E2] IL/-/2 [E2/] therapy included mild/chills ; development of an erythematous/skin/rash ; [E1] nausea [E1/] , improved with lorazepam ; diarrhea , improved with Lomotil , and fatigue During this week , he developed acute renal failure with a peak creatinine of 7.3 , improved to 1.3 at the time of discharge . [/INST] ADE-DRUG  [INST] Pt was weaned off of noninvasive ventilation on the evening of [ ** 10 - 26 ** ] and has been having an [E2] oxygen [E2/] sat of 100 % on 

In [92]:
n2c2_df['mistral_8x7b_instruct_one_shot'] = n2c2_df['mistral_8x7b_instruct_one_shot'].apply(mistral_parser_re_n2c2)
n2c2_df['mistral_8x7b_instruct_five_shot'] = n2c2_df['mistral_8x7b_instruct_five_shot'].apply(mistral_parser_re_n2c2)
n2c2_df['mistral_8x7b_instruct_ten_shot'] = n2c2_df['mistral_8x7b_instruct_ten_shot'].apply(mistral_parser_re_n2c2)
n2c2_df['mistral_8x7b_instruct_twenty_shot'] = n2c2_df['mistral_8x7b_instruct_twenty_shot'].apply(mistral_parser_re_n2c2)

In [93]:
# get rid of ' ' if any
n2c2_df['mistral_8x7b_instruct_one_shot'] = n2c2_df['mistral_8x7b_instruct_one_shot'].apply(lambda x: x[1:-1] if "'" in x else x)
n2c2_df['mistral_8x7b_instruct_five_shot'] = n2c2_df['mistral_8x7b_instruct_five_shot'].apply(lambda x: x[1:-1] if "'" in x else x)
n2c2_df['mistral_8x7b_instruct_ten_shot'] = n2c2_df['mistral_8x7b_instruct_ten_shot'].apply(lambda x: x[1:-1] if "'" in x else x)
n2c2_df['mistral_8x7b_instruct_twenty_shot'] = n2c2_df['mistral_8x7b_instruct_twenty_shot'].apply(lambda x: x[1:-1] if "'" in x else x)

In [94]:
# get digit label while considering failed LLM outputs as 'No relation'
n2c2_df['labels'] = n2c2_df['labels'].apply(get_digit)
n2c2_df['mistral_8x7b_instruct_one_shot_labels'] = n2c2_df['mistral_8x7b_instruct_one_shot'].apply(get_digit)
n2c2_df['mistral_8x7b_instruct_five_shot_labels'] = n2c2_df['mistral_8x7b_instruct_five_shot'].apply(get_digit)
n2c2_df['mistral_8x7b_instruct_ten_shot_labels'] = n2c2_df['mistral_8x7b_instruct_ten_shot'].apply(get_digit)
n2c2_df['mistral_8x7b_instruct_twenty_shot_labels'] = n2c2_df['mistral_8x7b_instruct_twenty_shot'].apply(get_digit)

In [75]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# n2c2_df = pd.read_csv("data/RE/2018_n2c2/test_200_mistral_8x7b_instruct_results.csv")

In [95]:
y_true = n2c2_df['labels'].tolist()
y_pred = n2c2_df['mistral_8x7b_instruct_one_shot_labels'].tolist()
print(f"F1 Score One Shot: {f1_score(y_true, y_pred, average='macro')}")
y_pred = n2c2_df['mistral_8x7b_instruct_five_shot_labels'].tolist()
print(f"F1 Score Five Shot: {f1_score(y_true, y_pred, average='macro')}")
y_pred = n2c2_df['mistral_8x7b_instruct_ten_shot_labels'].tolist()
print(f"F1 Score Ten Shot: {f1_score(y_true, y_pred, average='macro')}")
y_pred = n2c2_df['mistral_8x7b_instruct_twenty_shot_labels'].tolist()
print(f"F1 Score Twenty Shot: {f1_score(y_true, y_pred, average='macro')}")

F1 Score One Shot: 0.24113132984100727
F1 Score Five Shot: 0.3111334779077486
F1 Score Ten Shot: 0.31373666884835116
F1 Score Twenty Shot: 0.2784438411853081


In [96]:
print(f"Average Mistral-8x7B-Instruct one-shot prediction time: {n2c2_df['mistral_8x7b_instruct_one_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct five-shot prediction time: {n2c2_df['mistral_8x7b_instruct_five_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct ten-shot prediction time: {n2c2_df['mistral_8x7b_instruct_ten_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct twenty-shot prediction time: {n2c2_df['mistral_8x7b_instruct_twenty_shot_time'].mean():.2f} seconds")

Average Mistral-8x7B-Instruct one-shot prediction time: 4.94 seconds
Average Mistral-8x7B-Instruct five-shot prediction time: 3.83 seconds
Average Mistral-8x7B-Instruct ten-shot prediction time: 23.50 seconds
Average Mistral-8x7B-Instruct twenty-shot prediction time: 1.32 seconds


In [97]:
# save the inference results
n2c2_df.to_csv('data/RE/2018_n2c2/test_200_mistral_8x7b_instruct_results.csv', index=False)

## 2.2 GAD

### 2.2.1 Inference

In [20]:
gad_df = pd.read_csv('data/RE/GAD/test_200.csv')
gad_example_df = pd.read_csv('data/RE/GAD/examples.csv')

In [24]:
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations between a disease and a gene for a sentence."
"INPUT: the input is a sentence where the disease is labeled as @DISEASE$ and the gene is labeled as @GENE$ accordingly in a sentence. "
"OUTPUT: your task is to select one out of the two types of relations (0 and 1) for the gene and disease without any explanation or other characters: 
0, no relations 
1, has relations"
"""
def get_re_gad(sentence: str, shot: int = 0) -> str:
    """
    Get the RE results of GAD dataset from few-shot prompting.
    Args:
        sentence: the input sentence
        shot: the number of few-shot examples
    Returns:
        the RE results
    """

    messages=[
        {'role': 'system', 'content': system_message}
    ]
    for i in range(shot):
        messages.append({'role': 'user', 'content': gad_example_df. iloc[i]['text']}) 
        messages.append({'role': 'assistant', 'content': str(gad_example_df.iloc[i]['labels'])})
    messages.append({'role': 'user', 'content': sentence})
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")

    time_start = time.time()
    outputs = model.generate(input_ids, max_new_tokens=2048)
    time_end = time.time()

    return tokenizer.decode(outputs[0], skip_special_tokens=True), time_end - time_start

In [25]:
for i in tqdm(range(0, len(gad_df), 1)):
    gad_df.loc[i, 'mistral_8x7b_instruct_one_shot'], gad_df.loc[i, 'mistral_8x7b_instruct_one_shot_time'] = get_re_gad(gad_df.iloc[i]['text'], 1)
    gad_df.loc[i, 'mistral_8x7b_instruct_five_shot'], gad_df.loc[i, 'mistral_8x7b_instruct_five_shot_time'] = get_re_gad(gad_df.iloc[i]['text'], 5)
    gad_df.loc[i, 'mistral_8x7b_instruct_ten_shot'], gad_df.loc[i, 'mistral_8x7b_instruct_ten_shot_time'] = get_re_gad(gad_df.iloc[i]['text'], 10)
    gad_df.loc[i, 'mistral_8x7b_instruct_twenty_shot'], gad_df.loc[i, 'mistral_8x7b_instruct_twenty_shot_time'] = get_re_gad(gad_df.iloc[i]['text'], 20)

  0%|          | 0/200 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:

### 2.2.2 Evaluation

In [105]:
def mistral_parser_re_gad(text: str):
    """
    Parse the text generation output.
    """
    # find the last "[/INST] " and use the text after it
    cleaned_text = text.split('[/INST] ')[-1]
    # find the first "\n" and use the text before it
    cleaned_text = cleaned_text.split('\n')[0]
    return cleaned_text

In [108]:
print(gad_df.iloc[2]['mistral_8x7b_instruct_ten_shot'])
print("====================================")
print(mistral_parser_re_gad(gad_df.iloc[2]['mistral_8x7b_instruct_ten_shot']))

[INST] You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations between a disease and a gene for a sentence."
"INPUT: the input is a sentence where the disease is labeled as @DISEASE$ and the gene is labeled as @GENE$ accordingly in a sentence. "
"OUTPUT: your task is to select one out of the two types of relations (0 and 1) for the gene and disease without any explanation or other characters: 
0, no relations 
1, has relations"

Our results support the hypothesis that @GENE$ genotype affects etiology and outcome of a variety of childhood @DISEASE$. [/INST] 0  [INST] genomic variations of @GENE$ are not likely to be substantially involved in the etiology of @DISEASE$. [/INST] 1  [INST] These data indicate that TGF-beta 1 and @GENE$ genes are not loci influencing @DISEASE$ susceptibility, either RR/SPMS or PPMS, in this population. [/INST] 0  [INST] Our results through Meta-analysis did not support the association between @GENE$ null genotype a

In [109]:
gad_df['mistral_8x7b_instruct_one_shot'] = gad_df['mistral_8x7b_instruct_one_shot'].apply(mistral_parser_re_gad)
gad_df['mistral_8x7b_instruct_five_shot'] = gad_df['mistral_8x7b_instruct_five_shot'].apply(mistral_parser_re_gad)
gad_df['mistral_8x7b_instruct_ten_shot'] = gad_df['mistral_8x7b_instruct_ten_shot'].apply(mistral_parser_re_gad)
gad_df['mistral_8x7b_instruct_twenty_shot'] = gad_df['mistral_8x7b_instruct_twenty_shot'].apply(mistral_parser_re_gad)

In [110]:
# convert some strings to int while considering failed LLM outputs as 'No relation (0)'
gad_df['mistral_8x7b_instruct_one_shot'] = gad_df['mistral_8x7b_instruct_one_shot'].apply(lambda x: int(x) if x.isdigit() else 0)
gad_df['mistral_8x7b_instruct_five_shot'] = gad_df['mistral_8x7b_instruct_five_shot'].apply(lambda x: int(x) if x.isdigit() else 0)
gad_df['mistral_8x7b_instruct_ten_shot'] = gad_df['mistral_8x7b_instruct_ten_shot'].apply(lambda x: int(x) if x.isdigit() else 0)
gad_df['mistral_8x7b_instruct_twenty_shot'] = gad_df['mistral_8x7b_instruct_twenty_shot'].apply(lambda x: int(x) if x.isdigit() else 0)

In [100]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# gad_df = pd.read_csv("data/RE/GAD/test_200_mistral_8x7b_instruct_results.csv")

In [111]:
y_true = gad_df['labels'].tolist()
y_pred = gad_df['mistral_8x7b_instruct_one_shot'].tolist()
print(f"F1 Score One Shot: {f1_score(y_true, y_pred, average='macro')}")
y_pred = gad_df['mistral_8x7b_instruct_five_shot'].tolist()
print(f"F1 Score Five Shot: {f1_score(y_true, y_pred, average='macro')}")
y_pred = gad_df['mistral_8x7b_instruct_ten_shot'].tolist()
print(f"F1 Score Ten Shot: {f1_score(y_true, y_pred, average='macro')}")
y_pred = gad_df['mistral_8x7b_instruct_twenty_shot'].tolist()
print(f"F1 Score Twenty Shot: {f1_score(y_true, y_pred, average='macro')}")

F1 Score One Shot: 0.4590358704719533
F1 Score Five Shot: 0.44431929113163626
F1 Score Ten Shot: 0.3869455463397043
F1 Score Twenty Shot: 0.3288590604026846


In [112]:
print(f"Average Mistral-8x7B-Instruct one-shot prediction time: {gad_df['mistral_8x7b_instruct_one_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct five-shot prediction time: {gad_df['mistral_8x7b_instruct_five_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct ten-shot prediction time: {gad_df['mistral_8x7b_instruct_ten_shot_time'].mean():.2f} seconds")
print(f"Average Mistral-8x7B-Instruct twenty-shot prediction time: {gad_df['mistral_8x7b_instruct_twenty_shot_time'].mean():.2f} seconds")

Average Mistral-8x7B-Instruct one-shot prediction time: 1.52 seconds
Average Mistral-8x7B-Instruct five-shot prediction time: 4.07 seconds
Average Mistral-8x7B-Instruct ten-shot prediction time: 5.57 seconds
Average Mistral-8x7B-Instruct twenty-shot prediction time: 1.25 seconds


In [113]:
# save the inference results
gad_df.to_csv('data/RE/GAD/test_200_mistral_8x7b_instruct_results.csv', index=False)