In [76]:
import os
import time
import openai
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into (and committed with) the notebook. The original placeholder is
# kept as the fallback so the cell still runs when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-xxxxxxxxxxxxxxxxxx")

import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score
from utils import html_parsing_ncbi, html_parsing_n2c2, get_classification_report, get_digit, get_macro_average_f1

# 1. NER (Named Entity Recognition)

## 1.1 NCBI-Disease Dataset

### 1.1.1 Inference

In [2]:
# NCBI-disease NER: sentences to annotate, and the few-shot examples used in the prompt.
ncbi_df = pd.read_csv(filepath_or_buffer='data/NER/NCBI-disease/test_200.csv')
ncbi_example_df = pd.read_csv(filepath_or_buffer='data/NER/NCBI-disease/examples.csv')

In [3]:
# System prompt for NCBI-disease NER. The text is kept exactly as it was.
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence. The highlighting should only use HTML tags <span style=\"background-color: #FFFF00\"> and </span> and no other tags."
"""
# Later cells reassign the global `system_message` for other tasks. Keep a
# private copy so this function always sends the NCBI prompt, whatever order
# the notebook cells were executed in.
_NCBI_NER_SYSTEM_MESSAGE = system_message


def get_ner_ncbi_disease(sentence: str, gpt4: bool = False, shot: int = 0) -> "tuple[str, float]":
    """
    Ask GPT-3.5 or GPT-4 to highlight the disease entities of one sentence.

    The request consists of the NCBI system prompt, the first `shot` rows of
    `ncbi_example_df` (column 'text' as the user turn, 'label_text' as the
    assistant turn), and finally `sentence` as the last user turn.

    Args:
        sentence: the sentence to annotate
        gpt4: use "gpt-4-1106-preview" when True, "gpt-3.5-turbo-1106" otherwise
        shot: number of few-shot examples taken from the top of `ncbi_example_df`

    Returns:
        (content, seconds): the text of the first completion choice and the
        wall-clock duration of the successful API call.

    Raises:
        SystemExit: when all 10 attempts raised an exception.
    """
    messages = [{"role": "system", "content": _NCBI_NER_SYSTEM_MESSAGE}]
    for i in range(shot):
        example = ncbi_example_df.iloc[i]
        messages.append({"role": "user", "content": example['text']})
        messages.append({"role": "assistant", "content": example['label_text']})
    messages.append({"role": "user", "content": sentence})

    gpt = "gpt-4-1106-preview" if gpt4 else "gpt-3.5-turbo-1106"

    retries = 10  # total number of attempts
    while retries > 0:
        try:
            time_start = time.time()
            response = openai.ChatCompletion.create(
                model=gpt,
                messages=messages,
                temperature=0.0,  # deterministic
                request_timeout=60,
                max_tokens=4096,
                n=1,
                seed=42,
                top_p=0.95,
            )
            time_end = time.time()
            return response['choices'][0]['message']['content'], time_end - time_start
        except Exception as e:
            # Decrement first so the message reports the attempts that really remain.
            retries -= 1
            print(f"Exception: {e}")
            print(f"Retrying... {retries} retries left")
            if retries > 0:
                # No point in waiting when we are about to give up anyway.
                time.sleep(30)

    raise SystemExit("Max retries exceeded, exiting program")

In [None]:
# For every sentence, query each (shot count, model) combination and store the
# returned HTML plus the request duration in per-setting columns.
# Request order per sentence: one/five/ten/twenty shot, GPT-3.5 before GPT-4.
ncbi_shot_settings = [('one', 1), ('five', 5), ('ten', 10), ('twenty', 20)]
ncbi_model_settings = [('gpt3.5', False), ('gpt4', True)]
for i in tqdm(range(0, len(ncbi_df), 1)):
    for shot_tag, n_examples in ncbi_shot_settings:
        for model_tag, use_gpt4 in ncbi_model_settings:
            html, elapsed = get_ner_ncbi_disease(ncbi_df.iloc[i]['text'], gpt4=use_gpt4, shot=n_examples)
            ncbi_df.loc[i, f'html_{model_tag}_{shot_tag}_shot'] = html
            ncbi_df.loc[i, f'{model_tag}_{shot_tag}_shot_time'] = elapsed

In [6]:
# Drop the row labelled 89: Gemini could not predict it because of its safety
# filter (presumably removed here so the GPT and Gemini test sets stay aligned).
# errors='ignore' makes the cell safe to re-run: the original in-place drop
# raised KeyError on a second execution because label 89 was already gone.
ncbi_df = ncbi_df.drop(index=[89], errors='ignore')

### 1.1.2 Evaluation

In [None]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# ncbi_df = pd.read_csv("data/NER/NCBI-disease/test_200_gpt_results.csv")

In [7]:
# html_parsing_ncbi returns two values per call. The first one is stored as
# 'gt_labels' from the first call and discarded afterwards; the second one is
# stored as the '<setting>_labels' column of the parsed HTML column.
ncbi_df['gt_labels'], ncbi_df['gpt3.5_one_shot_labels'] = html_parsing_ncbi(ncbi_df, 'html_gpt3.5_one_shot')
ncbi_remaining_settings = [
    'gpt4_one_shot',
    'gpt3.5_five_shot', 'gpt4_five_shot',
    'gpt3.5_ten_shot', 'gpt4_ten_shot',
    'gpt3.5_twenty_shot', 'gpt4_twenty_shot',
]
for setting in ncbi_remaining_settings:
    _, ncbi_df[f'{setting}_labels'] = html_parsing_ncbi(ncbi_df, f'html_{setting}')

In [10]:
# F1 from get_classification_report in 'strict' mode, one line per setting
# (shot count outer, GPT-3.5 before GPT-4).
for shot in ('one', 'five', 'ten', 'twenty'):
    for model_title, model_tag in (('GPT 3.5', 'gpt3.5'), ('GPT 4', 'gpt4')):
        report = get_classification_report(ncbi_df, 'gt_labels', f'{model_tag}_{shot}_shot_labels', 'strict')
        print(f"F1 Score {shot.capitalize()} Shot {model_title} (Strict): {report['default']['f1-score']}")

F1 Score One Shot GPT 3.5 (Strict): 0.576271186440678
F1 Score One Shot GPT 4 (Strict): 0.6251691474966171
F1 Score Five Shot GPT 3.5 (Strict): 0.4444444444444445
F1 Score Five Shot GPT 4 (Strict): 0.6582278481012659
F1 Score Ten Shot GPT 3.5 (Strict): 0.40259740259740256
F1 Score Ten Shot GPT 4 (Strict): 0.7035830618892509
F1 Score Twenty Shot GPT 3.5 (Strict): 0.4507042253521127
F1 Score Twenty Shot GPT 4 (Strict): 0.7229299363057323


In [11]:
# F1 from get_classification_report in 'lenient' mode, one line per setting
# (shot count outer, GPT-3.5 before GPT-4).
for shot in ('one', 'five', 'ten', 'twenty'):
    for model_title, model_tag in (('GPT 3.5', 'gpt3.5'), ('GPT 4', 'gpt4')):
        report = get_classification_report(ncbi_df, 'gt_labels', f'{model_tag}_{shot}_shot_labels', 'lenient')
        print(f"F1 Score {shot.capitalize()} Shot {model_title} (Lenient): {report['default']['f1-score']}")

F1 Score One Shot GPT 3.5 (Lenient): 0.7196870925684485
F1 Score One Shot GPT 4 (Lenient): 0.814614343707713
F1 Score Five Shot GPT 3.5 (Lenient): 0.5925925925925926
F1 Score Five Shot GPT 4 (Lenient): 0.8164556962025317
F1 Score Ten Shot GPT 3.5 (Lenient): 0.5584415584415584
F1 Score Ten Shot GPT 4 (Lenient): 0.8208469055374592
F1 Score Twenty Shot GPT 3.5 (Lenient): 0.5492957746478873
F1 Score Twenty Shot GPT 4 (Lenient): 0.8343949044585988


In [13]:
# Mean request duration per setting, taken from the '<model>_<shot>_shot_time' columns.
for shot in ('one', 'five', 'ten', 'twenty'):
    for model_title, model_tag in (('GPT-3.5', 'gpt3.5'), ('GPT-4', 'gpt4')):
        mean_seconds = ncbi_df[f'{model_tag}_{shot}_shot_time'].mean()
        print(f"Average {model_title} {shot}-shot prediction time: {mean_seconds:.2f} seconds")

Average GPT-3.5 one-shot prediction time: 2.46 seconds
Average GPT-4 one-shot prediction time: 4.80 seconds
Average GPT-3.5 five-shot prediction time: 2.26 seconds
Average GPT-4 five-shot prediction time: 4.15 seconds
Average GPT-3.5 ten-shot prediction time: 2.49 seconds
Average GPT-4 ten-shot prediction time: 4.08 seconds
Average GPT-3.5 twenty-shot prediction time: 2.47 seconds
Average GPT-4 twenty-shot prediction time: 4.29 seconds


In [14]:
# Persist the NCBI-disease predictions, timings and parsed labels (row index not written).
ncbi_df.to_csv(path_or_buf='data/NER/NCBI-disease/test_200_gpt_results.csv', index=False)

## 1.2 2018 n2c2 Dataset

### 1.2.1 Inference

In [15]:
# 2018 n2c2 NER: sentences to annotate, and the few-shot examples used in the prompt.
n2c2_df = pd.read_csv(filepath_or_buffer='data/NER/2018_n2c2/test_200.csv')
n2c2_example_df = pd.read_csv(filepath_or_buffer='data/NER/2018_n2c2/examples.csv')

In [16]:
# System prompt for 2018 n2c2 NER. The text is kept exactly as it was.
# NOTE(review): the prompt says "disease entities" although the listed entity
# types are medication attributes, and the OUTPUT line lacks its closing quote.
# Left untouched because the recorded results were presumably produced with
# this exact wording - confirm before changing it.
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence. The entity type includes Form, Route, Frequency, Dosage, Strength, Duration, Reason, Ade, Drug."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence in different colors: Form(#FF0000), Route(#FFA500), Frequency(#FFFF00), Dosage(#00FF00), Strength(#0000FF), Duration(#800080), Reason(#FFC0CB), Ade(#964B00), Drug(#808080) in hex code. The highlighting should only use HTML tags <span style=\"background-color: #XXXXXX\"> and </span> and no other tags.
"""
# Other cells reassign the global `system_message` for other tasks. Keep a
# private copy so this function always sends the n2c2 NER prompt, whatever
# order the notebook cells were executed in.
_N2C2_NER_SYSTEM_MESSAGE = system_message


def get_ner_2018_n2c2(sentence: str, gpt4: bool = False, shot: int = 0) -> "tuple[str, float]":
    """
    Ask GPT-3.5 or GPT-4 to colour-highlight the entities of one 2018 n2c2 sentence.

    The request consists of the n2c2 NER system prompt, the first `shot` rows
    of `n2c2_example_df` (column 'text' as the user turn, 'label_text' as the
    assistant turn), and finally `sentence` as the last user turn.

    Args:
        sentence: the sentence to annotate
        gpt4: use "gpt-4-1106-preview" when True, "gpt-3.5-turbo-1106" otherwise
        shot: number of few-shot examples taken from the top of `n2c2_example_df`

    Returns:
        (content, seconds): the text of the first completion choice and the
        wall-clock duration of the successful API call.

    Raises:
        SystemExit: when all 10 attempts raised an exception.
    """
    messages = [{"role": "system", "content": _N2C2_NER_SYSTEM_MESSAGE}]
    for i in range(shot):
        example = n2c2_example_df.iloc[i]
        messages.append({"role": "user", "content": example['text']})
        messages.append({"role": "assistant", "content": example['label_text']})
    messages.append({"role": "user", "content": sentence})

    gpt = "gpt-4-1106-preview" if gpt4 else "gpt-3.5-turbo-1106"

    retries = 10  # total number of attempts
    while retries > 0:
        try:
            time_start = time.time()
            response = openai.ChatCompletion.create(
                model=gpt,
                messages=messages,
                temperature=0.0,  # deterministic
                request_timeout=60,
                max_tokens=4096,
                n=1,
                seed=42,
                top_p=0.95,
            )
            time_end = time.time()
            return response['choices'][0]['message']['content'], time_end - time_start
        except Exception as e:
            # Decrement first so the message reports the attempts that really remain.
            retries -= 1
            print(f"Exception: {e}")
            print(f"Retrying... {retries} retries left")
            if retries > 0:
                # No point in waiting when we are about to give up anyway.
                time.sleep(30)

    raise SystemExit("Max retries exceeded, exiting program")

In [None]:
# For every sentence, query each (shot count, model) combination and store the
# returned HTML plus the request duration in per-setting columns.
# Request order per sentence: one/five/ten/twenty shot, GPT-3.5 before GPT-4.
n2c2_ner_shot_settings = [('one', 1), ('five', 5), ('ten', 10), ('twenty', 20)]
n2c2_ner_model_settings = [('gpt3.5', False), ('gpt4', True)]
for i in tqdm(range(0, len(n2c2_df), 1)):
    for shot_tag, n_examples in n2c2_ner_shot_settings:
        for model_tag, use_gpt4 in n2c2_ner_model_settings:
            html, elapsed = get_ner_2018_n2c2(n2c2_df.iloc[i]['text'], gpt4=use_gpt4, shot=n_examples)
            n2c2_df.loc[i, f'html_{model_tag}_{shot_tag}_shot'] = html
            n2c2_df.loc[i, f'{model_tag}_{shot_tag}_shot_time'] = elapsed

### 1.2.2 Evaluation

In [61]:
# Shortcut: read the previously saved LLM outputs from CSV rather than running
# the inference cells above. This replaces whatever n2c2_df held before.
n2c2_df = pd.read_csv(filepath_or_buffer="data/NER/2018_n2c2/test_200_gpt_results.csv")

In [64]:
# html_parsing_n2c2 returns two values per call. The first one is stored as
# 'gt_labels' from the first call and discarded afterwards; the second one is
# stored as the '<setting>_labels' column of the parsed HTML column.
n2c2_df['gt_labels'], n2c2_df['gpt3.5_one_shot_labels'] = html_parsing_n2c2(n2c2_df, 'html_gpt3.5_one_shot')
n2c2_remaining_settings = [
    'gpt4_one_shot',
    'gpt3.5_five_shot', 'gpt4_five_shot',
    'gpt3.5_ten_shot', 'gpt4_ten_shot',
    'gpt3.5_twenty_shot', 'gpt4_twenty_shot',
]
for setting in n2c2_remaining_settings:
    _, n2c2_df[f'{setting}_labels'] = html_parsing_n2c2(n2c2_df, f'html_{setting}')

In [71]:
# GPT-3.5 macro-average F1 (get_macro_average_f1 over the classification
# report): all 'strict' scores first, then all 'lenient' scores.
for mode in ('strict', 'lenient'):
    for shot in ('one', 'five', 'ten', 'twenty'):
        report = get_classification_report(n2c2_df, 'gt_labels', f'gpt3.5_{shot}_shot_labels', mode)
        print(f"F1 Score {shot.capitalize()} Shot GPT 3.5 ({mode.capitalize()}): {get_macro_average_f1(report)}")

F1 Score One Shot GPT 3.5 (Strict): 0.29077647291119374
F1 Score Five Shot GPT 3.5 (Strict): 0.4838384617950764
F1 Score Ten Shot GPT 3.5 (Strict): 0.5162000201626022
F1 Score Twenty Shot GPT 3.5 (Strict): 0.5650956959965368
F1 Score One Shot GPT 3.5 (Lenient): 0.42701067618461136
F1 Score Five Shot GPT 3.5 (Lenient): 0.6197381788075881
F1 Score Ten Shot GPT 3.5 (Lenient): 0.6435182915671569
F1 Score Twenty Shot GPT 3.5 (Lenient): 0.7052064310047981


In [72]:
# GPT-4 macro-average F1 in 'strict' mode.
# The printed label used to read "(Scrit)"; it now says "(Strict)", matching
# the GPT-3.5 lines.
for shot in ('one', 'five', 'ten', 'twenty'):
    report = get_classification_report(n2c2_df, 'gt_labels', f'gpt4_{shot}_shot_labels', 'strict')
    print(f"F1 Score {shot.capitalize()} Shot GPT 4 (Strict): {get_macro_average_f1(report)}")

F1 Score One Shot GPT 4 (Scrit): 0.4750154328666599
F1 Score Five Shot GPT 4 (Scrit): 0.5816366473953379
F1 Score Ten Shot GPT 4 (Scrit): 0.5898720406929092
F1 Score Twenty Shot GPT 4 (Scrit): 0.616919788140017


In [66]:
# GPT-4 macro-average F1 in 'lenient' mode.
# The ten-shot label used to read "GPT 4 Lenient)" (missing parenthesis); all
# four lines now use the same "(Lenient)" format.
for shot in ('one', 'five', 'ten', 'twenty'):
    report = get_classification_report(n2c2_df, 'gt_labels', f'gpt4_{shot}_shot_labels', 'lenient')
    print(f"F1 Score {shot.capitalize()} Shot GPT 4 (Lenient): {get_macro_average_f1(report)}")

F1 Score One Shot GPT 4 (Lenient): 0.6770438979451868
F1 Score Five Shot GPT 4 (Lenient): 0.7500036015064712
F1 Score Ten Shot GPT 4 Lenient): 0.7518670422875415
F1 Score Twenty Shot GPT 4 (Lenient): 0.7497963260089207


In [23]:
# Mean request duration per setting, taken from the '<model>_<shot>_shot_time' columns.
for shot in ('one', 'five', 'ten', 'twenty'):
    for model_title, model_tag in (('GPT-3.5', 'gpt3.5'), ('GPT-4', 'gpt4')):
        mean_seconds = n2c2_df[f'{model_tag}_{shot}_shot_time'].mean()
        print(f"Average {model_title} {shot}-shot prediction time: {mean_seconds:.2f} seconds")

Average GPT-3.5 one-shot prediction time: 4.74 seconds
Average GPT-4 one-shot prediction time: 7.91 seconds
Average GPT-3.5 five-shot prediction time: 4.30 seconds
Average GPT-4 five-shot prediction time: 8.22 seconds
Average GPT-3.5 ten-shot prediction time: 4.58 seconds
Average GPT-4 ten-shot prediction time: 8.42 seconds
Average GPT-3.5 twenty-shot prediction time: 4.70 seconds
Average GPT-4 twenty-shot prediction time: 8.42 seconds


In [24]:
# Persist the 2018 n2c2 NER results (row index not written).
n2c2_df.to_csv(path_or_buf='data/NER/2018_n2c2/test_200_gpt_results.csv', index=False)

# 2. RE (Relation Extraction)

## 2.1 2018 n2c2 Dataset

### 2.1.1 Inference

In [25]:
# 2018 n2c2 relation extraction: test sentences and few-shot examples.
# Note: this rebinds n2c2_df / n2c2_example_df, which held the NER data until here.
n2c2_df = pd.read_csv(filepath_or_buffer='data/RE/2018_n2c2/test_200.csv')
n2c2_example_df = pd.read_csv(filepath_or_buffer='data/RE/2018_n2c2/examples.csv')

In [26]:
# System prompt for 2018 n2c2 relation extraction. The text is kept exactly as it was.
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations for a sentence."
"INPUT: the input is a sentence where the entities are labeled within [E${X}] and [E${X}/] in a sentence, where X is an integer representing an unique entity."
"OUTPUT: your task is to select one out of the nine types of relations ('STRENGTH-DRUG', 'ROUTE-DRUG', 'FREQUENCY-DRUG', 'FORM-DRUG', 'DOSAGE-DRUG', 'REASON-DRUG', 'DURATION-DRUG', 'ADE-DRUG', and 'No relation')."
"""
# Other cells reassign the global `system_message` for other tasks. Keep a
# private copy so this function always sends the n2c2 RE prompt, whatever
# order the notebook cells were executed in.
_N2C2_RE_SYSTEM_MESSAGE = system_message


def get_re_2018_n2c2(sentence: str, gpt4: bool = False, shot: int = 0) -> "tuple[str, float]":
    """
    Ask GPT-3.5 or GPT-4 for the relation type of one 2018 n2c2 sentence.

    The request consists of the n2c2 RE system prompt, the first `shot` rows
    of `n2c2_example_df` (column 'text' as the user turn, 'labels' as the
    assistant turn), and finally `sentence` as the last user turn.

    Args:
        sentence: the sentence to classify
        gpt4: use "gpt-4-1106-preview" when True, "gpt-3.5-turbo-1106" otherwise
        shot: number of few-shot examples taken from the top of `n2c2_example_df`

    Returns:
        (content, seconds): the text of the first completion choice and the
        wall-clock duration of the successful API call.

    Raises:
        SystemExit: when all 10 attempts raised an exception.
    """
    messages = [{"role": "system", "content": _N2C2_RE_SYSTEM_MESSAGE}]
    for i in range(shot):
        example = n2c2_example_df.iloc[i]
        messages.append({"role": "user", "content": example['text']})
        messages.append({"role": "assistant", "content": example['labels']})
    messages.append({"role": "user", "content": sentence})

    gpt = "gpt-4-1106-preview" if gpt4 else "gpt-3.5-turbo-1106"

    retries = 10  # total number of attempts
    while retries > 0:
        try:
            time_start = time.time()
            response = openai.ChatCompletion.create(
                model=gpt,
                messages=messages,
                temperature=0.0,  # deterministic
                request_timeout=60,
                max_tokens=4096,
                n=1,
                seed=42,
                top_p=0.95,
            )
            time_end = time.time()
            return response['choices'][0]['message']['content'], time_end - time_start
        except Exception as e:
            # `Exception` instead of a bare `except:` so that interrupting the
            # kernel (KeyboardInterrupt) stops the run instead of being retried;
            # the error is printed like in the NER helpers.
            retries -= 1
            print(f"Exception: {e}")
            print(f"Retrying... {retries} retries left")
            if retries > 0:
                # No point in waiting when we are about to give up anyway.
                time.sleep(30)

    raise SystemExit("Max retries exceeded, exiting program")

In [None]:
# For every sentence, query each (shot count, model) combination and store the
# predicted relation plus the request duration in per-setting columns.
# Request order per sentence: one/five/ten/twenty shot, GPT-3.5 before GPT-4.
n2c2_re_shot_settings = [('one', 1), ('five', 5), ('ten', 10), ('twenty', 20)]
n2c2_re_model_settings = [('gpt3.5', False), ('gpt4', True)]
for i in tqdm(range(0, len(n2c2_df), 1)):
    for shot_tag, n_examples in n2c2_re_shot_settings:
        for model_tag, use_gpt4 in n2c2_re_model_settings:
            relation, elapsed = get_re_2018_n2c2(n2c2_df.iloc[i]['text'], gpt4=use_gpt4, shot=n_examples)
            n2c2_df.loc[i, f'{model_tag}_{shot_tag}_shot'] = relation
            n2c2_df.loc[i, f'{model_tag}_{shot_tag}_shot_time'] = elapsed

### 2.1.2 Evaluation

In [29]:
def _strip_wrapping_quotes(answer):
    """
    Remove one pair of single quotes that wraps the whole answer.

    The previous check (`"'" in x`) fired on an apostrophe anywhere in the
    text and then cut off the first and last character even when they were
    not quotes. It also raised TypeError for non-string cells such as NaN.
    Anything that is not a fully quoted string is returned unchanged.
    """
    if isinstance(answer, str) and len(answer) >= 2 and answer[0] == "'" and answer[-1] == "'":
        return answer[1:-1]
    return answer


# Unquote answers such as "'ADE-DRUG'" in every prediction column.
for shot_tag in ('one', 'five', 'ten', 'twenty'):
    for model_tag in ('gpt3.5', 'gpt4'):
        pred_col = f'{model_tag}_{shot_tag}_shot'
        n2c2_df[pred_col] = n2c2_df[pred_col].apply(_strip_wrapping_quotes)

In [40]:
# Map the gold labels and every prediction column through get_digit; per the
# original note, failed LLM outputs are counted as 'No relation'.
# 'labels' is overwritten in place; predictions go to new '<setting>_labels' columns.
n2c2_df['labels'] = n2c2_df['labels'].apply(get_digit)
for shot_tag in ('one', 'five', 'ten', 'twenty'):
    for model_tag in ('gpt3.5', 'gpt4'):
        source_col = f'{model_tag}_{shot_tag}_shot'
        n2c2_df[f'{source_col}_labels'] = n2c2_df[source_col].apply(get_digit)

In [74]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# n2c2_df = pd.read_csv("data/RE/2018_n2c2/test_200_gpt_results.csv")

In [60]:
# Macro-averaged F1 of each prediction column against the gold labels:
# all GPT-3.5 settings first, then all GPT-4 settings.
y_true = n2c2_df['labels'].tolist()
for model_title, model_tag in (('GPT 3.5', 'gpt3.5'), ('GPT 4', 'gpt4')):
    for shot in ('one', 'five', 'ten', 'twenty'):
        y_pred = n2c2_df[f'{model_tag}_{shot}_shot_labels'].tolist()
        print(f"F1 Score {model_title} {shot.capitalize()} Shot: {f1_score(y_true, y_pred, average='macro')}")

F1 Score GPT 3.5 One Shot: 0.1647954574783843
F1 Score GPT 3.5 Five Shot: 0.16888155123449242
F1 Score GPT 3.5 Ten Shot: 0.22564148687169894
F1 Score GPT 3.5 Twenty Shot: 0.2905089036809467
F1 Score GPT 4 One Shot: 0.6069838797527586
F1 Score GPT 4 Five Shot: 0.7454425063120715
F1 Score GPT 4 Ten Shot: 0.8522045130740783
F1 Score GPT 4 Twenty Shot: 0.8821989810361903


In [42]:
# Mean request duration per setting, taken from the '<model>_<shot>_shot_time' columns.
for shot in ('one', 'five', 'ten', 'twenty'):
    for model_title, model_tag in (('GPT-3.5', 'gpt3.5'), ('GPT-4', 'gpt4')):
        mean_seconds = n2c2_df[f'{model_tag}_{shot}_shot_time'].mean()
        print(f"Average {model_title} {shot}-shot prediction time: {mean_seconds:.2f} seconds")

Average GPT-3.5 one-shot prediction time: 3.15 seconds
Average GPT-4 one-shot prediction time: 1.28 seconds
Average GPT-3.5 five-shot prediction time: 2.50 seconds
Average GPT-4 five-shot prediction time: 0.81 seconds
Average GPT-3.5 ten-shot prediction time: 3.24 seconds
Average GPT-4 ten-shot prediction time: 0.75 seconds
Average GPT-3.5 twenty-shot prediction time: 3.10 seconds
Average GPT-4 twenty-shot prediction time: 0.90 seconds


In [43]:
# Persist the 2018 n2c2 relation-extraction results (row index not written).
n2c2_df.to_csv(path_or_buf='data/RE/2018_n2c2/test_200_gpt_results.csv', index=False)

## 2.2 GAD

### 2.2.1 Inference

In [44]:
# GAD relation extraction: test sentences and few-shot examples.
gad_df = pd.read_csv(filepath_or_buffer='data/RE/GAD/test_200.csv')
gad_example_df = pd.read_csv(filepath_or_buffer='data/RE/GAD/examples.csv')

In [77]:
# System prompt for GAD relation extraction. The text is kept exactly as it was.
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations between a disease and a gene for a sentence."
"INPUT: the input is a sentence where the disease is labeled as @DISEASE$ and the gene is labeled as @GENE$ accordingly in a sentence. "
"OUTPUT: your task is to select one out of the two types of relations (0 and 1) for the gene and disease without any explanation or other characters:
0, no relations
1, has relations"
"""
# Other cells reassign the global `system_message` for other tasks. Keep a
# private copy so this function always sends the GAD prompt, whatever order
# the notebook cells were executed in.
_GAD_RE_SYSTEM_MESSAGE = system_message


def get_re_gad(sentence: str, gpt4: bool = False, shot: int = 0) -> "tuple[str, float]":
    """
    Ask GPT-3.5 or GPT-4 whether one GAD sentence expresses a gene-disease relation.

    The request consists of the GAD system prompt, the first `shot` rows of
    `gad_example_df` (column 'text' as the user turn, str() of 'labels' as the
    assistant turn), and finally `sentence` as the last user turn.

    Args:
        sentence: the sentence to classify
        gpt4: use "gpt-4-1106-preview" when True, "gpt-3.5-turbo-1106" otherwise
        shot: number of few-shot examples taken from the top of `gad_example_df`

    Returns:
        (content, seconds): the text of the first completion choice and the
        wall-clock duration of the successful API call.

    Raises:
        SystemExit: when all 10 attempts raised an exception.
    """
    messages = [{"role": "system", "content": _GAD_RE_SYSTEM_MESSAGE}]
    for i in range(shot):
        example = gad_example_df.iloc[i]
        messages.append({"role": "user", "content": example['text']})
        # Message content must be text, hence the str() around the label.
        messages.append({"role": "assistant", "content": str(example['labels'])})
    messages.append({"role": "user", "content": sentence})

    gpt = "gpt-4-1106-preview" if gpt4 else "gpt-3.5-turbo-1106"

    retries = 10  # total number of attempts
    while retries > 0:
        try:
            time_start = time.time()
            response = openai.ChatCompletion.create(
                model=gpt,
                messages=messages,
                temperature=0.0,  # deterministic
                request_timeout=60,
                max_tokens=4096,
                n=1,
                seed=42,
                top_p=0.95,
            )
            time_end = time.time()
            return response['choices'][0]['message']['content'], time_end - time_start
        except Exception as e:
            # `Exception` instead of a bare `except:` so that interrupting the
            # kernel (KeyboardInterrupt) stops the run instead of being retried;
            # the error is printed like in the NER helpers.
            retries -= 1
            print(f"Exception: {e}")
            print(f"Retrying... {retries} retries left")
            if retries > 0:
                # No point in waiting when we are about to give up anyway.
                time.sleep(30)

    raise SystemExit("Max retries exceeded, exiting program")

In [None]:
# Run every (shot count, model) combination for every GAD sentence.
#
# The loop used to start at the hard-coded row 86, a leftover from resuming an
# interrupted run: on a fresh kernel rows 0-85 were never predicted and the
# label-conversion cell below then failed on the missing values. It now always
# starts at row 0 and skips cells that already hold a prediction, so an
# interrupted run can be resumed just by re-running this cell. To force a
# re-query, drop the corresponding columns first.
gad_shot_settings = [('one', 1), ('five', 5), ('ten', 10), ('twenty', 20)]
gad_model_settings = [('gpt3.5', False), ('gpt4', True)]
for i in tqdm(range(len(gad_df))):
    sentence = gad_df.iloc[i]['text']
    for shot_tag, n_examples in gad_shot_settings:
        for model_tag, use_gpt4 in gad_model_settings:
            pred_col = f'{model_tag}_{shot_tag}_shot'
            if pred_col in gad_df.columns and pd.notna(gad_df.loc[i, pred_col]):
                continue  # already answered in an earlier (interrupted) run
            answer, elapsed = get_re_gad(sentence, gpt4=use_gpt4, shot=n_examples)
            gad_df.loc[i, pred_col] = answer
            gad_df.loc[i, f'{pred_col}_time'] = elapsed

### 2.2.2 Evaluation

In [81]:
def _to_binary_label(answer) -> int:
    """
    Convert one raw LLM answer to the GAD label 0 or 1.

    Only the answers "0" and "1" (surrounding whitespace ignored) are accepted;
    everything else counts as a failed output and maps to 0 ('No relation').
    The previous `int(x) if x.isdigit() else 0` let other digit strings such
    as "2" through as extra classes, treated "1\\n" as a failure, and raised
    AttributeError on non-string cells such as NaN.
    """
    text = str(answer).strip()
    return int(text) if text in ('0', '1') else 0


# Convert every prediction column to integer labels in '<setting>_label'.
for shot_tag in ('one', 'five', 'ten', 'twenty'):
    for model_tag in ('gpt3.5', 'gpt4'):
        pred_col = f'{model_tag}_{shot_tag}_shot'
        gad_df[f'{pred_col}_label'] = gad_df[pred_col].apply(_to_binary_label)

In [None]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# gad_df = pd.read_csv("data/RE/GAD/test_200_gpt_results.csv")

In [86]:
# Macro-averaged F1 of each prediction column against the gold labels:
# all GPT-3.5 settings first, then all GPT-4 settings.
y_true = gad_df['labels'].tolist()
for model_title, model_tag in (('GPT 3.5', 'gpt3.5'), ('GPT 4', 'gpt4')):
    for shot in ('one', 'five', 'ten', 'twenty'):
        y_pred = gad_df[f'{model_tag}_{shot}_shot_label'].tolist()
        print(f"F1 Score {model_title} {shot.capitalize()} Shot: {f1_score(y_true, y_pred, average='macro')}")

F1 Score GPT 3.5 One Shot: 0.36755952380952384
F1 Score GPT 3.5 Five Shot: 0.4241788958770091
F1 Score GPT 3.5 Ten Shot: 0.4851199670476779
F1 Score GPT 3.5 Twenty Shot: 0.46464305205436734
F1 Score GPT 4 One Shot: 0.4206928513497856
F1 Score GPT 4 Five Shot: 0.39929631854458075
F1 Score GPT 4 Ten Shot: 0.5416887008637405
F1 Score GPT 4 Twenty Shot: 0.5434253246753247


In [87]:
# Mean request duration per setting, taken from the '<model>_<shot>_shot_time' columns.
for shot in ('one', 'five', 'ten', 'twenty'):
    for model_title, model_tag in (('GPT-3.5', 'gpt3.5'), ('GPT-4', 'gpt4')):
        mean_seconds = gad_df[f'{model_tag}_{shot}_shot_time'].mean()
        print(f"Average {model_title} {shot}-shot prediction time: {mean_seconds:.2f} seconds")

Average GPT-3.5 one-shot prediction time: 2.61 seconds
Average GPT-4 one-shot prediction time: 0.72 seconds
Average GPT-3.5 five-shot prediction time: 2.32 seconds
Average GPT-4 five-shot prediction time: 0.74 seconds
Average GPT-3.5 ten-shot prediction time: 2.11 seconds
Average GPT-4 ten-shot prediction time: 0.68 seconds
Average GPT-3.5 twenty-shot prediction time: 2.42 seconds
Average GPT-4 twenty-shot prediction time: 0.81 seconds


In [88]:
# Persist the GAD relation-extraction results (row index not written).
gad_df.to_csv(path_or_buf='data/RE/GAD/test_200_gpt_results.csv', index=False)