In [25]:
import os
import time
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

import pandas as pd
from tqdm import tqdm
from typing import List
from sklearn.metrics import classification_report
from utils import html_parsing_ncbi, html_parsing_n2c2, get_classification_report, get_digit

# 1. NER (Named Entity Recognition)

## 1.1 NCBI-Disease Dataset

### 1.1.1 Inference

In [2]:
# Load the NCBI-disease test file; the inference cell below reads its 'text' column row by row.
ncbi_df = pd.read_csv('data/NER/NCBI-disease/test_200.csv')

In [8]:
def get_ner_ncbi_disease(sentence: str, gpt4: bool = False, shot: int = 0) -> tuple:
    """
        Get NER zero/one-shot prediction from GPT-3.5 or GPT-4 given a sentence in NCBI-disease dataset.
        Input:
            sentence: a string of sentence
            gpt4: whether to use GPT-4 or GPT-3.5
            shot: zero-shot (0) or one-shot (1); any value other than 1 sends no example
        Output:
            a tuple (html, seconds):
                html: the model reply, requested to be an HTML string that highlights all the
                      disease entities in the sentence
                seconds: duration of the successful API call only (failed attempts and the
                         back-off sleeps between them are not included)
        Raises:
            SystemExit: when all 10 attempts fail (chained from the last error)
    """

    # NOTE(review): the adjacent literals below are concatenated without any separator
    # ("...in a sentence.INPUT: ..."), and the backslash continuations inside the literals
    # embed the following line's indentation in the prompt / one-shot example. They are kept
    # byte-identical on purpose so that new runs stay comparable with the saved results.
    prompt = [
        {
            "role": "system", 
            "content": "TASK: the task is to extract disease entities in a sentence."
                       "INPUT: the input is a sentence."
                       "OUTPUT: the output is an HTML that highlights all the disease entities in the sentence. \
                                The highlighting should only use HTML tags <span style=\"background-color: #FFFF00\"> and </span> and no other tags."
        }
    ]
    if shot == 1: # the example is from NCBI-disease dataset's training split
        prompt.append(
            {
                "role": "user", 
                "content": "In summary , inactivation of the murine ATP7B gene produces a form of cirrhotic liver disease that resembles Wilson disease in humans and the toxic milk phenotype in the mouse . ."
            }
        )
        prompt.append(
            {
                "role": "assistant",
                "content": 'In summary , inactivation of the murine ATP7B gene produces a form of <span style="background-color: #FFFF00">cirrhotic liver disease</span> \
                            that resembles <span style="background-color: #FFFF00">Wilson disease</span> in humans and the toxic milk phenotype in the mouse . .'
            }
        )
    prompt.append(
        {
            "role": "user", 
            "content": sentence
        }
    )

    gpt = "gpt-4-0613" if gpt4 else "gpt-3.5-turbo-0613"

    max_attempts = 10 # try at most 10 times until it succeeds
    last_error = None
    for attempts_left in range(max_attempts - 1, -1, -1):
        try:
            # perf_counter is monotonic, so the measured interval cannot be skewed by clock adjustments
            time_start = time.perf_counter()
            response = openai.ChatCompletion.create(
                model = gpt,
                messages = prompt,
                temperature = 0.0, # deterministic
                request_timeout = 60,
            )
            elapsed = time.perf_counter() - time_start
            return response['choices'][0]['message']['content'], elapsed
        except Exception as e:
            last_error = e
            print(f"Exception: {e}")
            if attempts_left > 0:
                # report the number of attempts that really remain, and only wait when
                # another attempt will follow
                print(f"Retrying... {attempts_left} retries left")
                time.sleep(30)

    raise SystemExit("Max retries exceeded, exiting program") from last_error

In [None]:
# Query every (model, shot) configuration for each sentence, in the same order as before,
# and store the raw reply next to the time the call took.
ncbi_runs = (
    ('gpt3.5_zero_shot', False, 0),
    ('gpt4_zero_shot', True, 0),
    ('gpt3.5_one_shot', False, 1),
    ('gpt4_one_shot', True, 1),
)
for row in tqdm(range(len(ncbi_df))):
    for run_name, use_gpt4, n_shot in ncbi_runs:
        reply, seconds = get_ner_ncbi_disease(ncbi_df.iloc[row]['text'], gpt4=use_gpt4, shot=n_shot)
        ncbi_df.loc[row, f'html_{run_name}'] = reply
        ncbi_df.loc[row, f'{run_name}_time'] = seconds

### 1.1.2 Evaluation

In [11]:
# html_parsing_ncbi returns a pair; the first element is kept once as 'gt_labels', the second
# becomes the label column of the run whose HTML column was passed in.
ncbi_df['gt_labels'], ncbi_df['gpt3.5_zero_shot_labels'] = html_parsing_ncbi(ncbi_df, 'html_gpt3.5_zero_shot')
for run_name in ('gpt4_zero_shot', 'gpt3.5_one_shot', 'gpt4_one_shot'):
    _, ncbi_df[f'{run_name}_labels'] = html_parsing_ncbi(ncbi_df, f'html_{run_name}')

In [12]:
# 'strict' mode report for the GPT-4 one-shot labels against 'gt_labels' (displayed as the cell output).
get_classification_report(ncbi_df, 'gt_labels', 'gpt4_one_shot_labels', 'strict')

defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>()>,
            {'default': defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'precision': 0.7705627705627706,
                          'recall': 0.52046783625731,
                          'f1-score': 0.6212914485165794})})

In [13]:
# 'lenient' mode report for the same GPT-4 one-shot labels, for comparison with the strict scores.
get_classification_report(ncbi_df, 'gt_labels', 'gpt4_one_shot_labels', 'lenient')

defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>()>,
            {'default': defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'precision': 0.948051948051948,
                          'recall': 0.6403508771929824,
                          'f1-score': 0.7643979057591623})})

In [14]:
# Mean per-request time of each configuration, read from the '*_time' columns filled during inference.
for shown_name, time_col in (
    ("GPT-3.5 zero-shot", 'gpt3.5_zero_shot_time'),
    ("GPT-4 zero-shot", 'gpt4_zero_shot_time'),
    ("GPT-3.5 one-shot", 'gpt3.5_one_shot_time'),
    ("GPT-4 one-shot", 'gpt4_one_shot_time'),
):
    print(f"Average {shown_name} prediction time: {ncbi_df[time_col].mean():.2f} seconds")

Average GPT-3.5 zero-shot prediction time: 6.67 seconds
Average GPT-4 zero-shot prediction time: 5.39 seconds
Average GPT-3.5 one-shot prediction time: 6.62 seconds
Average GPT-4 one-shot prediction time: 5.04 seconds


In [15]:
# Save the frame with every column added above (raw replies, timings, parsed labels);
# index=False keeps the row index out of the CSV.
ncbi_df.to_csv('data/NER/NCBI-disease/test_200_gpt_results.csv', index=False)

## 1.2 2018 n2c2 Dataset

### 1.2.1 Inference

In [16]:
# Load the 2018 n2c2 NER test file; the inference cell below reads its 'text' column row by row.
n2c2_df = pd.read_csv('data/NER/2018_n2c2/test_200.csv')

In [26]:
def get_ner_2018_n2c2(sentence: str, gpt4: bool = False, shot: int = 0) -> tuple:
    """
        Get NER zero/one-shot prediction from GPT-3.5 or GPT-4 given a sentence in 2018 n2c2 dataset.
        Input:
            sentence: a string of sentence
            gpt4: whether to use GPT-4 or GPT-3.5
            shot: zero-shot (0) or one-shot (1); any value other than 1 sends no example
        Output:
            a tuple (html, seconds):
                html: the model reply, requested to be an HTML string that highlights the entities
                      (Form, Route, Frequency, Dosage, Strength, Duration, Reason, Ade, Drug) in the
                      sentence, one background color per entity type
                seconds: duration of the successful API call only (failed attempts and the
                         back-off sleeps between them are not included)
        Raises:
            SystemExit: when all 10 attempts fail (chained from the last error)
    """

    # NOTE(review): the prompt still says "disease entities" although the listed entity types are
    # medication-related, the adjacent literals are concatenated without any separator, and the
    # backslash continuation embeds the next line's indentation in the prompt. The text is kept
    # byte-identical on purpose so that new runs stay comparable with the saved results.
    prompt = [
        {
            "role": "system", 
            "content": "TASK: the task is to extract disease entities in a sentence. The entity type includes Form, Route, Frequency, Dosage, Strength, Duration, Reason, Ade, Drug."
                       "INPUT: the input is a sentence."
                       "OUTPUT: the output is an HTML that highlights all the disease entities in the sentence in different colors: Form(#FF0000), Route(#FFA500), Frequency(#FFFF00), Dosage(#00FF00), Strength(#0000FF), Duration(#800080), Reason(#FFC0CB), Ade(#964B00), Drug(#808080) in hex code. \
                                The highlighting should only use HTML tags <span style=\"background-color: #XXXXXX\"> and </span> and no other tags."
        }
    ]
    if shot == 1:
        prompt.append(
            {
                "role": "user", 
                "content": "Vitamin D 400 unit Tablet Sig : Two ( 2 ) Tablet PO once a day ."
            }
        )
        prompt.append(
            {
                "role": "assistant",
                "content": '<span style="background-color: #808080">Vitamin D</span> <span style="background-color: #0000FF">400 unit</span> <span style="background-color: #FF0000">Tablet</span> Sig : <span style="background-color: #00FF00">Two ( 2 )</span> <span style="background-color: #FF0000">Tablet</span> <span style="background-color: #FFA500">PO</span> <span style="background-color: #FFFF00">once a day</span> .'
            }
        )
    prompt.append(
        {
            "role": "user", 
            "content": sentence
        }
    )
    
    gpt = "gpt-4-0613" if gpt4 else "gpt-3.5-turbo-0613"

    max_attempts = 10 # try at most 10 times until it succeeds
    last_error = None
    for attempts_left in range(max_attempts - 1, -1, -1):
        try:
            # perf_counter is monotonic, so the measured interval cannot be skewed by clock adjustments
            time_start = time.perf_counter()
            response = openai.ChatCompletion.create(
                model = gpt,
                messages = prompt,
                temperature = 0.0, # deterministic
                request_timeout = 60,
            )
            elapsed = time.perf_counter() - time_start
            return response['choices'][0]['message']['content'], elapsed
        except Exception as e:
            last_error = e
            print(f"Exception: {e}")
            if attempts_left > 0:
                # report the number of attempts that really remain, and only wait when
                # another attempt will follow
                print(f"Retrying... {attempts_left} retries left")
                time.sleep(30)

    raise SystemExit("Max retries exceeded, exiting program") from last_error

In [None]:
# Query every (model, shot) configuration for each sentence, in the same order as before,
# and store the raw reply next to the time the call took.
n2c2_ner_runs = (
    ('gpt3.5_zero_shot', False, 0),
    ('gpt4_zero_shot', True, 0),
    ('gpt3.5_one_shot', False, 1),
    ('gpt4_one_shot', True, 1),
)
for row in tqdm(range(len(n2c2_df))):
    for run_name, use_gpt4, n_shot in n2c2_ner_runs:
        reply, seconds = get_ner_2018_n2c2(n2c2_df.iloc[row]['text'], gpt4=use_gpt4, shot=n_shot)
        n2c2_df.loc[row, f'html_{run_name}'] = reply
        n2c2_df.loc[row, f'{run_name}_time'] = seconds

### 1.2.2 Evaluation

In [28]:
# html_parsing_n2c2 returns a pair; the first element is kept once as 'gt_labels', the second
# becomes the label column of the run whose HTML column was passed in.
n2c2_df['gt_labels'], n2c2_df['gpt3.5_zero_shot_labels'] = html_parsing_n2c2(n2c2_df, 'html_gpt3.5_zero_shot')
for run_name in ('gpt4_zero_shot', 'gpt3.5_one_shot', 'gpt4_one_shot'):
    _, n2c2_df[f'{run_name}_labels'] = html_parsing_n2c2(n2c2_df, f'html_{run_name}')

In [29]:
# 'strict' mode report for the GPT-4 one-shot labels against 'gt_labels' (displayed as the cell output).
get_classification_report(n2c2_df, 'gt_labels', 'gpt4_one_shot_labels', 'strict')

defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>()>,
            {'Form': defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'precision': 0.9285714285714286,
                          'recall': 0.8210526315789474,
                          'f1-score': 0.8715083798882681}),
             'Duration': defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'precision': 0.35,
                          'recall': 0.5,
                          'f1-score': 0.4117647058823529}),
             'Ade': defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'precision': 0.07352941176470588,
                          'recall': 0.7142857142857143,
                          'f1-score': 0.13333333333333333}),
             'Dosage': def

In [30]:
# 'lenient' mode report for the same GPT-4 one-shot labels, for comparison with the strict scores.
get_classification_report(n2c2_df, 'gt_labels', 'gpt4_one_shot_labels', 'lenient')

defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>()>,
            {'Form': defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'precision': 0.9761904761904762,
                          'recall': 0.8631578947368421,
                          'f1-score': 0.9162011173184357}),
             'Duration': defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'precision': 0.65,
                          'recall': 0.9285714285714286,
                          'f1-score': 0.7647058823529412}),
             'Ade': defaultdict(<function ner_metrics.ner_metrics.classifcation_report.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'precision': 0.10294117647058823,
                          'recall': 1.0,
                          'f1-score': 0.18666666666666665}),
             'Dosage': def

In [31]:
# Mean per-request time of each configuration, read from the '*_time' columns filled during inference.
for shown_name, time_col in (
    ("GPT-3.5 zero-shot", 'gpt3.5_zero_shot_time'),
    ("GPT-4 zero-shot", 'gpt4_zero_shot_time'),
    ("GPT-3.5 one-shot", 'gpt3.5_one_shot_time'),
    ("GPT-4 one-shot", 'gpt4_one_shot_time'),
):
    print(f"Average {shown_name} prediction time: {n2c2_df[time_col].mean():.2f} seconds")

Average GPT-3.5 zero-shot prediction time: 8.09 seconds
Average GPT-4 zero-shot prediction time: 6.40 seconds
Average GPT-3.5 one-shot prediction time: 8.63 seconds
Average GPT-4 one-shot prediction time: 8.21 seconds


In [32]:
# Save the NER frame with every column added above (raw replies, timings, parsed labels);
# index=False keeps the row index out of the CSV.
n2c2_df.to_csv('data/NER/2018_n2c2/test_200_gpt_results.csv', index=False)

# 2. RE (Relation Extraction)

## 2.1 2018 n2c2 Dataset

### 2.1.1 Inference

In [33]:
# Load the 2018 n2c2 relation-extraction test file.
# NOTE(review): this rebinds `n2c2_df`, which held the NER frame in section 1.2; from here on the
# name refers to the RE data, so the NER evaluation cells must not be re-run after this cell.
n2c2_df = pd.read_csv('data/ER/2018_n2c2/test_200.csv')

In [35]:
def get_re_2018_n2c2(sentence: str, gpt4: bool = False, shot: int = 0) -> tuple:
    """
        Get RE zero/one-shot prediction from GPT-3.5 or GPT-4 given a sentence in 2018 n2c2 dataset.
        Args:
            sentence: a string of sentence
            gpt4: whether to use GPT-4 or GPT-3.5
            shot: zero-shot (0) or one-shot (1); any value other than 1 sends no example
        Output:
            a tuple (relation, seconds):
                relation: the model reply, requested to be one of the nine relation names
                seconds: duration of the successful API call only (failed attempts and the
                         back-off sleeps between them are not included)
        Raises:
            SystemExit: when all 10 attempts fail (chained from the last error)
    """
    # NOTE(review): the adjacent literals below are concatenated without any separator, and the
    # backslash continuation embeds the next line's indentation in the prompt. The text is kept
    # byte-identical on purpose so that new runs stay comparable with the saved results.
    prompt = [
        {
            "role": "system", 
            "content": "TASK: the task is to classify relations for a sentence."
                       "INPUT: the input is a sentence where the entities are labeled within [E${X}] and [E${X}/] in a sentence, where X is an integer representing an unique entity."
                       "OUTPUT: your task is to select one out of the nine types of relations ('STRENGTH-DRUG', 'ROUTE-DRUG', 'FREQUENCY-DRUG', 'FORM-DRUG', 'DOSAGE-DRUG', \
                               'REASON-DRUG', 'DURATION-DRUG', 'ADE-DRUG', and 'No relation')."
        }
    ]
    if shot == 1:
        prompt.append(
            {
                "role": "user", 
                "content": "[E2] Docusate/Sodium [E2/] ( Liquid ) 100/mg PO BID/:/PRN [E1] constipation [E1/] 4 ."
            }
        )
        prompt.append(
            {
                "role": "assistant",
                "content": 'REASON-DRUG'
            }
        )
    prompt.append(
        {
            "role": "user", 
            "content": sentence
        }
    )
    
    gpt = "gpt-4-0613" if gpt4 else "gpt-3.5-turbo-0613"

    max_attempts = 10 # try at most 10 times until it succeeds
    last_error = None
    for attempts_left in range(max_attempts - 1, -1, -1):
        try:
            # perf_counter is monotonic, so the measured interval cannot be skewed by clock adjustments
            time_start = time.perf_counter()
            response = openai.ChatCompletion.create(
                model = gpt,
                messages = prompt,
                temperature = 0.0, # deterministic
                request_timeout = 15,
            )
            elapsed = time.perf_counter() - time_start
            return response['choices'][0]['message']['content'], elapsed
        except Exception as e:
            # `Exception` (not a bare `except:`) so that interrupting the kernel still stops the
            # loop instead of being treated as a failed request
            last_error = e
            print(f"Exception: {e}")
            if attempts_left > 0:
                # report the number of attempts that really remain, and only wait when
                # another attempt will follow
                print(f"Retrying... {attempts_left} retries left")
                time.sleep(30)

    raise SystemExit("Max retries exceeded, exiting program") from last_error

In [None]:
# Query every (model, shot) configuration for each sentence, in the same order as before,
# and store the raw reply next to the time the call took.
n2c2_re_runs = (
    ('gpt3.5_zero_shot', False, 0),
    ('gpt4_zero_shot', True, 0),
    ('gpt3.5_one_shot', False, 1),
    ('gpt4_one_shot', True, 1),
)
for row in tqdm(range(len(n2c2_df))):
    for run_name, use_gpt4, n_shot in n2c2_re_runs:
        reply, seconds = get_re_2018_n2c2(n2c2_df.iloc[row]['text'], gpt4=use_gpt4, shot=n_shot)
        n2c2_df.loc[row, run_name] = reply
        n2c2_df.loc[row, f'{run_name}_time'] = seconds

### 2.1.2 Evaluation

In [37]:
# get rid of the enclosing ' ' if any
def _strip_wrapping_quotes(label):
    """Return `label` without one pair of enclosing single quotes.

    Only replies that both start and end with a single quote (ignoring surrounding whitespace)
    are changed. The earlier check (`"'" in x`) cut the first and last character off any reply
    that merely contained an apostrophe, and raised on non-string cells; those now pass through
    untouched.
    """
    if not isinstance(label, str):
        return label
    trimmed = label.strip()
    if len(trimmed) >= 2 and trimmed[0] == "'" and trimmed[-1] == "'":
        return trimmed[1:-1]
    return label

for run_name in ('gpt3.5_zero_shot', 'gpt4_zero_shot', 'gpt3.5_one_shot', 'gpt4_one_shot'):
    n2c2_df[run_name] = n2c2_df[run_name].apply(_strip_wrapping_quotes)

In [38]:
# get digit label while considering failed LLM outputs as 'No relation'
# NOTE(review): 'labels' is overwritten in place, so running this cell a second time feeds
# already-converted values back into get_digit — confirm that helper tolerates that.
n2c2_df['labels'] = n2c2_df['labels'].apply(get_digit)
for run_name in ('gpt3.5_zero_shot', 'gpt4_zero_shot', 'gpt3.5_one_shot', 'gpt4_one_shot'):
    n2c2_df[f'{run_name}_labels'] = n2c2_df[run_name].apply(get_digit)

In [39]:
# Per-class precision / recall / F1 of the GPT-4 one-shot run against the converted gold labels.
y_true = n2c2_df['labels'].tolist()
y_pred = n2c2_df['gpt4_one_shot_labels'].tolist()
print(classification_report(y_true, y_pred, digits=2))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       130
           1       1.00      0.23      0.38        13
           2       0.82      0.75      0.78        12
           3       1.00      0.82      0.90        11
           4       0.83      0.91      0.87        11
           5       0.48      1.00      0.65        10
           6       1.00      0.33      0.50         6
           7       0.80      0.80      0.80         5
           8       0.67      1.00      0.80         2

    accuracy                           0.89       200
   macro avg       0.84      0.76      0.74       200
weighted avg       0.92      0.89      0.88       200



In [40]:
# Mean per-request time of each configuration, read from the '*_time' columns filled during inference.
for shown_name, time_col in (
    ("GPT-3.5 zero-shot", 'gpt3.5_zero_shot_time'),
    ("GPT-4 zero-shot", 'gpt4_zero_shot_time'),
    ("GPT-3.5 one-shot", 'gpt3.5_one_shot_time'),
    ("GPT-4 one-shot", 'gpt4_one_shot_time'),
):
    print(f"Average {shown_name} prediction time: {n2c2_df[time_col].mean():.2f} seconds")

Average GPT-3.5 zero-shot prediction time: 0.50 seconds
Average GPT-4 zero-shot prediction time: 0.96 seconds
Average GPT-3.5 one-shot prediction time: 0.47 seconds
Average GPT-4 one-shot prediction time: 0.87 seconds


In [41]:
# Save the RE frame with every column added above (replies, timings, digit labels);
# index=False keeps the row index out of the CSV.
n2c2_df.to_csv('data/ER/2018_n2c2/test_200_gpt_results.csv', index=False)

## 2.2 GAD

### 2.2.1 Inference

In [42]:
# Load the GAD test file; later cells read its 'text' column for inference and 'labels' for scoring.
gad_df = pd.read_csv('data/ER/GAD/test_200.csv')

In [43]:
def get_re_gad(sentence: str, gpt4: bool = False, shot: int = 0) -> tuple:
    """
        Get RE zero/one-shot prediction from GPT-3.5 or GPT-4 given a sentence in GAD dataset.
        Args:
            sentence: a string of sentence
            gpt4: whether to use GPT-4 or GPT-3.5
            shot: zero-shot (0) or one-shot (1); any value other than 1 sends no example
        Output:
            a tuple (relation, seconds):
                relation: the model reply, requested to be '0' (no relations) or '1' (has relations)
                seconds: duration of the successful API call only (failed attempts and the
                         back-off sleeps between them are not included)
        Raises:
            SystemExit: when all 10 attempts fail (chained from the last error)
    """
    # NOTE(review): the TASK and INPUT literals are concatenated without a separator, and the
    # backslash continuations embed the following lines' indentation in the prompt. The text is
    # kept byte-identical on purpose so that new runs stay comparable with the saved results.
    prompt = [
        {
            "role": "system", 
            "content": "TASK: the task is to classify relations between a disease and a gene for a sentence."
                       "INPUT: the input is a sentence where the disease is labeled as @DISEASE$ and the gene is labeled as @GENE$ accordingly in a sentence. "
                       "OUTPUT: your task is to select one out of the two types of relations (0 and 1) for the gene and disease without any explanation or other characters: \n \
                                0, no relations \n \
                                1, has relations"
        }
    ]
    if shot == 1:
        prompt.append(
            {
                "role": "user", 
                "content": "We found evidence for association between @GENE$ and COGA @DISEASE$, history of blackouts, age at first drunkenness, and level of response to alcohol."
            }
        )
        prompt.append(
            {
                "role": "assistant",
                "content": '1'
            }
        )
    prompt.append(
        {
            "role": "user", 
            "content": sentence
        }
    )
    
    gpt = "gpt-4-0613" if gpt4 else "gpt-3.5-turbo-0613"

    max_attempts = 10 # try at most 10 times until it succeeds
    last_error = None
    for attempts_left in range(max_attempts - 1, -1, -1):
        try:
            # perf_counter is monotonic, so the measured interval cannot be skewed by clock adjustments
            time_start = time.perf_counter()
            response = openai.ChatCompletion.create(
                model = gpt,
                messages = prompt,
                temperature = 0.0, # deterministic
                request_timeout = 15,
            )
            elapsed = time.perf_counter() - time_start
            return response['choices'][0]['message']['content'], elapsed
        except Exception as e:
            # `Exception` (not a bare `except:`) so that interrupting the kernel still stops the
            # loop instead of being treated as a failed request
            last_error = e
            print(f"Exception: {e}")
            if attempts_left > 0:
                # report the number of attempts that really remain, and only wait when
                # another attempt will follow
                print(f"Retrying... {attempts_left} retries left")
                time.sleep(30)

    raise SystemExit("Max retries exceeded, exiting program") from last_error

In [None]:
# Query every (model, shot) configuration for each sentence, in the same order as before,
# and store the raw reply next to the time the call took.
gad_runs = (
    ('gpt3.5_zero_shot', False, 0),
    ('gpt4_zero_shot', True, 0),
    ('gpt3.5_one_shot', False, 1),
    ('gpt4_one_shot', True, 1),
)
for row in tqdm(range(len(gad_df))):
    for run_name, use_gpt4, n_shot in gad_runs:
        reply, seconds = get_re_gad(gad_df.iloc[row]['text'], gpt4=use_gpt4, shot=n_shot)
        gad_df.loc[row, run_name] = reply
        gad_df.loc[row, f'{run_name}_time'] = seconds

### 2.2.2 Evaluation

In [45]:
# convert some strings to int while considering failed LLM outputs as 'No relation (0)'
def _to_binary_label(reply):
    """Map one model reply to 0 or 1; anything that is not clearly 1 counts as 0.

    Strings are compared after stripping surrounding whitespace, so ' 1' or '1\\n' is accepted,
    and only '0'/'1' are valid, so a stray digit such as '2' no longer creates an extra class.
    Non-string values (the ints this cell wrote on a previous run, or NaN) are handled as well,
    which makes the cell safe to re-run; before, a second run raised AttributeError on
    `int.isdigit`.
    """
    if isinstance(reply, str):
        reply = reply.strip()
        return int(reply) if reply in ('0', '1') else 0
    return 1 if reply == 1 else 0

for run_name in ('gpt3.5_zero_shot', 'gpt4_zero_shot', 'gpt3.5_one_shot', 'gpt4_one_shot'):
    gad_df[run_name] = gad_df[run_name].apply(_to_binary_label)

In [46]:
# Per-class precision / recall / F1 of the GPT-4 one-shot run against the 'labels' column.
y_true = gad_df['labels'].tolist()
y_pred = gad_df['gpt4_one_shot'].tolist()
print(classification_report(y_true, y_pred, digits=2))

              precision    recall  f1-score   support

           0       0.46      0.26      0.33        98
           1       0.50      0.72      0.59       102

    accuracy                           0.49       200
   macro avg       0.48      0.49      0.46       200
weighted avg       0.48      0.49      0.46       200



In [47]:
# Mean per-request time of each configuration, read from the '*_time' columns filled during inference.
for shown_name, time_col in (
    ("GPT-3.5 zero-shot", 'gpt3.5_zero_shot_time'),
    ("GPT-4 zero-shot", 'gpt4_zero_shot_time'),
    ("GPT-3.5 one-shot", 'gpt3.5_one_shot_time'),
    ("GPT-4 one-shot", 'gpt4_one_shot_time'),
):
    print(f"Average {shown_name} prediction time: {gad_df[time_col].mean():.2f} seconds")

Average GPT-3.5 zero-shot prediction time: 0.44 seconds
Average GPT-4 zero-shot prediction time: 0.76 seconds
Average GPT-3.5 one-shot prediction time: 0.44 seconds
Average GPT-4 one-shot prediction time: 0.65 seconds


In [48]:
# Save the GAD frame with every column added above (converted predictions and timings);
# index=False keeps the row index out of the CSV.
gad_df.to_csv('data/ER/GAD/test_200_gpt_results.csv', index=False)