# Task 3 - BEST TEST

This code won't run without the BEST TEST, which is not included here. It would only take small adapatations to expand this to any multiple choice test that you'd like to give to the GPT API.

In [1]:
import openai
import time
import pandas as pd
import numpy as np
import requests.exceptions
import random
import re

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

print(f'{pd.__version__=}, {np.__version__=}, {matplotlib.__version__=}')

from io import StringIO

from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

pd.__version__='1.5.3', np.__version__='1.24.2', matplotlib.__version__='3.7.1'


Edit the code below to wherever your API key lives.

In [2]:
openai.api_key_path = 'api_key'

Models:

- gpt-4-0314 : Snapshot of gpt-4 from March 14th 2023. 8k model. \$0.0300 per 1k tokens (15x cost of 3.5). Max 8,192 tokens per request.
- gpt-3.5-turbo : most powerful (current) model, costs \$0.0020 per 1k tokens. Max 4,096 tokens per request. Current version is gpt-3.5-turbo-0301. Trained up to Sept 2021.
- ada : fastest model, costs \$0.0004 per 1k tokens (1/5 of 3.5). Max 2,049 tokens per request. Trained up to Oct 2019.
- babbage : slightly slower than ada, but more nuanced. Costs \$0.0005 per 1k tokens. Max 2,049 tokens per request. Trained up to Oct 2019.
- curie : again slower, costs \$0.0020 per 1k tokens (same price as 3.5). Max 2,049 tokens per request. Trained up to Oct 2019.
- davinci : strongest 3.0 model, comparable to 3.5 turbo. Costs \$0.0200 per 1k tokens (10x cost of 3.5). Max 2,049 tokens per request. Trained up to Oct 2019.

In [3]:
with open('bst.txt') as f:
    text = f.read()

    # Split the text into entries using regular expressions
    entries = re.split(r'\d+\)', text)[1:]

    # Strip each entry and add it to a list
    question_list = [f'Question {i+1}: ' + entry.strip() for i, entry in enumerate(entries)]
print(f'Questions loaded. There are currently {len(question_list)} questions.')

Questions loaded. There are currently 20 questions.


## Test Parameters

Change the two cells below in order to determine which models are used and which testing parameters are used.

In [4]:
models = ['gpt-3.5-turbo-0301', 'gpt-4-0314']

In [5]:
batch_sizes = [1,5,20]
duplication_folds = 20

## Useful Functions

First function creates a `messages` list that will feed into the GPT API.

Second function cleans some logic to handle errors when the API is busy or throwing errors. Also tries to make sure that the GPT returned data in a table, as requested.

In [6]:
def get_messages(prompts):
    messages = [
        {'role': 'system', 'content': 'You are physician in training taking an exam.'},
        {'role': 'user', 'content': 'You are a physician studying internal medicine. As part of your training, you are studying good practices in blood banking. You will be presented with a multiple choice test. There may be as few as 1 question, or as many as 20 questions. Please format your answer as a csv that uses ; as the separator. Denote the start and end of your table with three tildes (~~~). Use `Question` and `Answer` as the columns in the table. In the `Question` column give only the number of the question. In the `Answer` column give your answer as a single letter corresponding to your choice in the multiple choice question.'},
        {'role': 'assistant', "content": "I understand. Please provide me with the questions.",},
        #{
        #    'role': 'user',
        #    'content': PROMPT_HERE
        #},
    ]
    for p in prompts:
        messages.append(
                {
            'role': 'user',
            'content': p
        }
            )

    return messages

In [7]:
def val_response(response):
    if response['choices'][0]['finish_reason'] != 'stop':
        raise Exception('Response failed- message incomplete')
            
    if len(response['choices'][0]['message']['content'].split('~~~')) < 2:
        print('\nError occurred with response. Likely misformatted:')
        print(response['choices'][0]['message']['content'])
        raise Exception('Response failed- message misformatted')
        
    return True

## Testing Loop

In [8]:
results = {m: {b: [] for b in batch_sizes} for m in models}
for model in models:
    for batch_size in batch_sizes:
        print(f'Starting with {model=}, {batch_size=}, {duplication_folds=}')
        
        token_count = 0
        start_time = time.time()
        for fold in range(duplication_folds):
            
            #print(f"Starting fold {fold}", end=" ")
            
            for i in range(0, len(question_list), batch_size):
                thous_tokens = token_count // 1000
                if 'gpt-4' in model:
                    cost = 0.03 * thous_tokens
                elif 'gpt-3.5' in model:
                    cost = 0.002 * thous_tokens
                else:
                    cost = float('inf')
                #print(f'{i=}, {thous_tokens=:.0f}k tokens ${cost:.3f}', end=' ')
                
                while True:
                    try:
                        response = openai.ChatCompletion.create(
                            model=model,
                            messages=get_messages(question_list[i:i+batch_size]),
                            temperature=0,
                        )
                        if val_response(response):
                            break
                    except (
                        openai.error.APIConnectionError,
                        requests.exceptions.Timeout,
                        requests.exceptions.ConnectionError,
                        openai.error.APIError,
                        openai.error.ServiceUnavailableError,
                        TimeoutError
                    ) as e:
                        print(f"\nConnection error occurred: {str(e)[:50]}: Retrying in 30 seconds...")
                        time.sleep(30)                
                    except Exception as e:
                        print(f"\nUnexpected error occurred: {str(e)[:50]}. Retrying in 30 seconds...")
                        time.sleep(30)
                        
                token_count += response['usage']['total_tokens']
                r = response['choices'][0]['message']['content'].split('~~~')[1].strip()
                results[model][batch_size].append(pd.read_csv(StringIO(r), sep=';'))
                #print(f"completed. Tokens: {response['usage']['total_tokens']}, time elapsed: {time.time()-start_time:.2f}s")
            #print(f'Finished fold {fold}. Total time elapsed: {time.time()-start_time:.2f}s')
            print('*',end='')
        print(f'\nTask completed on {model=} and {batch_size=} with {token_count} total tokens and a total cost of ${cost:.2f}.\n\tTotal time elapsed: {time.time()-start_time:.2f}s ({(time.time()-start_time)/60:.2f} minutes)\n')


Starting with model='gpt-3.5-turbo-0301', batch_size=1, duplication_folds=20
********
Unexpected error occurred: That model is currently overloaded with other requ. Retrying in 30 seconds...
***
Unexpected error occurred: That model is currently overloaded with other requ. Retrying in 30 seconds...
*********
Task completed on model='gpt-3.5-turbo-0301' and batch_size=1 with 128000 total tokens and a total cost of $0.25. Total time elapsed: 578.39s (9.64 minutes)
Starting with model='gpt-3.5-turbo-0301', batch_size=5, duplication_folds=20
********************
Task completed on model='gpt-3.5-turbo-0301' and batch_size=5 with 70400 total tokens and a total cost of $0.14. Total time elapsed: 168.83s (2.81 minutes)
Starting with model='gpt-3.5-turbo-0301', batch_size=20, duplication_folds=20
********************
Task completed on model='gpt-3.5-turbo-0301' and batch_size=20 with 59600 total tokens and a total cost of $0.11. Total time elapsed: 115.23s (1.92 minutes)
Starting with model='gp

In [10]:
key = pd.read_csv('key_bst.csv')

In [37]:
all_dfs = []
raw_dfs = []
for bs in batch_sizes:
    for m in models:
        if 'gpt-3.5' in m:
            test_name = f'GPT3.5_batch_{bs}'
        elif 'gpt-4' in m:
            test_name = f'GPT4_batch_{bs}'
            
        dummy = pd.concat(results[m][bs], ignore_index=True)
        
        # Merge dummy with key on the Question column
        merged = pd.merge(dummy, key, on='Question')
        
        # Create a new column test_name that indicates whether the Answer matches the Key
        merged[test_name] = merged['Answer'] == merged['Key']
        
        # Convert True/False values to 1/0
        merged[test_name] = merged[test_name].astype(int)
        
        # Drop the 'Key' column and keep only the 'Question', 'Answer', and test_name columns
        merged = merged[['Question', 'Answer', test_name]]
        
        raw_dfs.append(merged)
        merged.to_csv(f'raw_{test_name}_results.csv')
        
        merged = merged.groupby(['Question']).agg({test_name: 'mean'}).reset_index()
        
        all_dfs.append(merged)

In [None]:
df = pd.merge(all_dfs[0], all_dfs[1], on='Question')

for i in range(2, len(all_dfs)):
    df = pd.merge(df, all_dfs[i], on='Question')
    
df