# Comparing Models

This notebook chooses the most appropriate prompt and prompt structure for the OCR correction. 

In [1]:
#import config  # Import your config.py file this contains you openai api key
import pandas as pd
import numpy as np
import os
from llm_comparison_toolkit import RateLimiter, get_response_openai, get_response_anthropic,  create_config_dict_func, compare_request_configurations, generate_model_configs
from evaluate import load
from evaluation_funcs import evaluate_correction_performance, evaluate_correction_performance_folders, get_metric_error_reduction
import seaborn as sns
import matplotlib.pyplot as plt
from helper_functions import files_to_df_func, files_to_df_core_func
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import re


dev_data_folder = 'data/dev_data'
dev_transcripts = os.path.join(dev_data_folder, 'dev_data_transcript')
dev_raw_ocr_folder =  os.path.join(dev_data_folder,'dev_raw_ocr' )
dev_system_message_folder = os.path.join(dev_data_folder,'dev_system_message_variants' )


#NCSE

ncse_folder = 'data/transcription_returned_ocr'
ncse_articles_raw = os.path.join(ncse_folder, 'transcription_raw_ocr')
ncse_articles_transcribed = os.path.join(ncse_folder, 'transcription_files') 
ncse_articles_results = os.path.join(ncse_folder, 'corrected_folder')

#Overproof
overproof_folder = 'data/overproof'

smh_folder =  os.path.join(overproof_folder, 'SMH')
smh_articles_raw = os.path.join(smh_folder, 'article_level', 'raw')
smh_articles_transcribed = os.path.join(smh_folder, 'article_level', 'corrected') #in the dataset they are reffered to as corrected but this clashes with my naming convention
smh_articles_results = os.path.join(smh_folder, 'results')

ca_folder =  os.path.join(overproof_folder, 'CA')
ca_articles_raw = os.path.join(ca_folder, 'article_level', 'raw')
ca_articles_transcribed = os.path.join(ca_folder, 'article_level', 'corrected') #in the dataset they are reffered to as corrected but this clashes with my naming convention
ca_articles_results = os.path.join(ca_folder, 'results')

#load the dev and test sets for prompt development and selection
dev_data_df = pd.read_csv(os.path.join(dev_data_folder,'dev_data_raw.csv'))


#for saving data to be used in the analysis
if not os.path.exists('data/analysis'):
    os.makedirs('data/analysis')


wer = load("wer")
cer = load("cer")


model_name_code = pd.Series(
    {'Llama 2 70B':'llama2-70b-4096',
 'Gemma 7B':'gemma-7b-it',
 'Opus':'claude-3-opus-20240229',
 'Haiku':'claude-3-haiku-20240307',
 'GPT-4':'gpt-4-turbo-preview',
 'GPT-3.5':'gpt-3.5-turbo',
 'Mixtral 8x7B':'mixtral-8x7b-32768',
 'Overproof':'overproof'})


eval_metric = 'CER'

## Evaluate system prompt tests

We evaluate the system prompts below to see if thre is any significant difference between the prompts


# Evaluating models on the test set

Having identified two different prompts and that the prompts appear to work better when places after the text we can now compare the different models


The below code creates the basic configuration dictionaries for each model and then fills in the with the two different prompt messages creating a single list of all basic prompt/model configurations. It then calls all the LLM's and saves the results.
This works in series so takes a while.

## Create API configurations

In [2]:
#Create the prompt/system message using the best performing from the previous section

full_prompt = "{content_html}"+f""" \n \n """ + f"""You are an expert in post-OCR correction of documents. Using the context available from the text please recover the most likely original text from the corrupted OCR. The text is from an english newspaper in the 1800's. The text may be an advert or article and may be missing the beggining or end. Do not add any text, commentary, or lead in sentences beyond the recovered text. Do not add a title, or any introductions."""

instruct_prompt = "{content_html}"+f""" \n \n """ + f"""You are an expert in post-OCR correction of documents. Using the context available from the text please recover the most likely original text from the corrupted OCR. Do not add any text, commentary, or lead in sentences beyond the recovered text. Do not add a title, or any introductions."""

boros_basic  = "{content_html}"+f""" \n \n """ +"Correct the text"

boros_complex  ="{content_html}"+f""" \n \n """ + f"""Please assist with reviewing and correcting errors in texts produced by automatic transcription (OCR) of historical documents.
Your task is to carefully examine the following text and correct any mistakes introduced by the OCR software. 
Do not write anything else than the corrected text."""

In [3]:
groq_alt_endpoint = {'alt_endpoint':{'base_url':'https://api.groq.com/openai/v1',
                     'api_key':os.getenv("GROQ_API_KEY")}}

basic_model_configs = pd.DataFrame({
    'get_response_func': [get_response_openai, get_response_openai, get_response_anthropic, get_response_anthropic, 
                          get_response_openai,# get_response_openai, 
                          get_response_openai], 
    'engine': ['gpt-3.5-turbo', 'gpt-4-turbo-preview', "claude-3-haiku-20240307", "claude-3-opus-20240229", 
               'mixtral-8x7b-32768',#'llama2-70b-4096', 
               'gemma-7b-it'],
    'rate_limit':[160e3, 80e3, 100e3, 40e3, 9e3, #15e3,
                   15e3],
    'additional_args': [
        {}, {}, {}, {}, 
        groq_alt_endpoint, 
        #groq_alt_endpoint, 
        groq_alt_endpoint
    ]
})

full_model_configs = generate_model_configs(basic_model_configs, full_prompt, 'full')
instruct_model_configs = generate_model_configs(basic_model_configs, instruct_prompt, 'instruct')

#I think on reflection I only need to compare boros complex on gpt-4 as this was the best performer in their paper
boros_configs = [
    (get_response_openai, 'gpt-4-turbo-preview', boros_complex, "boros_complex_"),
   # (get_response_openai, 'gpt-4-turbo-preview', boros_basic, "boros_basic_"),
  #  (get_response_anthropic, "claude-3-opus-20240229", boros_complex, "boros_complex_")
]

boros_list = [
    create_config_dict_func(
        get_response_func=config[0],
        rate_limiter=RateLimiter(80e3),
        engine=config[1],
        system_message_template="",
        prompt_template=config[2],
        additional_args={"response_name": config[3]}
    )
    for config in boros_configs
]

model_configs = full_model_configs + instruct_model_configs + boros_list



## Perform all API calls

The below section is what actually calls the API, the code points to the folders where the raw OCR is and provides a path to where the corrected text should be stored.

### NCSE

In [4]:
corrected_folder = ncse_articles_results
#
# This naming business needs to be cleaned up so the actual article ID is used. until then just have the following mess
#
# 

test_data_new = pd.read_csv(os.path.join(dev_data_folder,'transcription_raw_ocr.csv'))
test_data_new = test_data_new.loc[test_data_new ['file_name'].isin(files_to_df_func(ncse_articles_transcribed )['file_name'])] #subset to just the data I have transcribed

#This goes through the list of articles that have been transcribed, checks to see if there is a corrected version and if not generates it
compare_request_configurations(test_data_new, model_configs, folder_path=corrected_folder)

2024-04-30 07:07:56 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-30 07:07:57 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-30 07:08:00 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-30 07:08:02 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-30 07:08:03 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-30 07:08:18 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-30 07:08:25 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-30 07:08:27 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-30 07:08:30 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 

### Boros etal re-evaluation

The post-OCR correction worked so well that the Boros etal prompt is being re-evaluated

In [6]:
boros_complex  ="{content}"+f""" \n \n """ + f"""Please assist with reviewing and correcting errors in texts produced by automatic transcription (OCR) of historical documents.
Your task is to carefully examine the following text and correct any mistakes introduced by the OCR software. 
Do not write anything else than the corrected text."""


boros_config = generate_model_configs(basic_model_configs.iloc[0:2, :], boros_complex, 'boros_complex_')[1]

### Sydney Morning Herald

This section performs the correction test on dataset 2 of the Overproof collection. This is data from the Sydney Morning Herald. In addition it re-tests the Boros et al.

In [7]:
smh_data = files_to_df_core_func(smh_articles_raw )

smh_data['content'] = smh_data['content'].str.replace('\n', ' ')

smh_data['id'] = smh_data['file_name'] # this is needed as the processing log uses an id to keep track of what has been processed and what hasn't to allow for easy restarts

full_prompt_smh = "{content}"+f""" \n \n """ + f"""You are an expert in post-OCR correction of documents. Using the context available from the text please recover the most likely original text from the corrupted OCR. The text is from The Sydney Morning Herald 1842 -1950. The text may be an advert or article and may be missing the beggining or end. Do not add any text, commentary, or lead in sentences beyond the recovered text. Do not add a title, or any introductions."""

instruct_prompt_smh = "{content}"+f""" \n \n """ + f"""You are an expert in post-OCR correction of documents. Using the context available from the text please recover the most likely original text from the corrupted OCR. Do not add any text, commentary, or lead in sentences beyond the recovered text. Do not add a title, or any introductions."""

full_model_configs_smh = generate_model_configs(basic_model_configs, full_prompt_smh, 'full')
instruct_model_configs_smh = generate_model_configs(basic_model_configs, instruct_prompt_smh, 'instruct')

#Boros et al prompt added in as the overall system works so well, it seems strange theirs didn't work, this is a quick check
smh_configs = full_model_configs_smh + instruct_model_configs_smh  + [boros_config ]

corrected_folder_smh = smh_articles_results

compare_request_configurations(smh_data, smh_configs, folder_path=corrected_folder_smh)

2024-04-28 17:40:25 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-28 17:40:30 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-28 17:40:43 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-28 17:40:52 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-28 17:41:04 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-04-28 17:41:29 httpx INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


KeyboardInterrupt: 

### Chronicalling America

This section performs the correction test on dataset 2 of the Overproof collection. This is data from the Chronicalling America Dataset. 

In [8]:
ca_data = files_to_df_core_func(ca_articles_raw )

ca_data['content'] = ca_data['content'].str.replace('\n', ' ')

ca_data['id'] = ca_data['file_name'] # this is needed as the processing log uses an id to keep track of what has been processed and what hasn't to allow for easy restarts

full_prompt_ca = "{content}"+f""" \n \n """ + f"""You are an expert in post-OCR correction of documents. Using the context available from the text please recover the most likely original text from the corrupted OCR. The text is from American Newspapers 1870 -1922. The text may be an advert or article and may be missing the beggining or end. Do not add any text, commentary, or lead in sentences beyond the recovered text. Do not add a title, or any introductions."""

instruct_prompt_ca = "{content}"+f""" \n \n """ + f"""You are an expert in post-OCR correction of documents. Using the context available from the text please recover the most likely original text from the corrupted OCR. Do not add any text, commentary, or lead in sentences beyond the recovered text. Do not add a title, or any introductions."""


full_model_configs_ca = generate_model_configs(basic_model_configs, full_prompt_ca, 'full')
instruct_model_configs_ca = generate_model_configs(basic_model_configs, instruct_prompt_ca, 'instruct')

ca_configs = full_model_configs_ca + instruct_model_configs_ca  + [boros_config ]

corrected_folder_ca = ca_articles_results

compare_request_configurations(ca_data, ca_configs, folder_path=corrected_folder_ca)

KeyboardInterrupt: 

## Evaluate the prompts across all models

On the smaller models, Full is worse than instruct on the larger models the reverse. Maybe this is related to ability to 'focus' or hold isntructions in memory?

In [20]:
ncse_performance_eval.groupby('type').size()

type
boros_complex__gpt-4-turbo-preview    78
full__claude-3-haiku-20240307         78
full__claude-3-opus-20240229          78
full__gemma-7b-it                     78
full__gpt-3.5-turbo                   78
full__gpt-4-turbo-preview             78
full__llama2-70b-4096                 51
full__mixtral-8x7b-32768              78
instruct__claude-3-haiku-20240307     78
instruct__claude-3-opus-20240229      78
instruct__gemma-7b-it                 78
instruct__gpt-3.5-turbo               78
instruct__gpt-4-turbo-preview         78
instruct__llama2-70b-4096             50
instruct__mixtral-8x7b-32768          78
dtype: int64

In [19]:
##
## double check get_metric_error_reduction
##

corrected_folder = ncse_articles_results 

gt_folder = ncse_articles_transcribed 

raw_ocr = ncse_articles_raw

ncse_performance_eval =  evaluate_correction_performance_folders(corrected_folder, gt_folder, wer, cer)

ncse_raw_ocr_eval =  evaluate_correction_performance(raw_ocr, gt_folder , wer, cer, 'raw_ocr')


ncse_error_reduction = get_metric_error_reduction(ncse_performance_eval, ncse_raw_ocr_eval )

ncse_error_reduction.groupby('type').describe().filter(regex = '50|median').round(2).sort_values((eval_metric, '50%'))

Unnamed: 0_level_0,WER,CER,lev_dist
Unnamed: 0_level_1,50%,50%,50%
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
full__llama2-70b-4096,56.83,-28.27,-27.33
instruct__llama2-70b-4096,53.24,-11.2,-12.44
full__gemma-7b-it,23.18,-0.75,-1.11
instruct__gemma-7b-it,24.63,1.43,0.77
full__mixtral-8x7b-32768,48.7,6.03,5.47
instruct__mixtral-8x7b-32768,47.38,7.06,6.81
full__claude-3-haiku-20240307,48.91,17.54,16.93
instruct__claude-3-haiku-20240307,54.74,27.32,27.1
full__gpt-3.5-turbo,61.27,37.62,36.78
instruct__gpt-3.5-turbo,63.03,38.71,38.03


In [8]:
corrected_folder = smh_articles_results 

gt_folder = smh_articles_transcribed 

raw_ocr = smh_articles_raw

smh_performance_eval =  evaluate_correction_performance_folders(corrected_folder, gt_folder, wer, cer)

smh_raw_ocr_eval =  evaluate_correction_performance(raw_ocr, gt_folder , wer, cer, 'raw_ocr')

smh_error_reduction = get_metric_error_reduction(smh_performance_eval, smh_raw_ocr_eval )

smh_error_reduction.groupby('type').describe().filter(regex = '50|median').round(2).sort_values((eval_metric, '50%'))

Unnamed: 0_level_0,WER,CER,lev_dist
Unnamed: 0_level_1,50%,50%,50%
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
full__gemma-7b-it,3.26,-35.65,-35.34
instruct__mixtral-8x7b-32768,17.39,-19.11,-20.0
full__mixtral-8x7b-32768,18.0,-14.63,-15.24
instruct__gemma-7b-it,9.66,-12.93,-13.04
full__llama2-70b-4096,16.67,5.77,1.85
instruct__llama2-70b-4096,16.0,6.45,3.08
overproof,19.23,28.38,27.59
instruct__claude-3-haiku-20240307,26.92,35.71,33.8
full__claude-3-haiku-20240307,27.34,38.38,37.48
full__gpt-3.5-turbo,26.72,39.18,38.66


In [9]:
corrected_folder = ca_articles_results 

gt_folder = ca_articles_transcribed 

raw_ocr = ca_articles_raw

ca_performance_eval =  evaluate_correction_performance_folders(corrected_folder, gt_folder, wer, cer)

ca_raw_ocr_eval =  evaluate_correction_performance(raw_ocr, gt_folder , wer, cer, 'raw_ocr')

ca_error_reduction = get_metric_error_reduction(ca_performance_eval, ca_raw_ocr_eval )

ca_error_reduction.groupby('type').describe().filter(regex = '50|median').round(2).sort_values((eval_metric, '50%'))

Unnamed: 0_level_0,WER,CER,lev_dist
Unnamed: 0_level_1,50%,50%,50%
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
instruct__gemma-7b-it,5.66,-41.0,-42.38
full__gemma-7b-it,3.04,-38.01,-38.15
instruct__mixtral-8x7b-32768,13.72,-22.07,-22.36
full__mixtral-8x7b-32768,15.72,-16.3,-14.86
instruct__llama2-70b-4096,10.86,-6.48,-7.94
full__llama2-70b-4096,10.19,-4.9,-8.14
full__claude-3-haiku-20240307,14.97,26.01,25.27
overproof,21.63,34.59,34.59
instruct__claude-3-haiku-20240307,17.41,34.73,34.73
instruct__gpt-4-turbo-preview,20.98,37.57,37.45


In [21]:
ncse_median = ncse_error_reduction.groupby('type')[eval_metric].median().round(1)
smh_median = smh_error_reduction.groupby('type')[eval_metric].median().round(1)
ca_median = ca_error_reduction.groupby('type')[eval_metric].median().round(1)

# Combine the results into a new dataframe
result_df = pd.DataFrame({
    'NCSE': ncse_median,
    'SMH': smh_median,
    'CA': ca_median
})

# Reset the index to make 'type' a regular column
result_df = result_df.reset_index()

result_df = result_df.loc[~result_df['type'].isin(['boros_basic__gpt-4-turbo-preview',  'claude_temp_claude-3-opus-20240229'])]

result_df['model'] = result_df['type'].str.split('_').str[-1]
result_df['prompt'] = result_df['type'].str.split('_').str[0]

result_df = result_df.sort_values('NCSE').merge(model_name_code.reset_index().rename(columns={0: 'model'}), on='model').rename(columns={'index': 'Model'})


result_df[['Model', 'prompt', 'NCSE', 'SMH', 'CA']].sort_values(['NCSE'])

Unnamed: 0,Model,prompt,NCSE,SMH,CA
0,Llama 2 70B,full,-28.3,5.8,-4.9
1,Llama 2 70B,instruct,-11.2,6.5,-6.5
2,Gemma 7B,full,-0.7,-35.7,-38.0
3,Gemma 7B,instruct,1.4,-12.9,-41.0
4,Mixtral 8x7B,full,6.0,-14.6,-16.3
5,Mixtral 8x7B,instruct,7.1,-19.1,-22.1
6,Haiku,full,17.5,38.4,26.0
7,Haiku,instruct,27.3,35.7,34.7
8,GPT-3.5,full,37.6,39.2,44.2
9,GPT-3.5,instruct,38.7,42.9,44.1


In [22]:
def render_latex_with_formatting(df, caption, label):

    df = df.copy()
    # Format the 'Total' row by appending \textbf{} to each element
    #df.iloc[-1] = df.iloc[-1].apply(lambda x: '\\textbf{' + str(x) + '}')
    
    # Convert DataFrame to LaTeX
    latex_table = df.to_latex(
        index=False,
        float_format="%.2f" ,
        escape=False,  # Important to render LaTeX commands within the table properly
        column_format='p{5cm}cccc',  # One left-aligned column followed by four centered columns
        bold_rows=True,  # Bold the headers
        caption=caption,
        label=label
    )
    return latex_table


In [23]:
results_tab = render_latex_with_formatting(result_df.loc[result_df['prompt']!='instruct', ['Model',  'NCSE', 'SMH', 'CA']].sort_values(['Model']),   
                            'Model performance across the datasets measured in Error Reduction Percentage, higher is better.There is significant variation in how well the LMs are able to perform post-OCR correction, and significant differences between prompts for certain models.',
                            'tab:results'  )

print(results_tab)

\begin{table}
\caption{Model performance across the datasets measured in Error Reduction Percentage, higher is better.There is significant variation in how well the LMs are able to perform post-OCR correction, and significant differences between prompts for certain models.}
\label{tab:results}
\begin{tabular}{p{5cm}cccc}
\toprule
Model & NCSE & SMH & CA \\
\midrule
GPT-3.5 & 37.60 & 39.20 & 44.20 \\
GPT-4 & 56.70 & 42.10 & 38.20 \\
GPT-4 & 59.30 & 48.40 & 45.60 \\
GPT-4 & NaN & 56.10 & NaN \\
Gemma 7B & -0.70 & -35.70 & -38.00 \\
Haiku & 17.50 & 38.40 & 26.00 \\
Llama 2 70B & -28.30 & 5.80 & -4.90 \\
Mixtral 8x7B & 6.00 & -14.60 & -16.30 \\
Opus & 62.40 & 51.00 & 48.20 \\
Overproof & NaN & 28.40 & 34.60 \\
\bottomrule
\end{tabular}
\end{table}



In [24]:
results_tab = render_latex_with_formatting(result_df.loc[result_df['prompt']=='instruct', ['Model',  'NCSE', 'SMH', 'CA']].sort_values(['Model']), 
                             'Model performance across the datasets measured in Error Reduction Percentage, higher is better.There is significant variation in how well the LMs are able to perform post-OCR correction, and significant differences between prompts for certain models.',
                             'tab:results')

print(results_tab)

\begin{table}
\caption{Model performance across the datasets measured in Error Reduction Percentage, higher is better.There is significant variation in how well the LMs are able to perform post-OCR correction, and significant differences between prompts for certain models.}
\label{tab:results}
\begin{tabular}{p{5cm}cccc}
\toprule
Model & NCSE & SMH & CA \\
\midrule
GPT-3.5 & 38.70 & 42.90 & 44.10 \\
GPT-4 & 57.00 & 41.80 & 37.60 \\
Gemma 7B & 1.40 & -12.90 & -41.00 \\
Haiku & 27.30 & 35.70 & 34.70 \\
Llama 2 70B & -11.20 & 6.50 & -6.50 \\
Mixtral 8x7B & 7.10 & -19.10 & -22.10 \\
Opus & 61.40 & 45.50 & 47.00 \\
\bottomrule
\end{tabular}
\end{table}



In [14]:
ncse_median = ncse_error_reduction.groupby('type')[eval_metric].median().round(1)
smh_median = smh_error_reduction.groupby('type')[eval_metric].median().round(1)
ca_median = ca_error_reduction.groupby('type')[eval_metric].median().round(1)

# Combine the results into a new dataframe
result_df = pd.DataFrame({
    'NCSE': ncse_median,
    'SMH': smh_median,
    'CA': ca_median
})

# Reset the index to make 'type' a regular column
result_df = result_df.reset_index()

result_df = result_df.loc[~result_df['type'].isin(['boros_basic__gpt-4-turbo-preview',  'claude_temp_claude-3-opus-20240229'])]
result_df['model'] = result_df['type'].str.split('_').str[-1]
result_df['prompt'] = result_df['type'].str.split('_').str[0]

result_df

Unnamed: 0,type,NCSE,SMH,CA,model,prompt
1,boros_complex___gpt-4-turbo-preview,,56.1,,gpt-4-turbo-preview,boros
2,boros_complex__gpt-4-turbo-preview,59.3,48.4,45.6,gpt-4-turbo-preview,boros
3,full__claude-3-haiku-20240307,17.5,38.4,26.0,claude-3-haiku-20240307,full
4,full__claude-3-opus-20240229,62.4,51.0,48.2,claude-3-opus-20240229,full
5,full__gemma-7b-it,-0.7,-35.7,-38.0,gemma-7b-it,full
6,full__gpt-3.5-turbo,37.6,39.2,44.2,gpt-3.5-turbo,full
7,full__gpt-4-turbo-preview,56.7,42.1,38.2,gpt-4-turbo-preview,full
8,full__llama2-70b-4096,-28.3,5.8,-4.9,llama2-70b-4096,full
9,full__mixtral-8x7b-32768,6.0,-14.6,-16.3,mixtral-8x7b-32768,full
10,instruct__claude-3-haiku-20240307,27.3,35.7,34.7,claude-3-haiku-20240307,instruct
