# Prompt selection and testing

This notebook chooses the most appropriate prompt and prompt structure for the OCR correction. 

In [21]:
#import config  # Import your config.py file, which contains your OpenAI API key
import pandas as pd
import numpy as np
import os
from llm_comparison_toolkit import RateLimiter, get_response_openai, get_response_anthropic,  create_config_dict_func, compare_request_configurations, generate_model_configs
from evaluate import load
from evaluation_funcs import evaluate_correction_performance, evaluate_correction_performance_folders, get_metric_error_reduction
import seaborn as sns
import matplotlib.pyplot as plt
from helper_functions import files_to_df_func, evaluate_ner, calculate_entity_similarity, repeat_prompt_experiment
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import re


# Locations of the development data used for prompt selection.
dev_data_folder = 'data/dev_data'
dev_transcripts = os.path.join(dev_data_folder, 'dev_data_transcript')
dev_raw_ocr_folder = os.path.join(dev_data_folder, 'dev_raw_ocr')
dev_system_message_folder = os.path.join(dev_data_folder, 'dev_system_message_variants')

# Load the dev set used for prompt development and selection.
dev_data_df = pd.read_csv(os.path.join(dev_data_folder, 'dev_data_raw.csv'))


# Folder for data saved for the analysis. exist_ok=True replaces the separate
# exists() check, so re-running the cell is a no-op and there is no
# check-then-create race.
os.makedirs('data/analysis', exist_ok=True)

# Evaluate performance on downstream tasks

In [23]:
# NER model used for the downstream-task evaluation
# (alternative left by the author: "dslim/bert-base-NER").
ner_model = "Gladiator/microsoft-deberta-v3-large_ner_conll2003"

# NOTE(review): `transcribed_files` and `raw_folder` are not defined in the
# cells visible here — confirm they are set before this cell runs.
transcribed_data_set_df = files_to_df_func(transcribed_files)
raw_data_set_df = files_to_df_func(raw_folder)

# Keep only the raw OCR files that also have a transcript. The .copy() makes
# the filtered frame independent of the original, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
has_transcript = raw_data_set_df['file_name'].isin(transcribed_data_set_df['file_name'])
raw_data_set_df = raw_data_set_df.loc[has_transcript].copy()

tokenizer = AutoTokenizer.from_pretrained(ner_model)

model = AutoModelForTokenClassification.from_pretrained(ner_model)
def preprocess_text(text):
    """Collapse every run of whitespace in `text` into a single space.

    Leading and trailing whitespace is reduced to one space, not removed.

    Args:
        text: The string to normalise.

    Returns:
        The string with each whitespace run replaced by ' '.
    """
    return re.sub(pattern=r'\s+', repl=' ', string=text)

# Token-classification pipeline built from the model and tokenizer loaded above.
perform_ner = pipeline(
    task="ner",
    tokenizer=tokenizer,
    model=model,
)


def perform_ner_on_text(text, ner_pipeline):
    """Preprocess `text` and run the given NER pipeline on the result.

    Args:
        text: Raw text; it is passed through `preprocess_text` first.
        ner_pipeline: Callable invoked with the preprocessed text.

    Returns:
        Whatever `ner_pipeline` returns for the preprocessed text.
    """
    return ner_pipeline(preprocess_text(text))

# Run NER over the 'content' column of each frame and store the result in that
# frame's own results column.
for ner_frame, ner_column in (
    (transcribed_data_set_df, 'ner_results'),
    (raw_data_set_df, 'ner_results_raw'),
):
    ner_frame[ner_column] = ner_frame['content'].apply(perform_ner_on_text, ner_pipeline=perform_ner)

tokenizer_config.json:   0%|          | 0.00/400 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [24]:
# Compare NER on one set of LLM-corrected files against NER on the transcripts.
# TODO: this should be done within a loop, as there are many models to test.
# NOTE(review): `corrected_folder` is not defined in the cells visible here —
# confirm it is set before this cell runs.
corrected_model_folder = 'claude_temp_claude-3-opus-20240229'
LM_corrected_df = files_to_df_func(os.path.join(corrected_folder, corrected_model_folder))

# merge() returns a new frame, so the transcript frame needs no defensive copy.
corrected_content = LM_corrected_df.loc[:, ['content', 'file_name']].rename(columns={'content': 'content_corrected'})
temp_ner = transcribed_data_set_df.merge(corrected_content, on='file_name')

temp_ner['ner_results_corrected'] = temp_ner['content_corrected'].apply(perform_ner_on_text, ner_pipeline=perform_ner)

# Row-wise similarity between entities found in the corrected text and in the transcript.
temp_ner['cosine_sim'] = temp_ner.apply(
    lambda row: calculate_entity_similarity(row['ner_results_corrected'], row['ner_results']),
    axis=1,
)


print("Median cosine similarity:", temp_ner['cosine_sim'].median().round(2))

Median cosine similarity: 0.92


In [26]:
# This cell needs `temp_ner` to still carry the 'ner_results_corrected' column.
# Fail early with an actionable message if the frame has been rebuilt without
# it, rather than raising a bare KeyError from inside evaluate_ner.
if 'ner_results_corrected' not in temp_ner.columns:
    raise KeyError(
        "'ner_results_corrected' is missing from temp_ner; re-run the cell that "
        "builds temp_ner from the LLM-corrected files before evaluating."
    )

results, results_by_tag = evaluate_ner(temp_ner, 'ner_results', 'ner_results_corrected')

print(results)
print(results_by_tag)

KeyError: 'ner_results_corrected'

In [27]:
# Baseline: compare NER on the raw OCR text against NER on the transcripts.
# NOTE: this rebinds `temp_ner` to a frame without a 'ner_results_corrected'
# column, so any evaluation of the corrected text must run before this cell.
# The selected columns need no renaming (there is no 'content' column among
# them), and merge() already returns a new frame, so no copy is needed.
temp_ner = transcribed_data_set_df.merge(
    raw_data_set_df.loc[:, ['ner_results_raw', 'file_name']],
    on='file_name',
)

# Row-wise similarity between entities found in the raw OCR and in the transcript.
temp_ner['cosine_sim'] = temp_ner.apply(
    lambda row: calculate_entity_similarity(row['ner_results_raw'], row['ner_results']),
    axis=1,
)

print("Median cosine similarity:", temp_ner['cosine_sim'].median().round(2))

results, results_by_tag = evaluate_ner(temp_ner, 'ner_results', 'ner_results_raw')

print(results)
print(results_by_tag)

Median cosine similarity: 0.74
{'ent_type': {'correct': 97, 'incorrect': 63, 'partial': 0, 'missed': 588, 'spurious': 545, 'possible': 748, 'actual': 705, 'precision': 0.1375886524822695, 'recall': 0.12967914438502673, 'f1': 0.1335168616655196}, 'partial': {'correct': 16, 'incorrect': 0, 'partial': 144, 'missed': 588, 'spurious': 545, 'possible': 748, 'actual': 705, 'precision': 0.12482269503546099, 'recall': 0.11764705882352941, 'f1': 0.12112869924294564}, 'strict': {'correct': 14, 'incorrect': 146, 'partial': 0, 'missed': 588, 'spurious': 545, 'possible': 748, 'actual': 705, 'precision': 0.019858156028368795, 'recall': 0.01871657754010695, 'f1': 0.019270474879559532}, 'exact': {'correct': 16, 'incorrect': 144, 'partial': 0, 'missed': 588, 'spurious': 545, 'possible': 748, 'actual': 705, 'precision': 0.02269503546099291, 'recall': 0.0213903743315508, 'f1': 0.022023399862353753}}
{'B-LOC': {'ent_type': {'correct': 28, 'incorrect': 15, 'partial': 0, 'missed': 84, 'spurious': 78, 'possib

In [28]:
# Inspect the transcript NER output for the row labelled 11 (one .loc lookup instead of chained indexing).
temp_ner.loc[11, 'ner_results']

[{'entity': 'B-ORG',
  'score': 0.95968384,
  'index': 19,
  'word': '▁Review',
  'start': 89,
  'end': 96}]