# OCR post-processing via the OpenAI Chat Completions API (GPT-3.5)

In [None]:
import os
import numpy as np
import joblib
from tqdm import tqdm
from glob import glob
import os
import openai
from openai import OpenAI

from eval import cosinesim

# Load the OpenAI API key from a local text file and build the API client.
with open('openai_api_key.txt', "r") as file:
    # Strip surrounding whitespace: a trailing newline (which most editors
    # append) would otherwise become part of the key sent to the API.
    openai.api_key = file.read().strip()

# Export the key through the environment as well, which is where the client
# below reads it from.
os.environ["OPENAI_API_KEY"] = openai.api_key
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
)

# Root directory that holds the ground-truth and OCR output folders.
# An empty string makes every path below relative to the working directory.
data_dir = r''
gt_dir = os.path.join(data_dir, 'input_sample_groundtruth')

# OCR output folders to post-process.
pred_dirs = [
    os.path.join(data_dir, 'ocr_output'),
    os.path.join(data_dir, 'ocr_output_scalefactor4'),
    os.path.join(data_dir, 'ocr_output_scalefactor6'),
]

# glob() returns files in arbitrary order; sort them so that selecting files
# by enumeration index picks the same files on every run and on every machine.
gt_fns = sorted(glob(os.path.join(gt_dir, '*.txt')))

# For every OCR output folder: send each OCR'd text in the current batch to
# gpt-3.5-turbo for cleanup, save the cleaned text next to the data, score it
# against the ground truth, and finally dump the collected scores.
#
# NOTE(review): `model`, `tokenizer` and `device` are not defined in the
# visible part of this file -- presumably set up in another cell; confirm
# before running this on its own.
for pred_dir in pred_dirs:
    ocr_cosine_sim_list = []
    gpt_cosine_sim_list = []

    # Name all outputs after the prediction folder itself rather than after
    # the last file visited, so the names are valid even when no file is
    # processed. normpath drops any trailing separator before basename.
    pred_name = os.path.basename(os.path.normpath(pred_dir))
    save_dir = os.path.join(data_dir, f'gptcleaned_{pred_name}')

    # Wrapping the list (not the enumerate object) lets tqdm show a total.
    for i, gt_fn in enumerate(tqdm(gt_fns)):
        # Current batch window: only indices 201..400 (inclusive) are handled.
        if i <= 200:
            continue
        if i > 400:
            break

        pred_fn = os.path.join(pred_dir, os.path.basename(gt_fn))
        # Skip ground-truth files that have no OCR output in this folder.
        if not os.path.isfile(pred_fn):
            continue

        with open(gt_fn, "r") as file:
            gt_text = file.read()

        with open(pred_fn, "r") as file:
            pred_text = file.read()

        # NOTE: scoring of the raw OCR text is disabled, so
        # ocr_cosine_sim_list stays empty for the whole run.
        # ocr_cosine_sim = cosinesim.get_cosine_sim_bert_single(gt_text, pred_text, model, tokenizer, device=device)

        # clean up using gpt 3.5 (few-shot prompt followed by the OCR text)
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"The following test is from ocr. There are some errors please fix them and return the text with no additional correspondence. Here are some examples: \n raw: As Lipperean was writing propaganda, Harald Lasewell was ondenaking empirical aralywes of propaganda. La fect, munca af the propagemila that Lasswell wees gaining us actually being writes by Lippresnn Hints [Regers, [P. \n cleaned: As Lippmann was writing propaganda, Harold Lasswell was undertaking empirical analyses of propaganda. In fact, much of the propaganda that Lasswell was examining was actually being written by Lippmann himself (Rogers, 1994). \n raw: Vaderstantieg of the nate peble:, and the wesesaity od pomp S ¢Eseragt, 1668, p22]. Encrnyn (1686) Limeel sede sine prepaeea tts ‘m inffueasing the engineering of concent. \n cleaned: Understanding of the public, and the necessity of attention-generating propaganda in influencing public opinion (Bernays, 1955, p.22). Bernays (1955) himself made a statement regarding his phrase, the engineering of consent public opinion (Bernays, 1955, p.22). \n raw: The theories developed by Lippmann, Lospwell, Eilul, and Bernays are importand jor a emuiree reasons. \n cleaned: The theories developed by Lippmann, Lasswell, Ellul, and Bernays are important for a number of reasons. \n now do that for this:  {pred_text}",
                }
            ],
            model="gpt-3.5-turbo",
        )

        assistant_reply = chat_completion.choices[0].message.content

        # Created lazily so an empty batch leaves no empty folder behind;
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, os.path.basename(pred_fn))
        with open(save_path, "w") as file:
            file.write(assistant_reply)

        gpt_cosine_sim = cosinesim.get_cosine_sim_bert_single(gt_text, assistant_reply, model, tokenizer, device=device)
        # ocr_cosine_sim_list.append(ocr_cosine_sim)
        gpt_cosine_sim_list.append(gpt_cosine_sim)

    # save similarity lists
    joblib.dump(ocr_cosine_sim_list, os.path.join(data_dir, f"ocr_cosine_sim_list_{pred_name}.pkl"))
    joblib.dump(gpt_cosine_sim_list, os.path.join(data_dir, f"gpt_cosine_sim_list_{pred_name}.pkl"))

    # np.mean([]) warns and yields nan; report nan directly for empty lists.
    ocr_mean = np.mean(ocr_cosine_sim_list) if ocr_cosine_sim_list else float('nan')
    gpt_mean = np.mean(gpt_cosine_sim_list) if gpt_cosine_sim_list else float('nan')
    print(f'Raw had a sim score of {ocr_mean} while gpt parsed got {gpt_mean}')

        