In [3]:
import ast
import base64
import getpass
import json
from pathlib import Path

import pandas as pd
import spacy
from IPython.display import clear_output
from openai import OpenAI
from spacy.cli import download
from tqdm import tqdm

from meme_entity_detection.dataset.data_module import DataModule

# Load the large English spaCy model, downloading it first if it is not installed.
# The model is loaded exactly once: either directly, or right after the download.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # The load failed with OSError, which this cell treats as "model not installed".
    download("en_core_web_lg")
    # NOTE(review): after a fresh download the installer output suggests restarting the
    # kernel if the model's dependencies fail to load.
    nlp = spacy.load("en_core_web_lg")


  from .autonotebook import tqdm as notebook_tqdm


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
from pathlib import Path

import IPython.display
import pandas as pd

import meme_entity_detection.dataset

# Build the data module from the local data directory and prepare its splits.
data_dir = Path("../../../data/HVVMemes/")
dataset = meme_entity_detection.dataset.DataModule(data_dir, balance_train_dataset=False)
dataset.setup("_")
# Wipe the cell output produced so far (presumably log lines printed by setup()).
IPython.display.clear_output(wait=False)

train_df = dataset.train_dataset.data_df
validation_df = dataset.validation_dataset.data_df
test_df = dataset.test_dataset.data_df

# Stack the three splits into one frame, tagging every row with the split it came from.
split_frames = {"train": train_df, "validation": validation_df, "test": test_df}
combined_df = pd.concat(
    [frame.assign(dataset=split_name) for split_name, frame in split_frames.items()]
)
selected_columns = ["sentence", "original", "dataset", "image", "word"]
combined_df = combined_df[selected_columns].drop_duplicates()

combined_df["image_path"] = str(data_dir) + "/images/" + combined_df["image"]

# Gather the "word" values of each image into one list per image, then reduce the frame
# to its per-image columns and attach that list back as the "word" column.
entities_in_image = combined_df.groupby("image")["word"].apply(list).reset_index()
image_columns = ["sentence", "original", "dataset", "image", "image_path"]
combined_df = combined_df[image_columns].drop_duplicates()
combined_df = combined_df.merge(entities_in_image, on="image")
combined_df

# Check for OCR Quality:

In [6]:
# Score every OCR sentence. The result of count_known_words is indexed below as
# [0] -> "correct words" and [1] -> "all words".
# NOTE(review): count_known_words is not defined in any cell visible here; confirm it is
# defined earlier so this cell survives Restart & Run All.
word_counter = combined_df["sentence"].apply(count_known_words)
for position, column in enumerate(["correct words", "all words"]):
    combined_df[column] = word_counter.apply(lambda counts, position=position: counts[position])

# Corpus-level ratio: summed correct words over summed total words.
known_total = combined_df["correct words"].sum()
word_total = combined_df["all words"].sum()
correct_words_ratio = known_total / word_total
print(f"Correct words ration of existing data:  {round(correct_words_ratio*100, 2)} %")

Correct words ration of existing data:  92.87 %


# ChatGPT API Calls

In [7]:
# Ask for the OpenAI API key without echoing it, so the secret is never shown in (or
# saved with) the notebook output; then clear whatever the prompt left in the cell.
api_key = getpass.getpass("Please insert your api key:")
clear_output()

In [8]:
def encode_image(image_path: str) -> str:
  """Read an image file and return its contents as base64 text.

  Args:
    image_path: Location of the image file; passed directly to `open`.

  Returns:
    The file's raw bytes, base64-encoded and decoded into a UTF-8 `str`.

  Raises:
    OSError: If the file cannot be opened or read.
  """
  with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')
  
def build_prompt(entities):
  """Build the text prompt asking the model for OCR, an image description and entity classes.

  Args:
    entities: Iterable of entity names to classify. Duplicate names collapse into one
      entry because the names are used as dictionary keys.

  Returns:
    The prompt string. It ends with a JSON template containing the keys "OCR",
    "IMAGE_DESCRIPTION" and "CLASSIFICATION", the latter mapping every entity to a
    placeholder class.
  """
  entities_dict = {entity: "fill in the class here" for entity in entities}

  # Serialize the template with json.dumps so the model is shown valid JSON: every key is
  # properly quoted and each entity sits on its own line. ensure_ascii=False keeps entity
  # names exactly as given, since the prompt forbids the model from renaming them.
  json_format = json.dumps(
    {
      "OCR": "OCR OF THE IMAGE",
      "IMAGE_DESCRIPTION": "DESCRIPTION_OF_IMAGE_CONTENTS",
      "CLASSIFICATION": entities_dict,
    },
    indent=2,
    ensure_ascii=False,
  )

  # NOTE(review): the instruction wording below (typos and the "villian" label included)
  # is kept verbatim; rewording it may change the model's answers and the class names
  # that later cells receive.
  text = f"""Process the image and fill the following json-object in the follwing schema. 
  You should OCR, image description and classification. Do net return anything else than the json.
  Do not change the format of the json. If you can not fill in the json, return a None inside of the json:
                      
  For the classification note that each entitly must exactly have one class.
  Do not change the name of the entities, even if they are misspelled.
                      
  The classes you can choose from are:
  "other", "villian", "victim", "hero"
                      
                      
  Here is the json template:
                      
  {json_format}
                              
                      
  """

  return text
  

In [77]:
def send_api_call(image: str, entities: list):
  """Send one image plus the classification prompt to the "gpt-4o" chat model.

  Args:
    image: Base64-encoded image data; it is embedded in a
      `data:image/jpeg;base64,...` URL.
    entities: Entity names forwarded to `build_prompt` to create the text part.

  Returns:
    The message content of the first completion choice.

  Notes:
    Reads the module-level `api_key` and creates a new client on every call.
    Errors raised by the client are not caught here; callers handle them.
  """
  client = OpenAI(api_key=api_key)

  # A single user message carrying two parts: the text instructions and the image.
  prompt_part = {
    "type": "text",
    "text": build_prompt(entities)
  }
  image_part = {
    "type": "image_url",
    "image_url": {
      "url": f"data:image/jpeg;base64,{image}"
    }
  }

  response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
      {
        "role": "user",
        "content": [prompt_part, image_part],
      }
    ],
    max_tokens=800,  # upper bound on the length of the generated answer
  )

  return response.choices[0].message.content

In [23]:
json_path = data_dir.parent / 'ocr_data_openai.json'

# Resume from previously cached responses when the cache file exists; start with an
# empty cache on the very first run instead of failing on the missing file.
if json_path.exists():
    with open(json_path, "r") as f:
        ocr_data = json.load(f)
else:
    ocr_data = {}

# total= lets tqdm show a real progress bar; zip() alone has no length.
for sentence, entities, image, image_path in tqdm(
    zip(combined_df["sentence"].tolist(), combined_df["word"],
        combined_df["image"].tolist(), combined_df["image_path"].tolist()),
    total=len(combined_df),
):
    # Images that already have a cached entry are not sent again.
    if image in ocr_data:
        continue

    encoded_image = encode_image(image_path)
    try:
        content = send_api_call(encoded_image, entities)
        ocr_data[image] = content
    except Exception:
        # Best effort: store a sentinel for the failed call so the run continues.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        ocr_data[image] = "API Error"

    # Save the OCR data after each iteration so an interrupted run loses no responses.
    with open(json_path, 'w') as f:
        json.dump(ocr_data, f)

6716it [7:26:26,  3.99s/it]


In [69]:
# Split the raw model responses into parsed dictionaries and unparseable leftovers.
ocr_data_cleaned = dict()
error_data = dict()
for k, v in tqdm(ocr_data.items()):
    try:
        # Keep everything from the first "{" onwards, dropping any text in front of it.
        keys_filtered_a = "{" + "{".join(v.split("{")[1:])
        # Drop the text after the last "}" and remove every "}", then append exactly two
        # closing braces (outer object + nested CLASSIFICATION object). Also repair the
        # '"IMAGE_DESCRIPTION:' key that lacks its closing quote.
        keys_filtered_b = ("".join(keys_filtered_a.split("}")[:-1]) + "}" + "}").replace('"IMAGE_DESCRIPTION:', '"IMAGE_DESCRIPTION":')
        # literal_eval only evaluates Python literals, so the response cannot run code.
        ocr_data_cleaned[k] = ast.literal_eval(keys_filtered_b)
    except Exception:
        # Anything that cannot be split or parsed is collected for a later retry.
        # `Exception` (not a bare except) keeps kernel interrupts working.
        error_data[k] = v

100%|██████████| 6716/6716 [00:00<00:00, 64601.15it/s]


In [78]:
# Re-query only the images whose stored response is listed in error_data.
error_df = combined_df[combined_df["image"].isin(error_data)]

# total= lets tqdm show a real progress bar; zip() alone has no length.
for sentence, entities, image, image_path in tqdm(
    zip(error_df["sentence"].tolist(), error_df["word"],
        error_df["image"].tolist(), error_df["image_path"].tolist()),
    total=len(error_df),
):
    encoded_image = encode_image(image_path)
    try:
        content = send_api_call(encoded_image, entities)
        # Overwrite the earlier, unusable entry for this image.
        ocr_data[image] = content
    except Exception:
        # Best effort: store a sentinel for the failed call so the run continues.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        ocr_data[image] = "API Error"

    # Save the OCR data after each iteration
    with open(json_path, 'w') as f:
        json.dump(ocr_data, f)

80it [05:44,  4.30s/it]


In [79]:
# Rebuild both dictionaries from scratch from the current contents of ocr_data.
ocr_data_cleaned = dict()
error_data = dict()

for k, v in tqdm(ocr_data.items()):
    try:
        # Keep everything from the first "{" onwards, dropping any text in front of it.
        keys_filtered_a = "{" + "{".join(v.split("{")[1:])
        # Drop the text after the last "}" and remove every "}", then append exactly two
        # closing braces (outer object + nested CLASSIFICATION object). Also repair the
        # '"IMAGE_DESCRIPTION:' key that lacks its closing quote.
        keys_filtered_b = ("".join(keys_filtered_a.split("}")[:-1]) + "}" + "}").replace('"IMAGE_DESCRIPTION:', '"IMAGE_DESCRIPTION":')
        # literal_eval only evaluates Python literals, so the response cannot run code.
        ocr_data_cleaned[k] = ast.literal_eval(keys_filtered_b)
    except Exception:
        # Responses that still cannot be split or parsed stay in error_data.
        # `Exception` (not a bare except) keeps kernel interrupts working.
        error_data[k] = v

100%|██████████| 6716/6716 [00:00<00:00, 58192.73it/s]


In [87]:
# Write the successfully parsed responses to a JSON file in the parent of the data directory.
json_path_filtered = data_dir.parent / "ocr_data_openai_preprocessed.json"

with json_path_filtered.open("w") as out_file:
    json.dump(ocr_data_cleaned, out_file)