In [1]:
import ast
import base64
import getpass
import json
from pathlib import Path

import pandas as pd
import spacy
from IPython.display import clear_output
from openai import OpenAI
from spacy.cli import download
from tqdm import tqdm

from meme_entity_detection.dataset.data_module import DataModule

# Load the large English spaCy model, downloading it first if it is not
# installed. The model is loaded exactly once on either path.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # spacy.load signals a missing model package with OSError; fetch it and retry.
    download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from pathlib import Path

import IPython.display
import pandas as pd

import meme_entity_detection.dataset

# Root folder of the HVVMemes data, relative to this notebook.
data_dir = Path("../../../data/HVVMemes/")

# Build the data module with train-set balancing switched off, run its setup
# step, then wipe whatever the setup printed into the cell output.
dataset = meme_entity_detection.dataset.DataModule(
    data_dir,
    balance_train_dataset=False,
)
dataset.setup("_")
IPython.display.clear_output(wait=False)

train_df = dataset.train_dataset.data_df
validation_df = dataset.validation_dataset.data_df
test_df = dataset.test_dataset.data_df

# Stack the three splits, tagging every row with the split it came from, and
# keep one row per distinct (meme, entity) combination.
labelled_splits = [
    train_df.assign(dataset='train'),
    validation_df.assign(dataset='validation'),
    test_df.assign(dataset='test'),
]
pair_columns = ["sentence", "original", "dataset", "image", "word"]
combined_df = pd.concat(labelled_splits)[pair_columns].drop_duplicates()

combined_df["image_path"] = str(data_dir) + "/images/" + combined_df["image"]

# Gather all entity words that belong to the same image into one list.
entities_in_image = (
    combined_df
    .groupby("image")["word"]
    .apply(list)
    .reset_index()
)

# Collapse to one row per meme and attach its entity list as the "word" column.
meme_columns = ["sentence", "original", "dataset", "image", "image_path"]
combined_df = (
    combined_df[meme_columns]
    .drop_duplicates()
    .merge(entities_in_image, on="image")
)
combined_df

Unnamed: 0,sentence,original,dataset,image,image_path,word
0,realtor: can you see yourself being quarantine...,REALTOR: CAN YOU SEE\nYOURSELF BEING QUARANTIN...,train,covid_memes_4999.png,../../../data/HVVMemes/images/covid_memes_4999...,"[realtor, quarantine, quaranitned]"
1,when you connect your charger on 1% chárge but...,When you connect your charger on\n1% chárge bu...,train,covid_memes_4355.png,../../../data/HVVMemes/images/covid_memes_4355...,"[charger, phone, cats]"
2,i tyt tyt reacts: first presidential debate ne...,I TYT TYT REACTS: FIRST PRESIDENTIAL DEBATE\nN...,train,memes_4435.png,../../../data/HVVMemes/images/memes_4435.png,"[presidential debate, joe biden, donald trump,..."
3,the democratic party is socialist. i am the de...,The Democratic Party is socialist.\nI am the D...,train,memes_1692.png,../../../data/HVVMemes/images/memes_1692.png,"[democratic party, socialist, joe biden, donal..."
4,every disaster movie starts with the governmen...,EVERY\nDISASTER\nMOVIE STARTS\nWITH THE\nGOVER...,train,covid_memes_1094.png,../../../data/HVVMemes/images/covid_memes_1094...,"[government, scientist, people]"
...,...,...,...,...,...,...
6711,600 recorded reguests for additional security ...,600 recorded reguests for\nadditional security...,test,memes_1585.png,../../../data/HVVMemes/images/memes_1585.png,"[hillary clinton, amb chris stevens]"
6712,i am the democrat party! if you say so joe!,I AM THE DEMOCRAT PARTY!\nIF YOU SAY SO JOE!\n,test,memes_1312.png,../../../data/HVVMemes/images/memes_1312.png,"[bernie sanders, kamala harris, joe biden, dem..."
6713,donald trump's incompetence is becoming all to...,Donald Trump's incompetence is\nbecoming all t...,test,covid_memes_5577.png,../../../data/HVVMemes/images/covid_memes_5577...,"[donald trump, us, covid19, mike pence]"
6714,rammys oflammys 12h hini vs covid-19 cases - c...,Rammys Oflammys 12h\nHINI Vs Covid-19\nCases -...,test,covid_memes_5583.png,../../../data/HVVMemes/images/covid_memes_5583...,"[h1n1, h1n1 vs covid19, covid19]"


# ChatGPT API Calls

In [6]:
# Ask for the OpenAI API key without echoing it: getpass hides the typed value,
# whereas input() would display the secret in the notebook front end.
api_key = getpass.getpass("Please insert your api key:")
# Clear the cell output so no trace of the prompt is saved with the notebook.
clear_output()

In [8]:
def encode_image(image_path: str) -> str:
  """Read an image file and return its contents as a base64 string.

  Args:
    image_path: Path of the image file to read (anything ``open`` accepts).

  Returns:
    The raw file bytes, base64-encoded and decoded to a UTF-8 ``str``.

  Raises:
    OSError: If the file cannot be opened or read.
  """
  with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')
  
def build_prompt(entities):
  """Build the text prompt that accompanies a meme image.

  The prompt asks the model to return a single JSON object containing the OCR
  text, an image description, and one role label per entity.

  Args:
    entities: Iterable of entity names that occur in the meme. Each becomes a
      key of the "CLASSIFICATION" object in the JSON template.

  Returns:
    The full prompt text, ending with the JSON template to be filled in.
  """
  entities_dict = {entity: "fill in the class here" for entity in entities}

  # Render the per-entity placeholders as real JSON (double quotes), one entry
  # per line. ensure_ascii=False keeps entity names exactly as given, since the
  # prompt tells the model not to alter them.
  classification_template = json.dumps(
    entities_dict, ensure_ascii=False, separators=(",\n ", ": ")
  )

  # The template must itself be valid JSON, so every key is fully quoted.
  json_format = """{"OCR": "OCR OF THE IMAGE",
  "IMAGE_DESCRIPTION": "DESCRIPTION_OF_IMAGE_CONTENTS",
  "CLASSIFICATION": """ + classification_template + """
  }"""

  # NOTE(review): the instruction wording below is kept verbatim (including its
  # typos) so that responses stay comparable with previously collected ones.
  text = f"""Process the image and fill the following json-object in the follwing schema. 
  You should OCR, image description and classification. Do net return anything else than the json.
  Do not change the format of the json. If you can not fill in the json, return a None inside of the json:
                      
  For the classification note that each entitly must exactly have one class.
  Do not change the name of the entities, even if they are misspelled.

  This task emphasizes detecting which entities are glorified, vilified or victimized, within a meme. Assuming the frame of reference as the meme author’s perspective, the objective is to classify for a given pair of a meme and an entity, whether the entity is being referenced as Hero vs. Villain vs. Victim vs. Other, within that meme.
  Definition of the entity classes:

      Hero: The entity is presented in a positive light. Glorified for their actions conveyed via the meme or gathered from background context
      Villain: The entity is portrayed negatively, e.g., in an association with adverse traits like wickedness, cruelty, hypocrisy, etc.
      Victim: The entity is portrayed as suffering the negative impact of someone else’s actions or conveyed implicitly within the meme.
      Other: The entity is not a hero, a villain, or a victim.   
              
  The classes you can choose from are:
  "other", "villain", "victim", "hero"
                      
                      
  Here is the json template:
                      
  {json_format}
                              
                      
  """

  return text
  

In [9]:
def send_api_call(image: str, entities: list, mime_type: str = "image/jpeg"):
  """Send one meme image plus its classification prompt to the OpenAI chat API.

  Args:
    image: Base64-encoded image bytes as text (the output of ``encode_image``).
    entities: Entity names to classify; forwarded to ``build_prompt``.
    mime_type: Media type announced in the data URL. Defaults to "image/jpeg",
      the value this function has always sent.
      NOTE(review): the dataset files shown in this notebook end in ".png" —
      consider passing "image/png" for them.

  Returns:
    The message content of the first choice in the API response.
  """
  # Uses the module-level ``api_key`` entered earlier in the notebook.
  client = OpenAI(api_key=api_key)

  response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": build_prompt(entities)
          },
          {
            # The image is sent inline as a base64 data URL.
            "type": "image_url",
            "image_url": {
              "url": f"data:{mime_type};base64,{image}"
            }
          },
        ],
      }
    ],
    max_tokens=800,
  )

  return response.choices[0].message.content

In [11]:
json_path = data_dir.parent / 'ocr_data_openai.json'

# Resume from previously saved responses when the cache file exists; otherwise
# start with an empty cache instead of failing on the missing file.
if json_path.exists():
    with open(json_path, "r") as f:
        ocr_data = json.load(f)
else:
    ocr_data = {}

test_only_df = combined_df[combined_df["dataset"] == "test"]

# True: query every test image again and overwrite any cached response.
# False: skip images that already have an entry in ocr_data.
overwrite_existing = True

rows = zip(
    test_only_df["word"],
    test_only_df["image"].tolist(),
    test_only_df["image_path"].tolist(),
)

# Passing total lets tqdm show a real progress bar; zip() has no length.
for entities, image, image_path in tqdm(rows, total=len(test_only_df)):
    if not overwrite_existing and image in ocr_data:
        continue

    encoded_image = encode_image(image_path)
    try:
        content = send_api_call(encoded_image, entities)
        ocr_data[image] = content
    except Exception as exc:
        # Keep going on a failed request, but report it. Catching Exception
        # (not a bare except) lets the loop still be interrupted with Ctrl-C.
        tqdm.write(f"API call failed for {image}: {exc!r}")
        ocr_data[image] = "API Error"

    # Save the OCR data after each iteration so an interrupted run loses nothing.
    with open(json_path, 'w') as f:
        json.dump(ocr_data, f)

718it [52:20,  4.37s/it]


In [6]:
def _parse_model_response(raw):
    """Extract the dict embedded in one raw model response.

    Args:
        raw: The stored response text for one image.

    Returns:
        The parsed response as a ``dict``.

    Raises:
        ValueError: If ``raw`` is not a string, contains no "{", or none of the
            parsing attempts yields a dict.
    """
    if not isinstance(raw, str):
        raise ValueError("response is not text")

    start = raw.find("{")
    if start == -1:
        raise ValueError("response contains no opening brace")
    # Drop everything in front of the first "{".
    body = raw[start:]

    candidates = []
    end = body.rfind("}")
    if end != -1:
        # Preferred: the exact span from the first "{" to the last "}".
        candidates.append(body[:end + 1])
    # Fallback: the earlier repair heuristic, which strips every "}" and then
    # appends two. It rescues responses that lack the final closing brace, but
    # also removes braces inside text values, so it is only tried second.
    candidates.append("".join(body.split("}")[:-1]) + "}" + "}")

    for candidate in candidates:
        # Repair the key that is missing its closing quote.
        candidate = candidate.replace('"IMAGE_DESCRIPTION:', '"IMAGE_DESCRIPTION":')
        # json.loads handles JSON literals (null/true/false); ast.literal_eval
        # handles Python-style output (single quotes, None).
        for parse in (json.loads, ast.literal_eval):
            try:
                parsed = parse(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, dict):
                return parsed

    raise ValueError("response could not be parsed into a dict")


# Successfully parsed responses, keyed like ocr_data.
ocr_data_cleaned = dict()
# Raw responses that could not be parsed, kept for inspection.
error_data = dict()

for k, v in tqdm(ocr_data.items()):
    try:
        ocr_data_cleaned[k] = _parse_model_response(v)
    except ValueError:
        error_data[k] = v

100%|██████████| 6716/6716 [00:00<00:00, 68630.76it/s]


In [9]:
# Write the successfully parsed responses next to the raw response file.
json_path_filtered = data_dir.parent / 'ocr_data_openai_preprocessed.json'

serialized_cleaned = json.dumps(ocr_data_cleaned)
json_path_filtered.write_text(serialized_cleaned)