# OCR Quality Check

The OCR quality of the data is not ideal, especially for low quality images. In this notebook we are evaluating the quality of the original OCR and the newly generated one by ChatGPT4o

In [1]:
import spacy
import spacy.cli

# Download the large English model if it is not already installed
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    spacy.cli.download("en_core_web_lg")

nlp = spacy.load("en_core_web_lg")

In [52]:
def count_known_words(text: str, nlp):
    text = text.encode('utf-8', 'replace').decode()
    text = text.replace('\n', ' ').replace('\r', '').lower()
    # Process the text
    doc = nlp(text)

    # Count known words
    known_words = sum(1 for token in doc if not token.is_oov)
    total_words = len(doc)

    # unrecognized = [token for token in doc if token.is_oov]
    # if unrecognized != []:
    #     print(unrecognized)

    return known_words, total_words

In [39]:
from pathlib import Path

import IPython.display
import pandas as pd

import meme_entity_detection.dataset

data_dir = Path("../../../data/HVVMemes/")
dataset = meme_entity_detection.dataset.DataModule(data_dir, balance_train_dataset=False)
dataset.setup("_")
IPython.display.clear_output(wait=False)

train_df = dataset.train_dataset.data_df
validation_df = dataset.validation_dataset.data_df
test_df = dataset.test_dataset.data_df

# Concatenating all dataframes to get a combined view
combined_df = pd.concat([
    train_df.assign(dataset='train'),
    validation_df.assign(dataset='validation'),
    test_df.assign(dataset='test')
])[["sentence", "original", "dataset", "image", "word"]].drop_duplicates()

combined_df["image_path"] = str(data_dir) + "/images/" + combined_df["image"]

entities_in_image = combined_df.groupby("image")["word"].apply(list).reset_index()
combined_df = combined_df[["sentence", "original", "dataset", "image", "image_path"]].drop_duplicates()
combined_df = combined_df.merge(entities_in_image, on="image")

In [48]:
word_counter = combined_df["sentence"].apply(lambda x: count_known_words(x, nlp))
combined_df["correct words"] = word_counter.apply(lambda x: x[0])
combined_df["all words"] = word_counter.apply(lambda x: x[1])

correct_words_ratio = (combined_df["correct words"].sum() / combined_df["all words"].sum())
print(f"Correct words ratio of existing data:  {round(correct_words_ratio*100, 2)} %")

Correct words ratio of existing data:  92.87 %


In [53]:
df_openai = pd.read_json(data_dir.parent / f'ocr_data_openai_preprocessed.json').T
df_openai = df_openai.dropna()

word_counter = df_openai["OCR"].apply(lambda x: count_known_words(x, nlp))
df_openai["correct words"] = word_counter.apply(lambda x: x[0])
df_openai["all words"] = word_counter.apply(lambda x: x[1])

correct_words_ratio = (df_openai["correct words"].sum() / df_openai["all words"].sum())
print(f"Correct words ration of ocr preprocessed data:  {round(correct_words_ratio*100, 2)} %")

[tyt]
[ww3]
[translate.google.com]
[memegenerator.net]
[3/4/20]
[@cnn, cnn.it/2lqrrd7]
[noooooooooooo]
[dogald, memeshappen.com]
[paralkar, @vikramparalkar]
[politifake.org]
[desicomments.com]
[motherf***er]
[@cutememelover, covid20]
[@b_obamamemes]
[everyon]
[thefunnybeaver.com]
[@nypost, trib.al/ig4qdni, @greenpartyofde]
[3999.99]
[  ,   ,   ,   ]
[              ]
[@ratm,  ]
[slavs]
[@bidenthoughts]
[coronavirusnyc]
[http://www.crowdfunder.co.uk/green-party, -election, cornwall/]
[ ]
[wuricatoday.com]
[bipartisanreport.com]
[@jerrytravone]
[aggrievement, aggrievement, condaleeza]
[hannity]
[www.facebook.com/fkv32]
[ ,  ]
[momoa, nigmenog, grumbletonian, boredpanda.com]
[fb.com/jackandjones]
[ ]
[570k]
[boredpanda.com]
[tiffa]
[dnc]
[simran, staysafefromcoronavirus, mememe, @sansr]
[hagedorn, hagedorn]
[@culturedruffian]
[stonks]
[zzzzzzzzzz]
[-rep]
[ ,  ]
[greenparty]
[coronavaccine, russianvaccine]
[   ,  ]
[duush]
[@turntfortom]
[petchesky, @barry·nov]
[quarentine]
[like4like, bush