In [1]:
# Install the third-party packages used by this notebook into the kernel's environment.
# NOTE(review): versions are unpinned — consider pinning them so a re-run is reproducible.
%pip install praw pandas
%pip install easyocr

Note: you may need to restart the kernel to use updated packages.




Note: you may need to restart the kernel to use updated packages.




In [2]:
import os
import pandas as pd
import requests
import praw
import time
from tqdm import tqdm
import easyocr

# Extract text from the downloaded images with OCR

In [5]:
def _discard_image(image_path):
    """Delete ``image_path``, printing a warning instead of raising if that fails."""
    try:
        os.remove(image_path)
    except OSError as err:
        # A locked or already-removed file must not abort a long OCR run.
        print(f"⚠️ Could not delete {image_path}: {err}")


def perform_ocr(image_folder, csv_file):
    """Run EasyOCR over the images listed in a CSV and store the text in it.

    The CSV is read, an "Extracted Text" column is (re)created, and every row
    is processed using its "File Name" value, resolved inside ``image_folder``:

    * text found        -> joined with spaces and stored in "Extracted Text";
    * no text found     -> the row is dropped and the image is deleted;
    * OCR raised        -> the row is dropped and the image is deleted;
    * file not on disk  -> the row is dropped.

    The surviving rows are re-indexed, each image is renamed so its numeric
    prefix (the part before the first ``_``) equals the 1-based row position,
    and the CSV is overwritten in place.

    Parameters
    ----------
    image_folder : str
        Directory containing the image files named in the CSV.
    csv_file : str
        Path of the CSV to read and overwrite. Must have a "File Name" column.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, including the "Extracted Text" column.

    Raises
    ------
    KeyError
        If the CSV has no "File Name" column. Raised before anything on disk
        is touched; without this check every row would look like a missing
        file and the CSV would be overwritten empty.

    Notes
    -----
    This function is destructive: it deletes images, renames images and
    rewrites ``csv_file``.
    NOTE(review): the CSV is read as ISO-8859-1 but written with ``to_csv``'s
    default encoding — confirm that a second run on the rewritten file still
    decodes non-ASCII text correctly.
    """
    # Read and validate the metadata first so a bad path or schema fails
    # before the (slow) OCR model is loaded.
    df = pd.read_csv(csv_file, encoding="ISO-8859-1")
    if "File Name" not in df.columns:
        raise KeyError(f'"File Name" column not found in {csv_file}')
    df["Extracted Text"] = ""

    reader = easyocr.Reader(['en'])  # Load English model

    rows_to_delete = []
    # Snapshot the names up front; the frame is modified inside the loop.
    pending = list(zip(df.index, (str(value) for value in df["File Name"])))

    for index, image_name in tqdm(pending, total=len(pending), desc="Performing OCR"):
        image_path = os.path.join(image_folder, image_name)

        if not os.path.isfile(image_path):
            rows_to_delete.append(index)
            print(f"❌ File not found: {image_name}")
            continue

        try:
            results = reader.readtext(image_path)
        except Exception as e:
            # OCR failed on this image: discard both the row and the file.
            rows_to_delete.append(index)
            _discard_image(image_path)
            print(f"❌ Error processing {image_name}: {e}")
            continue

        if results:
            # Each result's second element is the recognized text.
            extracted_text = " ".join(result[1] for result in results)
            df.at[index, "Extracted Text"] = extracted_text.strip()
        else:
            rows_to_delete.append(index)
            _discard_image(image_path)
            print(f"❌ No text found. Deleted: {image_name}")

    # Drop invalid rows; the fresh 0..n-1 index drives the renumbering below.
    df = df.drop(index=rows_to_delete).reset_index(drop=True)

    survivors = list(zip(df.index, (str(value) for value in df["File Name"])))
    for new_index, old_name in tqdm(survivors, total=len(survivors), desc="Renaming images"):
        # Replace everything before the first "_" with the new 1-based position.
        new_name = f"{new_index + 1}_{old_name.split('_', 1)[-1]}"
        if new_name == old_name:
            continue

        old_path = os.path.join(image_folder, old_name)
        new_path = os.path.join(image_folder, new_name)
        if not os.path.isfile(old_path):
            continue
        if os.path.exists(new_path):
            # os.rename would overwrite (POSIX) or raise (Windows); keep the
            # old name rather than destroy another file.
            print(f"⚠️ Rename skipped, target already exists: {new_name}")
            continue

        os.rename(old_path, new_path)
        df.at[new_index, "File Name"] = new_name

    df.to_csv(csv_file, index=False)
    print(f"✅ Updated CSV saved with extracted text as: {csv_file}")
    return df

In [None]:
# Metadata CSV passed to perform_ocr as csv_file.
filename_to_save = "metadata.csv"
# Folder holding the images. Built with os.path.join because the previous
# literal contained the invalid escape "\i" and only resolved to the intended
# folder on Windows.
images_output_folder = os.path.join("reddit_meme_data", "images")

In [14]:
# Run OCR over the images listed in the metadata CSV and keep the resulting frame.
# NOTE(review): the recorded output below mentions "ProgrammerHumor.csv", which does
# not match filename_to_save ("metadata.csv") — re-run from a fresh kernel to confirm.
data_df = perform_ocr(
    image_folder=images_output_folder,
    csv_file=filename_to_save,
)

Fetching posts:   0%|          | 0/500 [00:00<?, ?it/s]

Fetching posts: 100%|██████████| 500/500 [22:16<00:00,  2.67s/it]


Data saved to ProgrammerHumor.csv


Downloading images: 100%|██████████| 500/500 [02:16<00:00,  3.67it/s]
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.



Total invalid/failed downloads: 23
Row 34 - URL: https://i.redd.it/8k5jsyqlbhpe1.gif deleted.
Row 46 - URL: https://www.reddit.com/gallery/1ject17 deleted.
Row 102 - URL: https://i.imgur.com/BoXRNqO.jpeg deleted.
Row 117 - URL: https://i.redd.it/xpstbze5b5pe1.gif deleted.
Row 155 - URL: https://imgur.com/YRkb2P3 deleted.
Row 165 - URL: https://i.redd.it/wywk3m74aooe1.gif deleted.
Row 175 - URL: https://github.com/giacomo-b/rust-stakeholder deleted.
Row 178 - URL: https://imgur.com/a/aXzlNxZ deleted.
Row 215 - URL: https://i.redd.it/832i90ehpgoe1.gif deleted.
Row 237 - URL: https://www.reddit.com/gallery/1ja8ukv deleted.
Row 239 - URL: https://youtube.com/shorts/egUwZAyY0sk?feature=share deleted.
Row 249 - URL: https://i.redd.it/p1i5pqf782oe1.gif deleted.
Row 262 - URL: https://youtu.be/kqa_EiQMN3Y deleted.
Row 303 - URL: https://www.reddit.com/gallery/1j7s4uj deleted.
Row 329 - URL: https://www.reddit.com/gallery/1j71w40 deleted.
Row 332 - URL: https://imgur.com/a/XiAnXQl deleted.
Row

Performing OCR:  29%|██▉       | 139/477 [18:28<36:45,  6.52s/it]  

❌ No text found. Deleted: 139_joekowski_ProgrammerHumor.png


Performing OCR:  49%|████▉     | 233/477 [27:27<19:00,  4.68s/it]  

❌ No text found. Deleted: 233_TallStrike6226_ProgrammerHumor.jpeg


Performing OCR:  57%|█████▋    | 270/477 [30:06<08:16,  2.40s/it]

❌ No text found. Deleted: 270_invisiblebeing_ProgrammerHumor.png


Performing OCR:  65%|██████▍   | 308/477 [32:03<09:20,  3.32s/it]

❌ No text found. Deleted: 308_mcnello_ProgrammerHumor.jpeg


Performing OCR: 100%|██████████| 477/477 [42:00<00:00,  5.29s/it]
Renaming images: 100%|██████████| 473/473 [00:00<00:00, 1289.69it/s]


✅ Updated CSV saved with extracted text as: ProgrammerHumor.csv
