In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from captioning.utils import IMG_DIR, ANSWER_DIR, CAPTION_DIR

In [20]:
assert IMG_DIR.exists(), "Image dir does not exist"
assert ANSWER_DIR.exists(), "Answer dir does not exist"
assert CAPTION_DIR.exists(), "Caption dir does not exist"

In [8]:
def process_image(file_path):
    try:
        image = Image.open(file_path)
        width, height = image.size
        return (os.path.basename(file_path), width, height)
    except Exception as e:
        print(f"Could not process image {file_path}: {e}")
        return None


def fetch_and_plot_image_resolutions_from_directory(directory_path):
    # Collect all image files
    image_files = [
        os.path.join(directory_path, filename)
        for filename in os.listdir(directory_path)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff'))
    ]

    # Process images concurrently
    image_dimensions = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_image, file) for file in image_files]
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            if result is not None:
                image_dimensions.append(result)

    return pd.DataFrame(image_dimensions, columns=['filename', 'width', 'height'])


df = fetch_and_plot_image_resolutions_from_directory(IMG_DIR)

100%|██████████| 14394/14394 [00:01<00:00, 11564.66it/s]


In [10]:
min_width = 640
min_height = 640
max_width = 1280
max_height = 1280

In [11]:
df = df[
    (df.width > min_width)
    & (df.height > min_height)
    # & (df.width < max_width)
    # & (df.height < max_height)
    ]
df

Unnamed: 0,filename,width,height
16,4329619655lPRD1BS.png,646,846
27,0924439125wz8N1ky.png,662,821
35,5929810948dhfR5xk.png,662,919
44,51177045em6OATO.PNG,723,835
64,5229809636qwDXsK6.png,656,869
...,...,...,...
14379,1029672908Hm0bn6k.png,716,804
14382,3613495718IBa8ko3.png,657,900
14383,5830887043vAcBTAd.jpg,1350,2025
14384,0899770210P0sUr8.png,1130,690


In [12]:
df.shape

(840, 3)

In [15]:
# Iterate over the filename column and parse "description" from ANSWER_DIR / filename. Save to a new column "description"
df_desc = df.copy(True)
df_desc['description'] = df['filename'].apply(lambda x: ((ANSWER_DIR / x).with_suffix(".txt")).read_text())

In [17]:
df_desc

Unnamed: 0,filename,width,height,description
16,4329619655lPRD1BS.png,646,846,The image shows a woman with blonde hair weari...
27,0924439125wz8N1ky.png,662,821,"In the image, a woman is sitting on the floor ..."
35,5929810948dhfR5xk.png,662,919,The image shows a woman wearing a red lace bra...
44,51177045em6OATO.PNG,723,835,The image shows a woman with blonde hair sitti...
64,5229809636qwDXsK6.png,656,869,"In the image, a woman is standing in a bathroo..."
...,...,...,...,...
14379,1029672908Hm0bn6k.png,716,804,"In the image, a woman is sitting on a black an..."
14382,3613495718IBa8ko3.png,657,900,"A woman stands in a cozy kitchen, her back to ..."
14383,5830887043vAcBTAd.jpg,1350,2025,The image shows a woman with vibrant red hair ...
14384,0899770210P0sUr8.png,1130,690,The image depicts a woman lying naked on a bed...


In [22]:
CAPTION_DIR

PosixPath('captioning')

In [21]:
df.to_csv(CAPTION_DIR / "filenames_with_descriptions.csv", index=False)