In [1]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
from pathlib import Path
import plotly.express as px
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
IMG_DIR = Path("/teamspace/uploads")
ANSWER_DIR = Path("./milfusion/captioning/_desc")

assert IMG_DIR.exists(), "Image dir does not exist"
assert ANSWER_DIR.exists(), "Answer dir does not exist"

In [3]:
def process_image(file_path):
    try:
        image = Image.open(file_path)
        width, height = image.size
        return (os.path.basename(file_path), width, height)
    except Exception as e:
        print(f"Could not process image {file_path}: {e}")
        return None

def fetch_and_plot_image_resolutions_from_directory(directory_path):
    # Collect all image files
    image_files = [
        os.path.join(directory_path, filename)
        for filename in os.listdir(directory_path)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff'))
    ]
    
    # Process images concurrently
    image_dimensions = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_image, file) for file in image_files]
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            if result is not None:
                image_dimensions.append(result)

    return pd.DataFrame(image_dimensions, columns=['filename', 'width', 'height'])

df = fetch_and_plot_image_resolutions_from_directory(IMG_DIR)

100%|██████████| 14394/14394 [01:14<00:00, 193.50it/s]


In [4]:
df

Unnamed: 0,filename,width,height
0,00107350022QUaL7b.png,450,450
1,00106719187KZXYr8.png,450,450
2,0010210809urGd1pR.png,450,450
3,0010671932WZK5Iw2.png,306,437
4,0010671922tFLiLN1.png,438,536
...,...,...,...
14389,597903354y5dqsXd.jpg,1019,638
14390,599072157RQG8vh1.png,410,584
14391,5990721494eFZqyd.png,450,450
14392,599072153Z9XJTJx.png,470,384


In [5]:
min_width = 640
min_height = 640
max_width = 1280
max_height = 1280

In [6]:
df = df[
    (df.width > min_width)
    & (df.height > min_height)
    # & (df.width < max_width)
    # & (df.height < max_height)
    ]
df

Unnamed: 0,filename,width,height
30,0014259142INGeJSv.png,665,871
64,0015960404ohJGoft.png,941,833
78,0016754111ciGik2d.png,669,679
82,0016754105udVReoy.png,664,705
94,0023011832WsdXnuL.png,694,934
...,...,...,...
14353,594567107TyTMef6.jpg,1000,1000
14354,594567104YinzFQn.jpg,801,801
14370,5955976444hHp9Tu.jpg,864,650
14386,597778215nI4eKoV.jpg,1584,970


In [7]:
df.shape

(840, 3)