In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from captioning.utils import IMG_DIR, ANSWER_DIR, CAPTION_DIR

In [4]:
assert IMG_DIR.exists(), "Image dir does not exist"
assert CAPTION_DIR.exists(), "Caption dir does not exist"

In [5]:
def process_image(file_path):
    try:
        image = Image.open(file_path)
        width, height = image.size
        return (os.path.basename(file_path), width, height)
    except Exception as e:
        print(f"Could not process image {file_path}: {e}")
        return None


def fetch_and_plot_image_resolutions_from_directory(directory_path):
    # Collect all image files
    image_files = [
        os.path.join(directory_path, filename)
        for filename in os.listdir(directory_path)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff'))
    ]

    # Process images concurrently
    image_dimensions = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_image, file) for file in image_files]
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            if result is not None:
                image_dimensions.append(result)

    return pd.DataFrame(image_dimensions, columns=['filename', 'width', 'height'])


df = fetch_and_plot_image_resolutions_from_directory(IMG_DIR)

100%|██████████| 14394/14394 [00:01<00:00, 10381.80it/s]


In [6]:
min_width = 512
min_height = 512

In [7]:
df = df[
    (df.width > min_width)
    & (df.height > min_height)
    # & (df.width < max_width)
    # & (df.height < max_height)
    ]
df

Unnamed: 0,filename,width,height
2,4329619655lPRD1BS.png,646,846
4,1915435520dUikaQn.png,550,592
9,08297449579aEmH4P.png,531,555
12,5945121180wscTtI.png,518,746
21,47124321206BYrgRb.png,601,833
...,...,...,...
14385,22146125291KIazr4.png,729,518
14387,0114533808G2w9j4p.png,787,645
14389,0899770210P0sUr8.png,1130,690
14391,5830887043vAcBTAd.jpg,1350,2025


In [8]:
df.shape

(2662, 3)

In [9]:
import json

entries = []
for file in (CAPTION_DIR / "_questions").glob("*.json"):
    with open(file, "r") as f:
        data = json.load(f)
        data['filename'] = file.stem
        # sort data keys to ensure consistent ordering
        data = {k: data[k] for k in sorted(data)}
        entries.append(data)

In [10]:
q_df = pd.DataFrame(entries)
q_df

Unnamed: 0,ActionsDescription,AgeGroup,BodyPosture,BustSize,ClothingCategory,FacialExpression,GeneralLocation,HairColor,HairStyle,ImageType,PersonOrientation,PhotoDescription,Physique,ShotContextFocus,SkinTone,WornAccessories,filename
0,The person in the image is standing in a bathr...,Middle-aged,Exposing,Small,Topless,No idea,Bathroom,Purple,Wavy,Candid,Front,A woman with blonde hair and a pink shirt is s...,No idea,No,Light,No,3029816710K5BJaax
1,The person in the image is lying on a bed with...,Teenager,Exposing,Small,No,No idea,Bedroom,Brown,No idea,Erotic,No,A woman is lying on a bed with her legs spread...,No idea,Butt,Light,No,374795718T8lB1kR
2,The person in the image is lying naked on a be...,Teenager,No idea,Small,No,No idea,Indoors,Red,Straight,Erotic,Front,"A woman with red hair is lying on a bed, weari...",No idea,Face,Light,Necklace,0529707159LKOebFg
3,The person in the image is posing for a pictur...,Middle-aged,Sexy pose,Large,No,No idea,Indoors,Blond,Wavy,Close-up,Overhead,A woman with blonde hair wearing a black outfi...,No idea,Chest,Light,No,4229303308DBFLF5F
4,The person in the image is posing in a very re...,Young adult,Sexy pose,Small,No,Neutral,Indoors,Blond,No idea,Portrait,Overhead,A woman is sitting in a chair and leaning over...,No idea,Butt,Light,No,3912861516kSCw9fb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2379,The person in the image is posing for a pictur...,Young adult,Sexy pose,Small,No,Smiling,Indoors,Brown,Straight,Portrait,Front,A woman with long brown hair is standing in a ...,No idea,Face,Light,No,3322444759BDISzD0
2380,The person in the image is taking a selfie whi...,Teenager,Sexy pose,Small,No,No idea,Indoors,Red,No idea,Candid,Front,A woman with short hair is standing in a pool ...,No idea,Face,Light,Earrings,00297823097yMOxAx
2381,The person in the image is taking a picture of...,Middle-aged,Exposing,Small,Tops,Neutral,Indoors,No idea,No idea,Close-up,Front,A person is taking a picture of their chest.,No idea,Chest,Light,No,4729808511zgd4xQL
2382,"The person in the image is standing naked, hol...",Middle-aged,Naked,Small,No,Neutral,Indoors,Brown,No idea,Portrait,Front,"A woman is standing in a room, holding a towel...",No idea,Chest,Light,Necklace,1422661128tTWPf4W


In [11]:
df.filename = df.filename.apply(lambda x: x.split(".")[0].lower())
df

Unnamed: 0,filename,width,height
2,4329619655lprd1bs,646,846
4,1915435520duikaqn,550,592
9,08297449579aemh4p,531,555
12,5945121180wsctti,518,746
21,47124321206byrgrb,601,833
...,...,...,...
14385,22146125291kiazr4,729,518
14387,0114533808g2w9j4p,787,645
14389,0899770210p0sur8,1130,690
14391,5830887043vacbtad,1350,2025


In [12]:
df = df.merge(q_df, on="filename")
df

Unnamed: 0,filename,width,height,ActionsDescription,AgeGroup,BodyPosture,BustSize,ClothingCategory,FacialExpression,GeneralLocation,HairColor,HairStyle,ImageType,PersonOrientation,PhotoDescription,Physique,ShotContextFocus,SkinTone,WornAccessories
0,4029786353dk9kis0,601,832,The person in the image is smiling and posing ...,Young adult,Sexy pose,Large,No,Smiling,Indoors,Blonde,Straight,Portrait,Front,A woman with blonde hair wearing a red lace to...,No idea,Face,Light,No
1,0914579451hkuakm4,523,565,The person in the image is sitting on a couch ...,No idea,Sexy pose,Small,No,Frowning,Indoors,No idea,No,Erotic,Front,A woman is sitting on a couch with her legs cr...,No idea,Chest,Light,No
2,30144854534iyburx,610,518,The person in the image is laying down and smi...,Young adult,Lying down,Small,Topless,Smiling,Indoors,Brown,Straight,Portrait,Front,A woman with a leopard print bikini top and fi...,No idea,Face,Light,Earrings
3,3614528749ny1fved,699,900,The person in the image is taking a selfie or ...,Teenager,No idea,Small,Tops,Neutral,Indoors,Brown,Straight,Close-up,Front,"A woman with long hair is lying on a bed, look...",No idea,Face,Light,No
4,2429736124977zll7,696,931,The person in the image is laying down and sho...,Teenager,Exposing,Small,No,No idea,Indoors,No idea,No information,Close-up,Front,A woman is lying on a bed with her body covere...,No idea,Chest,Light,No
5,072448324po6cp8y,626,720,"The person in the image is taking a selfie, ca...",Teenager,No,Small,No,Neutral,Indoors,No idea,No,Close-up,Front,A person wearing a black and white lace dress ...,No idea,Butt,Light,No
6,5112640207obwtqm9,795,744,The person in the image is bending over and lo...,Teenager,No,Small,Toeless,Neutral,Indoors,No idea,No,Close-up,No,The image shows a close-up view of a person's ...,No,Butt,Light,No
7,5623322328i7xvd0z,604,831,"The person in the image is taking a selfie, li...",Middle-aged,Sexy pose,Small,Topless,No idea,Bathroom,No idea,No idea,Candid,Front,A woman wearing a black hat is taking a selfie...,No idea,Face,Light,Hat
8,162962381370bxi7i,567,906,The person in the image is posing for a pictur...,Young adult,Sexy pose,Small,No,Neutral,Bathroom,No idea,No idea,Portrait,Front,A woman wearing a black dress is standing in f...,No idea,Legs,Light,No
9,4948463264nxv28w,525,682,The person in the image is posing for a pictur...,No idea,Sexy pose,Small,No,Neutral,Indoors,No idea,No,Close-up,Front,A person is standing in a room with their body...,No idea,Chest,Light,No


In [13]:
df["status"] = ['to_review'] * df.shape[0]
df

Unnamed: 0,filename,width,height,ActionsDescription,AgeGroup,BodyPosture,BustSize,ClothingCategory,FacialExpression,GeneralLocation,HairColor,HairStyle,ImageType,PersonOrientation,PhotoDescription,Physique,ShotContextFocus,SkinTone,WornAccessories,status
0,4029786353dk9kis0,601,832,The person in the image is smiling and posing ...,Young adult,Sexy pose,Large,No,Smiling,Indoors,Blonde,Straight,Portrait,Front,A woman with blonde hair wearing a red lace to...,No idea,Face,Light,No,to_review
1,0914579451hkuakm4,523,565,The person in the image is sitting on a couch ...,No idea,Sexy pose,Small,No,Frowning,Indoors,No idea,No,Erotic,Front,A woman is sitting on a couch with her legs cr...,No idea,Chest,Light,No,to_review
2,30144854534iyburx,610,518,The person in the image is laying down and smi...,Young adult,Lying down,Small,Topless,Smiling,Indoors,Brown,Straight,Portrait,Front,A woman with a leopard print bikini top and fi...,No idea,Face,Light,Earrings,to_review
3,3614528749ny1fved,699,900,The person in the image is taking a selfie or ...,Teenager,No idea,Small,Tops,Neutral,Indoors,Brown,Straight,Close-up,Front,"A woman with long hair is lying on a bed, look...",No idea,Face,Light,No,to_review
4,2429736124977zll7,696,931,The person in the image is laying down and sho...,Teenager,Exposing,Small,No,No idea,Indoors,No idea,No information,Close-up,Front,A woman is lying on a bed with her body covere...,No idea,Chest,Light,No,to_review
5,072448324po6cp8y,626,720,"The person in the image is taking a selfie, ca...",Teenager,No,Small,No,Neutral,Indoors,No idea,No,Close-up,Front,A person wearing a black and white lace dress ...,No idea,Butt,Light,No,to_review
6,5112640207obwtqm9,795,744,The person in the image is bending over and lo...,Teenager,No,Small,Toeless,Neutral,Indoors,No idea,No,Close-up,No,The image shows a close-up view of a person's ...,No,Butt,Light,No,to_review
7,5623322328i7xvd0z,604,831,"The person in the image is taking a selfie, li...",Middle-aged,Sexy pose,Small,Topless,No idea,Bathroom,No idea,No idea,Candid,Front,A woman wearing a black hat is taking a selfie...,No idea,Face,Light,Hat,to_review
8,162962381370bxi7i,567,906,The person in the image is posing for a pictur...,Young adult,Sexy pose,Small,No,Neutral,Bathroom,No idea,No idea,Portrait,Front,A woman wearing a black dress is standing in f...,No idea,Legs,Light,No,to_review
9,4948463264nxv28w,525,682,The person in the image is posing for a pictur...,No idea,Sexy pose,Small,No,Neutral,Indoors,No idea,No,Close-up,Front,A person is standing in a room with their body...,No idea,Chest,Light,No,to_review


In [14]:
df.to_csv("image_data.csv", index=False)

In [21]:
import random

df.iloc[random.randint(0, df.shape[0])]

filename                                              2324159147x143256
width                                                               590
height                                                              590
ActionsDescription         The person in the image is sitting on a bed.
AgeGroup                                                    Middle-aged
BodyPosture                                                   Sexy pose
BustSize                                                          Large
ClothingCategory                                                     No
FacialExpression                                                No idea
GeneralLocation                                                 Indoors
HairColor                                                        Blonde
HairStyle                                                       No idea
ImageType                                                        Erotic
PersonOrientation                                              O

In [20]:
# get row with filename == 1129690109aNNYkDO
# df[df.filename == "1129690109aNNYkDO".lower()]

Unnamed: 0,filename,width,height,ActionsDescription,AgeGroup,BodyPosture,BustSize,ClothingCategory,FacialExpression,GeneralLocation,HairColor,HairStyle,ImageType,PersonOrientation,PhotoDescription,Physique,ShotContextFocus,SkinTone,WornAccessories,status
