# Download Smiling Images Notebook

In [1]:
# Install the third-party libraries for this notebook into the running kernel's environment.
# NOTE(review): requests, pandas, numpy and torch are imported by later cells but are not
# listed here — presumably they arrive as transitive dependencies; confirm.
# NOTE(review): versions are unpinned, so a rerun may resolve different releases.
%pip install transformers datasets torchvision fsspec pyarrow pillow ipywidgets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import CLIPProcessor, CLIPModel
import torch
import fsspec
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

# --- CLIP model ---------------------------------------------------------
# Fetch the `openai/clip-vit-base-patch32` weights and the matching processor,
# then place the model on the GPU when CUDA is available and switch it to
# evaluation mode.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
model.eval()

# --- Dataset ------------------------------------------------------------
# Read only the first row group of the parquet file, and only the two
# columns this notebook uses, through the Hugging Face fsspec filesystem.
fs = fsspec.filesystem("hf")
with fs.open("datasets/OpenFace-CQUPT/FaceCaption-15M/FaceCaption-v2.parquet") as f:
    pq_file = pq.ParquetFile(f)
    table = pq_file.read_row_group(0, columns=["caption", "url"])
    df = table.to_pandas()

# --- Cleaning -----------------------------------------------------------
# Step 1: keep rows where both the caption and the url are present.
both_present = df["caption"].notnull() & df["url"].notnull()
df = df[both_present]

# Step 2: keep rows whose caption is a non-empty list / ndarray.
has_caption_items = df["caption"].apply(
    lambda cap: isinstance(cap, (list, np.ndarray)) and len(cap) > 0
)
df = df[has_caption_items]

# Step 3: derive a normalised caption (first entry, stripped, lower-cased)
# and reduce `url` to the first comma-separated entry.
df = df.assign(
    caption_text=df["caption"].apply(lambda cap: str(cap[0]).strip().lower()),
    url=df["url"].apply(lambda raw: raw.split(",")[0].strip()),
)

# --- Smile subset -------------------------------------------------------
# Substring match on 'smil' in the normalised caption.
mentions_smile = df["caption_text"].str.contains("smil", na=False)
df_smile = df[mentions_smile]
print(f"Found {len(df_smile)} entries containing 'smil' in caption.")

2025-05-22 20:23:26.199190: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Found 343118 entries containing 'smil' in caption.


In [None]:
from ipywidgets import ToggleButton, Output
from IPython.display import display
import os
import json
import numpy as np
from PIL import Image
from io import BytesIO
import requests

# Constants and directories
DOWNLOAD_COUNT = 2000  # stop once this many images have been saved
images_dir = "smile_images"
metadata_dir = "smile_metadata"
os.makedirs(images_dir, exist_ok=True)
os.makedirs(metadata_dir, exist_ok=True)

# Everything printed (or raised) inside `with download_output:` is captured by
# this widget, so it has to be rendered for the summary or a traceback to be
# visible at all.
download_output = Output()
display(download_output)

DOWNLOAD_DATA = True
if DOWNLOAD_DATA:
    download_output.clear_output()
    with download_output:
        saved = 0
        # Tally of rows skipped because the fetch or decode failed, keyed by
        # exception class name, so failures are reported instead of vanishing.
        skipped = {}
        # `idx` is the 1-based position in `df_smile`, not the number saved so
        # far, so file names stay tied to the row even when rows are skipped.
        for idx, row in enumerate(df_smile.itertuples(index=False), start=1):
            if saved >= DOWNLOAD_COUNT:
                break
            try:
                resp = requests.get(row.url, timeout=5)
                resp.raise_for_status()
                img = Image.open(BytesIO(resp.content)).convert("RGB")
            except Exception as exc:
                # Deliberately best-effort: a failed fetch or decode must not
                # abort the run, but it is counted for the final report.
                reason = type(exc).__name__
                skipped[reason] = skipped.get(reason, 0) + 1
                continue
            # Save image
            img_filename = f"smil_{idx:05d}.jpg"
            img.save(os.path.join(images_dir, img_filename))
            # Prepare metadata: make the raw caption JSON-serialisable
            # (list stays as is, ndarray -> list, anything else wrapped in a list).
            raw_caption = row.caption if isinstance(row.caption, list) else (
                row.caption.tolist() if isinstance(row.caption, np.ndarray) else [row.caption]
            )
            metadata = {
                "url": row.url,
                "caption_text": row.caption_text,
                "raw_caption": raw_caption
            }
            # Save metadata next to the image under the same stem
            meta_filename = f"smil_{idx:05d}.json"
            with open(os.path.join(metadata_dir, meta_filename), "w", encoding="utf-8") as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)
            saved += 1
        print(f"Saved {saved} images and metadata to '{images_dir}' and '{metadata_dir}'")
        if skipped:
            breakdown = ", ".join(
                f"{reason}: {count}" for reason, count in sorted(skipped.items())
            )
            print(
                f"Skipped {sum(skipped.values())} rows that could not be "
                f"downloaded or decoded ({breakdown})"
            )