In [1]:
import json
import os
from tqdm import tqdm

import torchvision
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer shared by the cells below; get_stats() uses it to measure
# description lengths in tokens rather than characters.
tokenizer = transformers.LlamaTokenizerFast.from_pretrained(
    'mistralai/Mixtral-8x7B-v0.1'
)

In [3]:
def load_datafile(root, data_name, images_subdir):
    """Collect image paths and descriptions for every app that has images.

    The JSON file at ``root/data_name`` is expected to hold a list of app
    records, each providing an ``"appId"`` and a ``"description"`` key.
    Files in ``root/images_subdir`` are matched to apps by file name: the
    part of the name before the last underscore is taken as the app id.

    Args:
        root: Directory containing both the JSON file and the image folder.
        data_name: File name of the JSON app list, relative to ``root``.
        images_subdir: Name of the image folder, relative to ``root``.

    Returns:
        A tuple ``(image_paths, descriptions)`` of parallel lists.
        ``image_paths[i]`` is the list of image paths of the i-th app
        (ordered by file name) and ``descriptions[i]`` is its description.
        Apps without any matching image are left out. Apps appear in the
        order in which they occur in the JSON file.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(os.path.join(root, data_name), "r", encoding="utf-8") as data_file:
        data_json = json.load(data_file)

    dataset_raw = {
        app["appId"]: {"image_paths": [], "description": app["description"]}
        for app in data_json
    }

    # Add image paths. os.listdir() returns entries in arbitrary order, so the
    # listing is sorted to make the per-app path order reproducible.
    images_dir = os.path.join(root, images_subdir)
    for image in sorted(os.listdir(images_dir)):
        # Everything before the last "_" is the app id; a name without any
        # underscore yields "" (same as joining an empty split prefix).
        app_id = image.rpartition("_")[0]
        if app_id in dataset_raw:
            dataset_raw[app_id]["image_paths"].append(os.path.join(images_dir, image))

    # Extract image paths and descriptions into separate, parallel lists,
    # dropping apps for which no image was found.
    image_paths = []
    descriptions = []
    for app in dataset_raw.values():
        if app["image_paths"]:
            image_paths.append(app["image_paths"])
            descriptions.append(app["description"])

    return image_paths, descriptions


def print_description_stats(description_lengths):
    """Print summary statistics for a list of description lengths.

    Prints the maximum, minimum and mean, followed by the 25th, 50th, 75th
    and 90th percentiles. A percentile ``p`` is the element found at index
    ``int(n * p)`` of the sorted lengths (no interpolation).

    Args:
        description_lengths: Non-empty list of numeric lengths.
    """
    ordered = sorted(description_lengths)
    count = len(description_lengths)

    print(f"Max description length: {max(description_lengths):,}")
    print(f"Min description length: {min(description_lengths)}")
    print(f"Mean description length: {sum(description_lengths) / count:.2f}")

    # (label, fraction) pairs; the fraction picks the index in the sorted list.
    for label, fraction in ((25, 0.25), (50, 0.5), (75, 0.75), (90, 0.9)):
        value = ordered[int(count * fraction)]
        print(f"{label}th percentile description length: {value}")


def print_image_stats(image_paths):
    """Print how many images there are overall and per app.

    Prints the total image count, the mean number of images per app and the
    25th/50th/75th/90th percentiles of the per-app image count (element at
    index ``int(n * p)`` of the sorted counts), then a line announcing the
    resolution statistics that are reported separately.

    Args:
        image_paths: List with one list of image paths per app.
    """
    per_app_counts = [len(paths) for paths in image_paths]
    total_images = sum(per_app_counts)
    app_count = len(image_paths)

    print(f"Number of images: {total_images:,}")
    print(f"Average number of images per app: {total_images / app_count:.02f}")

    ordered_counts = sorted(per_app_counts)
    for label, fraction in ((25, 0.25), (50, 0.5), (75, 0.75), (90, 0.9)):
        value = ordered_counts[int(app_count * fraction)]
        print(f"{label}th percentile number of images per app: {value}")

    print("(25, 50, 75)th percentiles and mean resolution are shown below")


def _resolution_summary(sorted_values, total):
    """Format the quartiles and mean of an ascending, non-empty list as text."""
    count = len(sorted_values)
    quartiles = (
        sorted_values[int(count * 0.25)],
        sorted_values[int(count * 0.5)],
        sorted_values[int(count * 0.75)],
    )
    return f"{quartiles} {total / count:.00f}"


def print_resolution_stats(image_paths):
    """Show running width/height statistics of all images in a progress bar.

    Every image is decoded to obtain its size. After each app the progress
    bar description is updated with the (25th, 50th, 75th) percentiles and
    the mean of the widths ("X") and heights ("Y") seen so far, so the final
    state of the bar holds the statistics of the whole dataset. Nothing is
    returned.

    Args:
        image_paths: List with one list of image file paths per app.
    """
    x_resolutions = []
    y_resolutions = []
    # Running totals avoid re-summing the full lists after every app.
    x_total = 0
    y_total = 0

    bar = tqdm(image_paths, desc="Processing images")
    for paths in bar:
        for path in paths:
            image = torchvision.io.read_image(
                path,
                torchvision.io.ImageReadMode.RGB
            )
            # read_image yields a (channels, height, width) tensor, so
            # dimension 2 is the horizontal and dimension 1 the vertical size.
            width, height = image.shape[2], image.shape[1]
            x_resolutions.append(width)
            y_resolutions.append(height)
            x_total += width
            y_total += height

        if not x_resolutions:
            # No image decoded yet (only apps with empty path lists so far):
            # there is nothing to summarise, keep the current description.
            continue

        # In-place sort: the lists are already sorted except for the entries
        # appended for this app.
        x_resolutions.sort()
        y_resolutions.sort()

        bar.set_description(
            f"X: {_resolution_summary(x_resolutions, x_total)}"
            f" | Y: {_resolution_summary(y_resolutions, y_total)}"
        )


def get_stats(data_name, root=".", images_subdir="images"):
    """Print description, image-count and resolution statistics for a dataset.

    Loads the app list and its images, then prints the number of apps that
    have at least one image, the token-length statistics of their
    descriptions, the image-count statistics, and finally runs the
    resolution scan (which decodes every image and reports through a
    progress bar).

    Args:
        data_name: File name of the JSON app list, relative to ``root``.
        root: Directory holding the JSON file and the image folder.
            Defaults to the current working directory.
        images_subdir: Name of the image folder inside ``root``.
            Defaults to ``"images"``.
    """
    print(f"Stats for {data_name}")

    image_paths, descriptions = load_datafile(root, data_name, images_subdir)

    print(f"Number of apps: {len(image_paths):,}")
    print('-' * 80)

    # Lengths are measured in tokens of the module-level tokenizer.
    # NOTE(review): encode() is called with its defaults, so any special
    # tokens the tokenizer adds are presumably included in the count - confirm.
    description_lengths = [len(tokenizer.encode(desc)) for desc in descriptions]

    print_description_stats(description_lengths)
    print('-' * 80)
    print_image_stats(image_paths)
    print_resolution_stats(image_paths)

In [4]:
# Description-length, image-count and resolution statistics for
# apps_filtered.json. The resolution scan decodes every image; the recorded
# run below took about 11 minutes.
get_stats("apps_filtered.json")

Stats for apps_filtered.json
Number of apps: 20,874
--------------------------------------------------------------------------------
Max description length: 3,544
Min description length: 7
Mean description length: 453.42
25th percentile description length: 248
50th percentile description length: 409
75th percentile description length: 614
90th percentile description length: 839
--------------------------------------------------------------------------------
Number of images: 191,838
Average number of images per app: 9.19
25th percentile number of images per app: 6
50th percentile number of images per app: 8
75th percentile number of images per app: 12
90th percentile number of images per app: 16
(25, 50, 75)th percentiles and mean resolution are shown below


X: (288, 512, 512) 412 | Y: (288, 384, 512) 409: 100%|██████████| 20874/20874 [11:03<00:00, 31.48it/s]


In [5]:
# Same statistics for apps.json. The resolution scan decodes every image;
# the recorded run below took about 43 minutes.
get_stats("apps.json")

Stats for apps.json
Number of apps: 61,270
--------------------------------------------------------------------------------
Max description length: 4,386
Min description length: 2
Mean description length: 376.01
25th percentile description length: 156
50th percentile description length: 320
75th percentile description length: 540
90th percentile description length: 779
--------------------------------------------------------------------------------
Number of images: 505,763
Average number of images per app: 8.25
25th percentile number of images per app: 5
50th percentile number of images per app: 7
75th percentile number of images per app: 10
90th percentile number of images per app: 15
(25, 50, 75)th percentiles and mean resolution are shown below


X: (288, 384, 512) 403 | Y: (288, 512, 512) 416: 100%|██████████| 61270/61270 [43:27<00:00, 23.50it/s]  
