# Data reduction

## Imports

In [1]:
# %load_ext autoreload
# %autoreload 2

import os
import json
import pandas as pd
import numpy as np
import shutil
import subprocess


## Util functions

In [2]:
# Save data to json file
def save_json(filepath, data):
    """Save data to json file"""
    with open(filepath, 'w') as f:
        json.dump(data, f)


def get_dir_size(dir_path):
    """Get directory size"""
    return subprocess.check_output(['du','-sh', dir_path]).split()[0].decode('utf-8')


def reduce_data(json_file, img_dir, new_img_dir, new_json_file, n_choice=1000):
    """Reduce data to N images and save to new json file and folder"""

    print(f"Reducing data to {n_choice} images...")
    print(f"Original Image-dir size: {get_dir_size(img_dir)} ({img_dir})")
    
    # Load json file
    # data.keys() => dict_keys(['annotations', 'images', 'info', 'licenses', 'categories', 'attributes'])
    print(f"Original JSON file size: {os.path.getsize(json_file)/(1024**2):.2f}MB ({json_file})")
    with open(json_file) as json_data:
        data = json.load(json_data)

    # Get image/annotations data and remove unnecessary columns and parse to DataFrame
    df_annots = pd.DataFrame(data['annotations'])
    df_images = pd.DataFrame(data['images'])[['id', 'width', 'height', 'file_name']]

    # Get unique image and category IDs
    unique_imgs = np.unique(df_images.id)
    # unique_cats = np.unique(df_annots.category_id)

    # Randomly select N images
    img_choice = np.random.choice(unique_imgs, n_choice, replace=False)

    # Filter data
    df_annots_new = df_annots[df_annots.image_id.isin(img_choice)]
    df_images_new = df_images[df_images.id.isin(img_choice)]

    # Create new data dictionary
    new_data = {}
    new_data["annotations"] = df_annots_new.to_dict(orient='records')
    new_data["images"] = df_images_new.to_dict(orient='records')
    new_data["categories"] = data["categories"]

    # Copy images to new folder
    os.makedirs(new_img_dir, exist_ok=True)
    print("Copying images to new folder...")
    for i, row in df_images_new.iterrows():
        
        # Get image name and old/new paths
        img_name = row.file_name
        img_path = os.path.join(img_dir, img_name)
        new_img_path = os.path.join(new_img_dir, img_name)
        
        # Copy image to new folder
        shutil.copyfile(img_path, new_img_path)

    print(f"New Image-dir size: {get_dir_size(new_img_dir)} ({new_img_dir})")
    
    # Save new data to json file
    save_json(new_json_file, new_data)
    print(f"New JSON file size: {os.path.getsize(new_json_file)/(1024**2):.2f}MB ({new_json_file})")


## Data directories

In [8]:
DATA_DIR = "_data/"

# Data Directories
fashion_dir = os.path.join(DATA_DIR, "fashionpedia/")
annotations_dir = os.path.join(fashion_dir, "ann_dir/")
images_dir = os.path.join(fashion_dir, "img_dir/")
img_train_dir = os.path.join(images_dir, "train/")
img_test_dir = os.path.join(images_dir, "val/")

# JSON Files
train_file = os.path.join(annotations_dir, "instances_attributes_train2020.json")
test_file = os.path.join(annotations_dir, "instances_attributes_val2020.json")

# Show directory info
print(f"Train images: {len(os.listdir(img_train_dir))}")
print(f"Test Images: {len(os.listdir(img_test_dir))}")
print(f"Train images size: {get_dir_size(img_train_dir)}")
print(f"Test images size: {get_dir_size(img_test_dir)}")
print(f"Train JSON size: {os.path.getsize(train_file)/(1024**2):.2f}M")
print(f"Test JSON Size: {os.path.getsize(test_file)/(1024**2):.2f}M")

Train images: 45623
Test Images: 3200
Train images size: 3.3G
Test images size: 236M
Train JSON size: 517.08M
Test JSON Size: 13.86M


## Create new smaller dataset

In [9]:
n_choice = 1000
json_file = train_file
img_dir = img_train_dir
new_img_dir = 'small_data/'
new_json_file = new_img_dir + 'train_small.json'

reduce_data(json_file, img_dir, new_img_dir, new_json_file, n_choice)


Reducing data to 1000 images...
Original Image-dir size: 3.3G (_data/fashionpedia/img_dir/train/)
Original JSON file size: 517.08MB (_data/fashionpedia/ann_dir/instances_attributes_train2020.json)
Copying images to new folder...
New Image-dir size: 74M (small_data/)
New JSON file size: 4.33MB (small_data/train_small.json)


In [10]:
!open $new_img_dir

"open" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
