# Resizing images

In order to make the dataset more manageable we size down the retrieval images to having width 300 and preserving the original aspect ratio, the customer images will remain the original size since cropping will be applied at a later stage before feeding them to a model.

In [1]:
import os
import shutil
import pandas as pd
from PIL import Image
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

Using TensorFlow backend.


## Creating customer and retrieval dataframes

Files are separated into 3 classes: retrieval, train and test. Each of these classes have a json for each of the 11 clothing categories.  The function below merges all categories json files under the 3 classes, then train and test are also merged so we can do a custom data split.

In [21]:
path_labels = "./street2shop/meta/json"
def format_labels(path_labels, shop_images=False):
    
    json_files = os.listdir(path_labels)
    json_files = [os.path.join(path_labels, file) for file in json_files] #reading in all json files
    if shop_images==False:
        all_files = [file for file in json_files if "retrieval" in file]
    else:
        all_files = [file for file in json_files if ("train" in file) | ("test" in file)]
    
    files_df = pd.DataFrame()
    for file in all_files:
        files_df = files_df.append(pd.read_json(file))
    
    category_files = [file.split("_")[-1].split(".json")[0] for file in all_files]
    print("Categories: ", category_files)

    category_nrows = [pd.read_json(file).shape[0] for file in all_files]
    print("Categories number of rows: ", category_nrows)
    
    files_df["category"] = ""
    for n, category in enumerate(category_files):
        if n == 0:
            files_df["category"].iloc[0:category_nrows[0]] = category
        index_0 = sum(category_nrows[:n])
        index_1 = sum(category_nrows[:n+1])
        files_df["category"].iloc[index_0:index_1] = category
    files_df = files_df.reset_index(drop=True)
    
    files_df["id"] = files_df["product"].astype(str) + "_" + files_df["category"] #creating key for pair matching
    
    if shop_images == True:
        files_df["bbox"] = files_df["bbox"].apply(lambda x: {k:v for k, v in sorted(x.items())}) #fixes missalignment in label order
        files_df["bbox"] = files_df["bbox"].apply(lambda x: "".join(map(lambda x: str(x) + ",", list(x.values()))))

        bboxes = files_df["bbox"].str.split(",",expand=True).drop(columns=[4])
        bboxes.columns=(["height", "left", "top", "width"])
        files_df = pd.concat([files_df,bboxes], axis=1).drop(columns=["bbox"])
        return files_df
    return files_df

In [22]:
customer_df = format_labels(path_labels, shop_images=True)

Categories:  ['bags', 'belts', 'dresses', 'eyewear', 'footwear', 'hats', 'leggings', 'outerwear', 'pants', 'skirts', 'tops', 'bags', 'belts', 'dresses', 'eyewear', 'footwear', 'hats', 'leggings', 'outerwear', 'pants', 'skirts', 'tops']
Categories number of rows:  [174, 89, 3292, 138, 2178, 86, 517, 666, 130, 604, 763, 579, 235, 12875, 358, 6486, 400, 1641, 1945, 600, 3337, 2173]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [23]:
customer_list = customer_df["photo"].unique().tolist()

## Resizing
~ 5h runtime

In [41]:
def image_resize(dataset_path, output_path, customer_list, width=300):
    """Resizing fixed to width 300 and same aspect ratio due to having high variability in current img size.
    Resizing only applies to retrieval images."""
    
    all_paths = os.listdir(dataset_path)
    for n, img_ in enumerate(tqdm(all_paths)):
        try:
            img_path = os.path.join(dataset_path, img_)
            img_object = Image.open(img_path)
            img_object = img_object.convert("RGB") #exception with transparent channel, see https://stackoverflow.com/questions/48248405/cannot-write-mode-rgba-as-jpeg
            if int(img_.split(".")[0]) not in customer_list: #checking if image is not from customer and resizing
                width_percent = (width/float(img_object.size[0]))
                height_size = int((float(img_object.size[1])*float(width_percent)))
                img_object = img_object.resize((width,height_size), Image.ANTIALIAS)
            output_img = os.path.join(output_path, img_)
            img_object.save(output_img)

        except OSError: #corrupted images will break it
            pass

In [42]:
%%time
image_resize("./../photos", "./../photos_resized", customer_list)