In [1]:
import csv
import os
from PIL import Image
from tqdm.notebook import tqdm

# Seed NumPy's global random number generator so that the random draws made
# through np.random (e.g. np.random.choice) are reproducible between runs
import numpy as np
np.random.seed(42)


### [Fashion Product Images](https://www.kaggle.com/paramaggarwal/fashion-product-images-small) dataset available on Kaggle.

It contains over 44,000 images of clothes and accessories with 9 labels for each image. **styles.csv** contains the data labels. For the sake of simplicity, we will use only three labels:
- gender
- articleType
- baseColour

![](https://learnopencv.com/wp-content/uploads/2020/03/gt_image_labels-e1585129881369.png)

In [2]:
input_folder = './dataset'
annotation = os.path.join(input_folder, 'styles.csv')


# Count the lines up front so tqdm can show a meaningful progress bar.
# The file is opened in a `with` block so the handle is closed right after counting.
# https://stackoverflow.com/questions/2890549/number-of-lines-in-csv-dictreader
with open(annotation) as csv_file:
    line_num = sum(1 for _ in csv_file)

# open annotation file and collect samples
samples = []
with open(annotation) as csv_file:

    # Parse it as CSV; each row is a dict keyed by the header's column names
    reader = csv.DictReader(csv_file)

    # DictReader consumes the header line without yielding it, so the number
    # of rows to iterate over is one less than the number of lines in the file
    for row in tqdm(reader, total=max(line_num - 1, 0)):

        # Dataset in Kaggle
        # https://www.kaggle.com/paramaggarwal/fashion-product-images-small
        img_id = row['id']
        gender = row['gender']
        articleType = row['articleType']
        baseColour = row['baseColour']

        # Check if the file is in place and has the correct resolution with 3 channels
        img_name = os.path.join(input_folder, 'images', str(img_id) + '.jpg')
        if os.path.exists(img_name):
            # The context manager closes the image file immediately; without it
            # one file handle per image stays open until garbage collection
            with Image.open(img_name) as img:
                is_valid = img.size == (60, 80) and img.mode == "RGB"
            if is_valid:
                samples.append([img_name, gender, articleType, baseColour])

samples = np.asarray(samples)

  0%|          | 0/44447 [00:00<?, ?it/s]

### Split dataset

We are going to use 40,000 images: 32,000 of them go into the training set, and the remaining 8,000 are used for validation.

In [3]:
def save_csv(data, path, fieldnames=('image_path', 'gender', 'articleType', 'baseColour')):
    """Write rows to a CSV file, preceded by a header line.

    Args:
        data: iterable of rows; each row is a sequence of values given in the
            same order as ``fieldnames``. Values beyond ``len(fieldnames)``
            are dropped, and columns a short row does not cover are written
            as empty strings.
        path: destination file path; an existing file is overwritten.
        fieldnames: column names, written as the header line. The default is
            an immutable tuple so it cannot be altered between calls.
    """
    # Materialize once: the names are iterated for the header and again for
    # every row, so a one-shot iterable would otherwise be exhausted early
    fieldnames = list(fieldnames)
    # newline='' lets the csv module control line endings itself
    with open(path, 'w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(dict(zip(fieldnames, row)))

# Take 40000 samples in random order (a random permutation of the first n
# collected samples). n is capped at the number of available samples so the
# indexing below cannot run past the end of the array.
n = min(40000, len(samples))
inds = np.random.choice(n, n, replace=False)

# Apply the permutation once and reuse it for both subsets
shuffled = samples[inds]

# Split the data into train/val (80% / 20%) and save them as csv files
train_num = int(n * 0.8)
save_csv(shuffled[:train_num], os.path.join(input_folder, 'train.csv'))
# Slice to the end of the array: a stop index of -1 would silently drop the
# last sample and leave the validation set one row short
save_csv(shuffled[train_num:], os.path.join(input_folder, 'val.csv'))