In [49]:
import os

import pandas as pd

dataset = pd.read_csv("datasets/deep-fashion/In-shop Clothes Retrieval Benchmark/Eval/list_eval_partition.txt",
                      delim_whitespace=True)
dataset = dataset[dataset["evaluation_status"] == "query"]
dataset["is_included"] = True
dataset

Unnamed: 0,image_name,item_id,evaluation_status,is_included
25883,img/WOMEN/Blouses_Shirts/id_00000001/02_2_side...,id_00000001,query,True
25885,img/WOMEN/Blouses_Shirts/id_00000001/02_4_full...,id_00000001,query,True
25887,img/WOMEN/Tees_Tanks/id_00000007/01_2_side.jpg,id_00000007,query,True
25889,img/WOMEN/Tees_Tanks/id_00000007/01_4_full.jpg,id_00000007,query,True
25890,img/WOMEN/Tees_Tanks/id_00000007/01_6_flat.jpg,id_00000007,query,True
...,...,...,...,...
52703,img/WOMEN/Jackets_Coats/id_00007982/11_7_addit...,id_00007982,query,True
52704,img/WOMEN/Jackets_Coats/id_00007982/12_1_front...,id_00007982,query,True
52708,img/WOMEN/Jackets_Coats/id_00007982/13_1_front...,id_00007982,query,True
52710,img/WOMEN/Jackets_Coats/id_00007982/13_3_back.jpg,id_00007982,query,True


Remove missing image from list of images

In [50]:
dataset = dataset[dataset["item_id"] != "id_00007982"]

## Remove all images from test dataset that do not include a face

In [66]:
from deepface import DeepFace
from tqdm import tqdm
dataset['has_face'] = False
for index, row in tqdm(dataset[dataset["is_included"] == True].iterrows()):
    hires_path = row["image_name"].replace("img", "img_highres")    
    detected_faces = DeepFace.extract_faces("datasets/deep-fashion/In-shop Clothes Retrieval Benchmark/Img/" + hires_path, enforce_detection=False, align=False)
    face_detected = not all(face["confidence"] < 5 for face in detected_faces)
    dataset["has_face"][index] = face_detected
    if not face_detected:
        dataset["is_included"][index] = False
dataset

568it [00:18, 30.36it/s]


Unnamed: 0,image_name,item_id,evaluation_status,is_included,has_face,has_mask,is_man
25883,img/WOMEN/Blouses_Shirts/id_00000001/02_2_side...,id_00000001,query,False,False,False,False
25885,img/WOMEN/Blouses_Shirts/id_00000001/02_4_full...,id_00000001,query,False,False,False,False
25887,img/WOMEN/Tees_Tanks/id_00000007/01_2_side.jpg,id_00000007,query,False,False,False,False
25889,img/WOMEN/Tees_Tanks/id_00000007/01_4_full.jpg,id_00000007,query,False,False,False,False
25890,img/WOMEN/Tees_Tanks/id_00000007/01_6_flat.jpg,id_00000007,query,False,False,False,False
...,...,...,...,...,...,...,...
52694,img/WOMEN/Dresses/id_00007980/05_7_additional.jpg,id_00007980,query,False,False,False,False
52696,img/WOMEN/Dresses/id_00007980/06_2_side.jpg,id_00007980,query,False,False,False,False
52697,img/WOMEN/Dresses/id_00007980/06_3_back.jpg,id_00007980,query,False,False,False,False
52698,img/WOMEN/Dresses/id_00007980/06_4_full.jpg,id_00007980,query,False,False,False,False


## Remove all images from test dataset that do not provide a ground truth mask

In [67]:
dataset["has_mask"] = False
for index, row in tqdm(dataset[dataset["is_included"] == True].iterrows()):
    hires_path = row["image_name"].replace("img", "img_highres")
    hires_path_mask = row["image_name"].replace("img", "img_highres").replace(".jpg", "_mask.png")
    mask_exists = os.path.exists("datasets/deep-fashion/In-shop Clothes Retrieval Benchmark/Img/" + hires_path_mask)
    dataset["has_mask"][index] = mask_exists
    if not mask_exists:
        dataset["is_included"][index] = False
dataset

568it [00:00, 7206.88it/s]


Unnamed: 0,image_name,item_id,evaluation_status,is_included,has_face,has_mask,is_man
25883,img/WOMEN/Blouses_Shirts/id_00000001/02_2_side...,id_00000001,query,False,False,False,False
25885,img/WOMEN/Blouses_Shirts/id_00000001/02_4_full...,id_00000001,query,False,False,False,False
25887,img/WOMEN/Tees_Tanks/id_00000007/01_2_side.jpg,id_00000007,query,False,False,False,False
25889,img/WOMEN/Tees_Tanks/id_00000007/01_4_full.jpg,id_00000007,query,False,False,False,False
25890,img/WOMEN/Tees_Tanks/id_00000007/01_6_flat.jpg,id_00000007,query,False,False,False,False
...,...,...,...,...,...,...,...
52694,img/WOMEN/Dresses/id_00007980/05_7_additional.jpg,id_00007980,query,False,False,False,False
52696,img/WOMEN/Dresses/id_00007980/06_2_side.jpg,id_00007980,query,False,False,False,False
52697,img/WOMEN/Dresses/id_00007980/06_3_back.jpg,id_00007980,query,False,False,False,False
52698,img/WOMEN/Dresses/id_00007980/06_4_full.jpg,id_00007980,query,False,False,False,False


## Equalize the number of images for MEN and WOMAN images

In [68]:
import numpy as np

dataset["is_man"] = False
for index, row in tqdm(dataset.iterrows()):
    is_woman = "WOMEN" in row["image_name"]
    dataset["is_man"][index] = not is_woman

14212it [00:01, 11653.28it/s]


assuming there are less MEN images

In [69]:
len_MEN = len(dataset[dataset["is_man"] == True][dataset["is_included"] == True])
len_WOMEN = len(dataset[dataset["is_man"] == False][dataset["is_included"] == True])
print("MEN", len_MEN)
print("WOMEN", len_WOMEN)

MEN 263
WOMEN 305


In [70]:
amount_WOMAN_to_ignore = len_WOMEN - len_MEN
for index, row in tqdm(dataset[dataset["is_included"] == True].iterrows()):
    if amount_WOMAN_to_ignore > 0 and not row["is_man"]:
        dataset["is_included"][index] = False
        amount_WOMAN_to_ignore -= 1
len_MEN = len(dataset[dataset["is_man"] == True][dataset["is_included"] == True])
len_WOMEN = len(dataset[dataset["is_man"] == False][dataset["is_included"] == True])
print("MEN", len_MEN)
print("WOMEN", len_WOMEN)

568it [00:00, 28969.15it/s]

MEN 263
WOMEN 263





In [71]:
len_MEN = len(dataset[dataset["is_man"] == True][dataset["is_included"] == True])
len_WOMEN = len(dataset[dataset["is_man"] == False][dataset["is_included"] == True])
print("MEN", len_MEN)
print("WOMEN", len_WOMEN)

MEN 263
WOMEN 263


In [1]:
test_dataset = pd.DataFrame()
test_dataset["image_name"] = dataset[dataset["is_included"] == True]["image_name"]
test_dataset["image_name"] = test_dataset["image_name"].transform(lambda name: name.replace("img", "img_highres"))
test_dataset = test_dataset.reset_index(drop=True)
test_dataset

NameError: name 'pd' is not defined

## Save new test dataset

In [73]:
test_dataset.to_csv("datasets/query_list.csv")
test_dataset

Unnamed: 0,image_name
0,img_highres/MEN/Jackets_Vests/id_00000084/08_1...
1,img_highres/MEN/Jackets_Vests/id_00000094/01_1...
2,img_highres/MEN/Jackets_Vests/id_00000094/04_1...
3,img_highres/MEN/Sweaters/id_00000145/01_1_fron...
4,img_highres/MEN/Shirts_Polos/id_00000193/04_1_...
...,...
521,img_highres/WOMEN/Tees_Tanks/id_00007962/07_1_...
522,img_highres/WOMEN/Tees_Tanks/id_00007962/07_2_...
523,img_highres/MEN/Tees_Tanks/id_00007967/02_2_si...
524,img_highres/WOMEN/Tees_Tanks/id_00007969/03_1_...
