In [2]:
############################################################################################
# IMPORTS
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
#import tensorflow.keras as keras

from tensorflow.keras import layers
from tensorflow.keras import regularizers

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, MaxPooling2D, Activation, Flatten

from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical




import pathlib
import shutil
import tempfile
import concurrent



import json
import glob





print(tf.__version__)
############################################################################################

2.18.0


In [3]:
############################################################################################
# CONFIGURATION AND CONSTANTS
# Folder layout used by this notebook, e.g.:
#   ./data/iFood_2019/train_set/108-taco/val_002062.jpg

# Dataset root; the other paths in this notebook are built by string concatenation on it.
DATA_FS251 = './data/iFood_2019'

# File names of the annotation tables.
CLASSES_FILE_NAME = 'class_list_extended.csv'
TRAIN_INFO, VAL_INFO, TEST_INFO = 'train_info.csv', 'val_info.csv', 'test_info.csv'

# Output folder for the re-formatted annotation tables (note the trailing slash).
ANNOTATION_OUTPUT_PATH = f'{DATA_FS251}/formated_annot/'

# NOTE(review): "-1" presumably hides all CUDA devices so TensorFlow runs on CPU - confirm this is intended.
os.environ.update(CUDA_VISIBLE_DEVICES="-1")

############################################################################################

In [4]:
############################################################################################
# FUNCTIONS DEFINITIONS

def image_sorting_SUPERclasses(info_data=pd.DataFrame(), classes_data=pd.DataFrame(), path_str=''):
    """Move the images of one data split into per-class sub-folders.

    For every row of ``info_data`` the file
    ``DATA_FS251 + path_str + '/' + file_name`` is moved to
    ``DATA_FS251 + path_str + '/<NNN>-<class name>/' + file_name``, where
    ``NNN`` is ``class_num`` zero-padded to three characters and the class
    name is looked up *positionally* (``.iloc``) in ``classes_data['name']``.

    Rows are handled independently, so one missing or already-moved file no
    longer aborts the whole run: missing sources are skipped, per-file
    failures are counted, and a summary is printed at the end. Re-running the
    function after a complete (or partial) sort is therefore safe.

    Parameters
    ----------
    info_data : pandas.DataFrame
        Must provide the columns ``file_name`` and ``class_num``.
        The default (empty frame) is never mutated; it makes the call a no-op.
    classes_data : pandas.DataFrame
        Must provide the column ``name``; row position is used as class number.
    path_str : str
        Split sub-folder relative to ``DATA_FS251``, e.g. ``'/train_set'``.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a non-empty ``info_data`` lacks ``file_name``/``class_num`` or
        ``classes_data`` lacks ``name`` (a malformed input, not a re-run).
    """
    split_root = DATA_FS251 + path_str

    # Nothing to do for an empty table (this is also the default-argument case).
    if info_data.empty:
        return

    class_names = classes_data['name']
    moved = 0
    missing = 0
    failed = 0
    first_error = None

    for file_name, file_class in zip(info_data['file_name'], info_data['class_num']):
        file_name = str(file_name)
        source_path = split_root + '/' + file_name

        # Already sorted on a previous run, or never present: skip this row only.
        if not os.path.isfile(source_path):
            missing += 1
            continue

        try:
            # Folder name is "<zero-padded class number>-<class name>".
            class_dir = split_root + '/' + str(file_class).zfill(3) + '-' + class_names.iloc[file_class]
            os.makedirs(class_dir, exist_ok=True)
            shutil.move(source_path, class_dir + '/' + file_name)
        except (IndexError, TypeError, OSError, shutil.Error) as err:
            # Bad class number / unusable class name / filesystem problem:
            # remember the first cause and keep going with the other files.
            failed += 1
            if first_error is None:
                first_error = f'{source_path}: {err!r}'
            continue

        moved += 1

    if moved == 0 and failed == 0 and missing > 0:
        # Same message as before for the common "notebook was re-run" case.
        print(f'Image files in the {path_str} are either already sorted or missing!')
    elif missing or failed:
        print(
            f'{path_str}: moved {moved} file(s), skipped {missing} missing/already sorted, '
            f'failed on {failed}.'
        )
        if first_error is not None:
            print(f'First failure - {first_error}')
    return


In [5]:
############################################################################################
# DATA PREPARATION

# Single-character substitutions shared by the class-name and ingredient clean-up:
# space/? -> '_', quotes/backtick -> '-', '!' removed.
_LABEL_CHAR_MAP = str.maketrans({' ': '_', '!': '', '?': '_', '"': '-', "'": '-', '`': '-'})
# Column headers only need '-' and ' ' turned into '_'.
_HEADER_CHAR_MAP = str.maketrans('- ', '__')


def _clean_label(raw_value):
    """Return ``raw_value`` as a stripped, lower-cased string with the label substitutions applied."""
    return str(raw_value).strip().lower().translate(_LABEL_CHAR_MAP)


df_classes = pd.read_csv(DATA_FS251 + '/annot/' + CLASSES_FILE_NAME, index_col='class_id')

# Normalise the header row so columns are valid snake_case identifiers.
df_classes.columns = [
    header.strip().translate(_HEADER_CHAR_MAP).lower()
    for header in df_classes.columns.tolist()
]

# Class names become folder-name friendly strings.
df_classes['name'] = [_clean_label(raw_name) for raw_name in df_classes['name']]

# 'visible_ing' gets the same clean-up and is then split on ';' into a list per class.
df_classes['visible_ing'] = [
    _clean_label(raw_ingredients).split(';')
    for raw_ingredients in df_classes['visible_ing']
]

# Persist the cleaned table next to the other formatted annotations.
os.makedirs(os.path.dirname(ANNOTATION_OUTPUT_PATH), exist_ok=True)
df_classes.to_csv(ANNOTATION_OUTPUT_PATH + '/classes_formated.csv')

In [6]:
############################################################################################
# TRAIN/TEST/VALIDATE info formatting

def _read_split_info(info_file_name):
    """Read one annotation CSV from the '/annot/' folder and strip its file names.

    The column names ``file_name`` and ``class_num`` are supplied explicitly
    to ``pd.read_csv``; every ``file_name`` value is converted to ``str`` and
    stripped of surrounding whitespace.
    """
    split_info = pd.read_csv(DATA_FS251 + '/annot/' + info_file_name, names=['file_name', 'class_num'])
    split_info['file_name'] = [str(raw_name).strip() for raw_name in split_info['file_name']]
    return split_info


df_train_info = _read_split_info(TRAIN_INFO)
df_validate_info = _read_split_info(VAL_INFO)
df_test_info = _read_split_info(TEST_INFO)

# Only the train and validation images are moved into class folders here;
# the test table is loaded but its images are left where they are.
for split_frame, split_dir in ((df_train_info, '/train_set'), (df_validate_info, '/val_set')):
    image_sorting_SUPERclasses(info_data=split_frame, classes_data=df_classes, path_str=split_dir)




Code for sorting files into category folders (the basis for **image_sorting_SUPERclasses** and **image_sorting_subCLASSES**; the latter is not defined here):

```python

# CREATE FOLDER TREE FOR THE TRAIN SET, MOVE IMAGES INTO IT (two subfolders - '/train_set/SuPERCLASSES/' AND '/train_set/Subclasses/')
for indx in df_train_info.index.tolist():
# number of the class:
        file_class = df_train_info.class_num[indx]
# new folder is a class number (un-comment this block and comment next one):
#        new_folder_path = DATA_FS251+'/train_set/' + str(file_class) + '/'

# new folder is a SUPERclass name (string):
        new_folder_path = DATA_FS251+'/train_set/SuPERCLASSES/' + str(file_class).zfill(3) + '-' + df_classes.name.iloc[file_class] + '/'
        os.makedirs( os.path.dirname(new_folder_path), exist_ok=True)
        shutil.copyfile(
                DATA_FS251 + '/train_set/' + str(df_train_info.file_name[indx]),
                new_folder_path+ str(df_train_info.file_name[indx]) 
                )
        del new_folder_path

# new folder is a SUBclass name:
        for subcls in df_classes.visible_ing.iloc[file_class]:
                new_folder_path = DATA_FS251+'/train_set/Subclasses/' + subcls + '/'
                os.makedirs( os.path.dirname(new_folder_path), exist_ok=True)
                shutil.copyfile(DATA_FS251 + '/train_set/' + str(df_train_info.file_name[indx]), new_folder_path+ str(df_train_info.file_name[indx]) )



#       Deleting images from data root (un-comment the line below)
#        pathlib.Path( DATA_FS251 + '/train_set/' + str(df_train_info.file_name[indx]) ).unlink(missing_ok=True)

        
# make folder, move images (for folders = nums or classes):        
#        os.makedirs( os.path.dirname(new_folder_path), exist_ok=True)
#        shutil.move(
#                DATA_FS251 + '/train_set/' + str(df_train_info.file_name[indx]),
#                new_folder_path+ str(df_train_info.file_name[indx]) 
#                )

```

The code below is a folder tree creator (copying) **with** ```glob```. It appears to be **much slower** (4+ min. instead of 20 sec. on 120,000 images!), but I keep it here just in case.

```python

for image in glob.glob(DATA_FS251+'/train_set/*.jpg'):
# number of the class:
    file_class = df_train_info.class_num[ df_train_info.file_name ==  str(image).split('/')[len(str(image).split('/'))-1]  ]
    
# new folder is a class number (un-comment this block and comment next one):
#    new_folder_path = DATA_FS251+'/train_set/' + str( file_class ) + '/'
    
# new folder is a class name (string):
    new_folder_path = DATA_FS251+'/train_set/' + df_classes.name[int(file_class)] + '/'
    os.makedirs(os.path.dirname(new_folder_path), exist_ok=True)
    
    shutil.move(image, new_folder_path + str(image).split('/')[ len(str(image).split('/'))-1  ] )

```