# Data Preprocessing and EDA

In [1]:
import os
import shutil
import random
from tqdm import tqdm

## Splitting the Data

I will be splitting the original data as per the provided meta data list for train and val sets.

In [2]:
os.chdir('../')
meta_trainsplit_path = 'raw_data/food-101/meta/train.txt'
meta_valsplit_path = 'raw_data/food-101/meta/test.txt'
raw_images_dir = 'raw_data/food-101/images'
train_dir = 'raw_data/split_data/images/train'
val_dir = 'raw_data/split_data/images/val'

In [3]:
# Getting the training and validation split list
train_list = []
val_list = []

# Reading the train and val split file
with open(meta_trainsplit_path) as file:
    train_list.append(file.read().split('\n'))
    file.close()
train_list = train_list[0]
print(train_list[:5], len(train_list))

with open(meta_valsplit_path) as file:
    val_list.append(file.read().split('\n'))
    file.close()
val_list = val_list[0]
print(val_list[:5], len(val_list))

['apple_pie/1005649', 'apple_pie/1014775', 'apple_pie/1026328', 'apple_pie/1028787', 'apple_pie/1043283'] 75751
['apple_pie/1011328', 'apple_pie/101251', 'apple_pie/1034399', 'apple_pie/103801', 'apple_pie/1038694'] 25251


In [4]:
# Creating a train set directory
if not os.path.exists(train_dir):
    os.makedirs(train_dir)
    print(f'[INFO] Train set directory "{train_dir}" created.')
    
# Moving the files in train dir
for file_name in tqdm(train_list):
    file_path = os.path.join(raw_images_dir, file_name + '.jpg')
    class_dir = os.path.join(train_dir, file_name.split('/')[0])
    os.makedirs(class_dir, exist_ok=True)
    if os.path.isfile(file_path):
        shutil.move(file_path, class_dir)
print(f'[INFO] All the files are moved to train set directory "{train_dir}".')

[INFO] Train set directory "raw_data/split_data/images/train" created.


100%|██████████| 75751/75751 [01:29<00:00, 850.38it/s] 

[INFO] All the files are moved to train set directory "raw_data/split_data/images/train".





In [5]:
# Creating a val set directory
if not os.path.exists(val_dir):
    os.makedirs(val_dir)
    print(f'[INFO] Val set directory "{val_dir}" created.')
    
# Moving the files in Val dir
for file_name in tqdm(val_list):
    file_path = os.path.join(raw_images_dir, file_name + '.jpg')
    class_dir = os.path.join(val_dir, file_name.split('/')[0])
    os.makedirs(class_dir, exist_ok=True)
    if os.path.isfile(file_path):
        shutil.move(file_path, class_dir)
print(f'[INFO] All the files are moved to val set directory "{val_dir}".')

[INFO] Val set directory "raw_data/split_data/images/val" created.


100%|██████████| 25251/25251 [00:32<00:00, 780.02it/s] 

[INFO] All the files are moved to val set directory "raw_data/split_data/images/val".





In [6]:
# Walk through the train directory
for dirpath, dirnames, filenames in os.walk(train_dir):
    print(f'There are {len(dirnames)} directories and {len(filenames)} images in "{dirpath}".')

There are 101 directories and 0 images in "raw_data/split_data/images/train".
There are 0 directories and 750 images in "raw_data/split_data/images/train/chocolate_mousse".
There are 0 directories and 750 images in "raw_data/split_data/images/train/ceviche".
There are 0 directories and 750 images in "raw_data/split_data/images/train/shrimp_and_grits".
There are 0 directories and 750 images in "raw_data/split_data/images/train/hot_and_sour_soup".
There are 0 directories and 750 images in "raw_data/split_data/images/train/pork_chop".
There are 0 directories and 750 images in "raw_data/split_data/images/train/foie_gras".
There are 0 directories and 750 images in "raw_data/split_data/images/train/grilled_cheese_sandwich".
There are 0 directories and 750 images in "raw_data/split_data/images/train/poutine".
There are 0 directories and 750 images in "raw_data/split_data/images/train/huevos_rancheros".
There are 0 directories and 750 images in "raw_data/split_data/images/train/apple_pie".
The

In [7]:
# Walk through the val directory
for dirpath, dirnames, filenames in os.walk(val_dir):
    print(f'There are {len(dirnames)} directories and {len(filenames)} images in "{dirpath}".')

There are 101 directories and 0 images in "raw_data/split_data/images/val".
There are 0 directories and 250 images in "raw_data/split_data/images/val/chocolate_mousse".
There are 0 directories and 250 images in "raw_data/split_data/images/val/ceviche".
There are 0 directories and 250 images in "raw_data/split_data/images/val/shrimp_and_grits".
There are 0 directories and 250 images in "raw_data/split_data/images/val/hot_and_sour_soup".
There are 0 directories and 250 images in "raw_data/split_data/images/val/pork_chop".
There are 0 directories and 250 images in "raw_data/split_data/images/val/foie_gras".
There are 0 directories and 250 images in "raw_data/split_data/images/val/grilled_cheese_sandwich".
There are 0 directories and 250 images in "raw_data/split_data/images/val/poutine".
There are 0 directories and 250 images in "raw_data/split_data/images/val/huevos_rancheros".
There are 0 directories and 250 images in "raw_data/split_data/images/val/apple_pie".
There are 0 directories a

## Collecting Data for Annotation

### 1. 10% Data from 5 Classes

In [8]:
# Creating a function that moves files randomely.
def move_random_files(src_dir: str, dst_dir: str, class_list: list, files_count: int):
    """
    Moves random files from source to destination for every classes.
    Parameters:
        src_dir: str, A path of the source directory.
        dst_dir: str, A path of the destination directory.
        class_list: list, A class list containing the names of the classes to move the files from source.
        files_count: int, Total number of files that need to be moved randomely.
    """
    # Checking the dst directory and creating it
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
        print(f'[INFO] "{dst_dir}" Directory is been created.')
        
    # Looping through classes and moving the files
    for class_names in tqdm(class_list):
        # List of the images paths
        image_path_list = [os.path.join(src_dir, class_names, i) for i in os.listdir(os.path.join(src_dir, class_names))]
        
        # Selecting the random sample
        rand_images_list = random.sample(image_path_list, files_count)
        
        # Class directory in dst dir
        dst_class_dir = os.path.join(dst_dir, class_names)
        os.makedirs(dst_class_dir)
        
        # Moving the files using the random samples list
        for file in tqdm(rand_images_list):
            if os.path.isfile(file):
                shutil.move(file, dst_class_dir)
    print(f'[INFO] All the {files_count} files for {len(class_list)} classes have been moved from "{src_dir}" to "{dst_dir}".')

In [10]:
# Selecting 5 classes
classes5_list = ['chicken_curry', 'chocolate_cake', 'hamburger', 'pizza', 'ramen']
classes5_list

['chicken_curry', 'chocolate_cake', 'hamburger', 'pizza', 'ramen']

In [11]:
train_5classes_10percent = 'raw_data/batch_data/5classes_10percent/images/train'
val_5classes_10percent = 'raw_data/batch_data/5classes_10percent/images/val'
train_image_count = int(750 * 0.1)
val_image_count = int(250 * 0.1)

In [12]:
# Collecting 10% data from train split using the function
move_random_files(src_dir=train_dir,
                  dst_dir=train_5classes_10percent,
                  class_list=classes5_list,
                  files_count=train_image_count)

[INFO] "raw_data/batch_data/5classes_10percent/images/train" Directory is been created.


  0%|          | 0/5 [00:00<?, ?it/s]
100%|██████████| 75/75 [00:00<00:00, 966.79it/s]

100%|██████████| 75/75 [00:00<00:00, 2239.62it/s]
 40%|████      | 2/5 [00:00<00:00, 14.55it/s]
100%|██████████| 75/75 [00:00<00:00, 1728.69it/s]

100%|██████████| 75/75 [00:00<00:00, 1868.42it/s]
 80%|████████  | 4/5 [00:00<00:00, 17.12it/s]
100%|██████████| 75/75 [00:00<00:00, 2214.66it/s]
100%|██████████| 5/5 [00:00<00:00, 17.62it/s]

[INFO] All the 75 files for 5 classes have been moved from "raw_data/split_data/images/train" to "raw_data/batch_data/5classes_10percent/images/train".





In [13]:
# Collecting 10% data from val split using the function
move_random_files(src_dir=val_dir,
                  dst_dir=val_5classes_10percent,
                  class_list=classes5_list,
                  files_count=val_image_count)

[INFO] "raw_data/batch_data/5classes_10percent/images/val" Directory is been created.


  0%|          | 0/5 [00:00<?, ?it/s]
100%|██████████| 25/25 [00:00<00:00, 1686.25it/s]

100%|██████████| 25/25 [00:00<00:00, 2044.84it/s]

100%|██████████| 25/25 [00:00<00:00, 2128.35it/s]

100%|██████████| 25/25 [00:00<00:00, 2160.01it/s]

100%|██████████| 25/25 [00:00<00:00, 2238.97it/s]
100%|██████████| 5/5 [00:00<00:00, 57.62it/s]

[INFO] All the 25 files for 5 classes have been moved from "raw_data/split_data/images/val" to "raw_data/batch_data/5classes_10percent/images/val".





In [14]:
# Walk through the 10% data directory
for dirpath, dirnames, filenames in os.walk('raw_data/batch_data/5classes_10percent/images'):
    print(f'There are {len(dirnames)} directories and {len(filenames)} images in "{dirpath}".')

There are 2 directories and 0 images in "raw_data/batch_data/5classes_10percent/images".
There are 5 directories and 0 images in "raw_data/batch_data/5classes_10percent/images/train".
There are 0 directories and 75 images in "raw_data/batch_data/5classes_10percent/images/train/chicken_curry".
There are 0 directories and 75 images in "raw_data/batch_data/5classes_10percent/images/train/ramen".
There are 0 directories and 75 images in "raw_data/batch_data/5classes_10percent/images/train/hamburger".
There are 0 directories and 75 images in "raw_data/batch_data/5classes_10percent/images/train/pizza".
There are 0 directories and 75 images in "raw_data/batch_data/5classes_10percent/images/train/chocolate_cake".
There are 5 directories and 0 images in "raw_data/batch_data/5classes_10percent/images/val".
There are 0 directories and 25 images in "raw_data/batch_data/5classes_10percent/images/val/chicken_curry".
There are 0 directories and 25 images in "raw_data/batch_data/5classes_10percent/ima

In [16]:
# Archiving the batch data of 10% data.
shutil.make_archive(base_name='5classes_10percent', 
                    format='zip', 
                    root_dir='raw_data/batch_data/5classes_10percent/images', 
                    verbose=1)

'/notebooks/5classes_10percent.zip'