In [None]:
import os
import numpy as np
import shutil
import random
from pathlib import Path
import albumentations as A
import cv2
import tarfile
from urllib import request

In [None]:
def list_files_in_folder(image_folder):
    """Return the names of the regular files located directly in ``image_folder``.

    Sub-directories are left out. Only bare file names (not full paths) are
    returned, in the order produced by ``os.listdir``.
    """
    return [
        entry
        for entry in os.listdir(image_folder)
        if os.path.isfile(os.path.join(image_folder, entry))
    ]

def create_save_dir(direct, name_subdirectory):
    """Make sure ``direct/name_subdirectory`` exists and return its path.

    Args:
        direct: Parent directory (it is created as well if it is missing).
        name_subdirectory: Name of the directory to create inside ``direct``.

    Returns:
        The joined path ``os.path.join(direct, name_subdirectory)``.
    """
    save_dir = os.path.join(direct, name_subdirectory)
    if not os.path.exists(save_dir):
        print('make dir')
        # makedirs also creates missing parent directories, and exist_ok
        # tolerates the directory appearing between the check and the call.
        os.makedirs(save_dir, exist_ok=True)
    return save_dir

In [None]:
# Load dataset from https://github.com/fastai/imagenette
remote_url = "https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz"
local_folder = './imagenette2.tgz'  # despite the name, this is the path of the archive *file*
if not os.path.isfile(local_folder):
    print('Loading data from https://github.com/fastai/imagenette under Imagenette->Full size download')
    # Download under a temporary name and rename only once the transfer has
    # finished. Otherwise an interrupted download would leave a truncated
    # archive behind that the isfile() guard above accepts on the next run.
    partial_download = local_folder + '.part'
    try:
        request.urlretrieve(remote_url, partial_download)
        os.replace(partial_download, local_folder)
    finally:
        # Only still present if the download or the rename failed
        if os.path.exists(partial_download):
            os.remove(partial_download)

In [None]:
Imagenette_folder = './imagenette2'
valid_folder = os.path.join(Imagenette_folder, 'val')
test_folder = os.path.join(Imagenette_folder, 'test')

# Extract only when the dataset folder does not exist yet. Re-extracting on a
# re-run would restore the train images that were already moved to the
# validation split (duplicating them across train/valid) and re-create 'val'.
# Delete ./imagenette2 manually to force a clean re-extraction.
if not os.path.isdir(Imagenette_folder):
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): on Python >= 3.12 consider tar.extractall(filter='data').
    with tarfile.open('./imagenette2.tgz', "r:gz") as tar:
        tar.extractall()

# Rename valid folder to test folder (only once: moving onto an existing
# 'test' directory would nest the folder as 'test/val' instead of renaming it)
if os.path.isdir(valid_folder) and not os.path.exists(test_folder):
    shutil.move(valid_folder, test_folder)

In [None]:
# Move 29.3% of the images of every class from the original Imagenette train
# set into a new 'valid' folder, so that the validation set is approximately
# 20% of the data. Skipped when 'valid' already contains something.
train_folder = os.path.join(Imagenette_folder, 'train')
valid_folder = create_save_dir(Imagenette_folder, 'valid')
if not os.listdir(valid_folder):
    for class_dir in Path(train_folder).iterdir():
        if not class_dir.is_dir():
            continue
        class_folder_valid = create_save_dir(valid_folder, str(class_dir.parts[-1]))
        class_files = list_files_in_folder(str(class_dir))
        # Number of files to move, rounded to the nearest integer
        to_move = int(np.around(0.293*len(class_files)))
        # The first `to_move` names in listing order go to the validation split
        for file_name in class_files[:to_move]:
            shutil.move(os.path.join(str(class_dir), file_name),
                        os.path.join(class_folder_valid, file_name))



In [None]:
# Name of the class sub-folder whose images are used as key instances
# (images of golf ball). This folder is skipped when class folders are
# flattened and is read separately when positive bags are composed.
key_ins_class = 'n03445777'

In [None]:
# Move all images to main folder except key instances (images of golf ball)
def move_img_together(folder):
    """Flatten ``folder`` by moving files out of its class sub-folders.

    Every file found directly inside a sub-directory of ``folder`` is moved
    into ``folder`` itself. The sub-directory named ``key_ins_class`` is left
    untouched. Emptied sub-directories are not removed.
    """
    for class_dir in Path(folder).iterdir():
        if not class_dir.is_dir() or class_dir.name == key_ins_class:
            continue
        for file_name in list_files_in_folder(str(class_dir)):
            shutil.move(os.path.join(str(class_dir), file_name),
                        os.path.join(folder, file_name))

# Flatten the non-key classes of every split (same order: valid, train, test)
for _split_folder in (valid_folder, train_folder, test_folder):
    move_img_together(_split_folder)

### Create dataset with selected settings

In [None]:
# Augmentation pipeline applied to every image before it is written into a bag.
# Transforms are referenced through the top-level `A.` namespace rather than
# `A.augmentations.transforms`, whose contents have been reorganised between
# albumentations releases. `always_apply=False` was the default and is omitted.
transform = A.Compose([
    # cv2.INTER_LINEAR == 1, i.e. the same interpolation as before
    A.Resize(height=112, width=112, interpolation=cv2.INTER_LINEAR, p=1),
    A.Rotate(limit=25, p=1.0),
    A.GaussNoise(var_limit=(2.0, 5.0), p=0.1),
    A.HorizontalFlip(p=0.5),
    A.Blur(blur_limit=5, p=0.05),
    A.RandomBrightnessContrast(p=0.8),
    A.MotionBlur(blur_limit=(3,3), p=0.05),
    A.RGBShift(r_shift_limit=20, g_shift_limit=20, b_shift_limit=20, p=0.9),
    A.RandomFog(fog_coef_lower=0.3, fog_coef_upper=0.4, alpha_coef=0.08, p=0.05),
])

In [None]:
def auxiliary_function(count_bags, i_range_min, i_range_max, max_num_bags_of_type, dir_data, 
                       max_num_inst_of_type, img_names_list, save_f):
    """Fill consecutive bag folders with augmented images sampled from ``img_names_list``.

    Images are drawn with ``random.choice`` (i.e. with replacement), read from
    ``dir_data``, passed through the module-level ``transform`` and written to
    ``save_f/<bag number, zero-padded to 4 digits>/<name>_<index>.jpg``.
    Bags numbered ``count_bags`` up to and including ``max_num_bags_of_type``
    each receive ``max_num_inst_of_type`` images.

    Args:
        count_bags: Number of the first bag to fill.
        i_range_min: First value of the running image index used in file names.
        i_range_max: Exclusive upper bound of the running image index.
        max_num_bags_of_type: Number of the last bag to fill (inclusive).
        dir_data: Directory containing the source images.
        max_num_inst_of_type: Number of images to write into each bag.
        img_names_list: File names (relative to ``dir_data``) to sample from.
        save_f: Directory in which the bag folders are created.

    Returns:
        The running index of the last image written, or ``i_range_min`` when
        no image was written.

    Raises:
        IOError: If a sampled image cannot be read by ``cv2.imread``.
    """
    # Defined up front so the function also returns cleanly when nothing is written
    last_used_index_in_list = i_range_min
    if max_num_inst_of_type <= 0:
        # No instances requested per bag. Without this guard the counter check
        # below would only fire once and the loop would keep writing images.
        return last_used_index_in_list

    count_instances = 0
    for i in range(i_range_min, i_range_max): 
        if count_instances >= max_num_inst_of_type: 
            # Current bag is full: move on to the next one or stop
            count_bags += 1
            if count_bags > max_num_bags_of_type: 
                break
            count_instances = 0

        random_name = random.choice(img_names_list)

        src = os.path.join(dir_data, random_name) 
        bag_folder = create_save_dir(save_f, str(count_bags).zfill(4))

        dst = os.path.join(bag_folder, random_name)  
        image = cv2.imread(src)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising
            raise IOError('Could not read image: {}'.format(src))
        # Augment an image
        transformed_image = transform(image=image)["image"]

        cv2.imwrite(dst+'_'+str(i).zfill(6)+'.jpg', transformed_image)

        count_instances += 1
        last_used_index_in_list = i    
    return last_used_index_in_list

    
def compose_dataset(dir_data, save_subfolder, num_bags, num_instances_in_bag, percent_key_instances):
    """Build positive and negative bags of augmented images under ``save_subfolder``.

    Positive bags go to ``save_subfolder/positive``. Each one contains
    ``ceil(num_instances_in_bag * percent_key_instances / 100)`` key instances,
    sampled from ``dir_data/key_ins_class`` and saved with a ``keyins_`` file
    name prefix, plus images sampled from the files directly inside
    ``dir_data``. Negative bags go to ``save_subfolder/negative`` and contain
    ``num_instances_in_bag`` images from ``dir_data`` only. All sampling uses
    ``random.choice`` (with replacement).

    ``ceil(num_bags / 2)`` bags of each kind are produced, so an odd
    ``num_bags`` yields ``num_bags + 1`` bags in total. Positive bag folders
    are numbered from ``0000``; negative bag numbers continue after the last
    positive bag.

    Args:
        dir_data: Directory with the non-key images and the key-class sub-folder.
        save_subfolder: Output directory for the ``positive``/``negative`` folders.
        num_bags: Overall number of bags to create (at least 1).
        num_instances_in_bag: Number of images per bag (at least 1).
        percent_key_instances: Percentage (0-100) of key instances in positive bags.

    Raises:
        ValueError: If an argument is out of range or a needed image list is empty.
        IOError: If an image cannot be read.
    """
    # Validate first: out-of-range values made the sampling loops never terminate
    if not 0 <= percent_key_instances <= 100:
        raise ValueError('percent_key_instances must be between 0 and 100, got {}'
                         .format(percent_key_instances))
    if num_bags < 1 or num_instances_in_bag < 1:
        raise ValueError('num_bags and num_instances_in_bag must be at least 1')

    save_f = create_save_dir(save_subfolder, 'positive')
    num_key_ins_per_bag = int(np.ceil((num_instances_in_bag*percent_key_instances)/100))
    if num_key_ins_per_bag<1:
        # Positive bags will not contain any key instance
        print('WARNING!')

    img_names_list = list_files_in_folder(dir_data)
    if not img_names_list:
        raise ValueError('No image files found in {}'.format(dir_data))
    random.shuffle(img_names_list)

    bags_per_type = int(np.ceil(num_bags/2))

    '''Positive bags'''
    # Negative instances in positive bags
    last_used_index_in_list = 0
    num_negative_ins_per_bag = num_instances_in_bag-num_key_ins_per_bag
    if num_negative_ins_per_bag > 0:
        # Skipped when positive bags consist of key instances only
        last_used_index_in_list = auxiliary_function(0, 0, int(1e20), bags_per_type-1, dir_data,
                                                     num_negative_ins_per_bag, img_names_list, save_f)

    # Key instances in positive bags
    if num_key_ins_per_bag > 0:
        key_dir = os.path.join(dir_data, key_ins_class)
        key_instance_list = list_files_in_folder(key_dir)
        if not key_instance_list:
            raise ValueError('No key-instance images found in {}'.format(key_dir))
        key_index = 0  # running index used in the file names
        for bag_number in range(bags_per_type):
            bag_folder = create_save_dir(save_f, str(bag_number).zfill(4))
            for _ in range(num_key_ins_per_bag):
                random_name_key = random.choice(key_instance_list)
                src = os.path.join(key_dir, random_name_key)
                image = cv2.imread(src)
                if image is None:
                    # cv2.imread signals failure by returning None instead of raising
                    raise IOError('Could not read image: {}'.format(src))
                # Augment an image
                transformed_image = transform(image=image)["image"]
                cv2.imwrite(os.path.join(bag_folder, 'keyins'+'_'+random_name_key+'_'+str(key_index).zfill(6)+'.jpg'), 
                            transformed_image)
                key_index += 1

    '''Negative bags'''
    save_f = create_save_dir(save_subfolder, 'negative')
    # Negative bag numbers continue where the positive ones stopped
    count_bags_positive = bags_per_type
    auxiliary_function(count_bags_positive, last_used_index_in_list, int(1e20),
                       count_bags_positive+bags_per_type-1, dir_data,
                       num_instances_in_bag, img_names_list, save_f)

In [None]:
# Choose settings
number_of_folds = 1 # Select how many versions of the dataset with the same parameters to create (here referred to as fold)
num_instances_in_bag = 50 # Select number of instances in one bag
percent_key_instances = 30 # Select percent of key instances in positive bags
num_bags_train = 10 # Overall number of bags for training (num_bags_train/2 positive num_bags_train/2 negative)
num_bags_valid = 4 # Overall number of bags for validation
num_bags_test = 10 # Overall number of bags for test

for fold in range(1,number_of_folds+1):
    fold_dir = create_save_dir('./', 'fold'+str(fold))
    # Dataset folder name encodes the settings: Imagenette_<bags>_<instances>_<percent>
    dataset_name = '_'.join(['Imagenette',
                             str(num_bags_train).zfill(4),
                             str(num_instances_in_bag).zfill(4),
                             str(percent_key_instances).zfill(4)])
    save_folder = create_save_dir(fold_dir, dataset_name)

    # Train, validation and test splits are composed the same way, in this order
    for split_name, split_num_bags in (('train', num_bags_train),
                                       ('valid', num_bags_valid),
                                       ('test', num_bags_test)):
        dir_data = os.path.join(Imagenette_folder, split_name)
        save_subfolder = create_save_dir(save_folder, split_name)
        compose_dataset(dir_data, save_subfolder, split_num_bags, num_instances_in_bag, percent_key_instances)

    # NOTE(review): not read by any visible cell; kept so the global stays defined
    num_bags = num_bags_train