# Imports

In [None]:
import os
import random
import os
import cv2
import math
import glob
import shutil
import pandas.util

#the following five imports have to be installed
from PIL import Image
import numpy as np
import tensorflow as tf
import pandas as pd
from tqdm import tqdm

# Hyperparameters

In [None]:
# Folder holding the preprocessed source images (input of this notebook).
IMAGE_FOLDER = 'images/images_classification/'

# Output folders for the three data splits; they are emptied and recreated on every run.
TEST_FOLDER = 'images/test/'
TRAIN_FOLDER = 'images/train/'
VAL_FOLDER = 'images/val/'

# Number of random translations that are applied to each segment.
# File names carry the translation index t0 .. t<NUM_TRANSLATIONS>, i.e. NUM_TRANSLATIONS + 1 files per segment.
NUM_TRANSLATIONS = 9
# An odd number of segments that determines which parts of an image are cut out.
# Neighbouring segments overlap by 50%.
NUM_SEGMENTS = 3

# Fraction of the non-test images that is moved into the validation set;
# the remaining non-test images form the training set.
VAL_TRAIN_RATIO = 1/10

# Pickle files the final data frames are written to.
TEST_PKL = 'files/classification_test.pkl'
TRAIN_PKL = 'files/classification_train.pkl'
VAL_PKL = 'files/classification_val.pkl'

## Check that all required files and folders exist

In [None]:
# The source images are produced by a separate notebook; without them nothing can be split.
if not os.path.exists(IMAGE_FOLDER):
    raise FileNotFoundError('The preprocessing of the images (classification_image_preprocessing.ipynb) has to be conducted first or the name of the IMAGE_FOLDER has to be adjusted.')

# Start every run from empty output folders: remove whatever an earlier run left
# behind, then (re)create the folder. os.makedirs is used for all three folders so
# that missing parent directories are created as well.
for output_folder in (TEST_FOLDER, TRAIN_FOLDER, VAL_FOLDER):
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
    os.makedirs(output_folder)

## Write picture information to a file

In [None]:
filelist = os.listdir(IMAGE_FOLDER)

# Write one line per image: "<filename>;<classname>;<image path>".
# The class name is taken from characters 3..5 of the file name.
# The context manager guarantees the file is closed even if a write fails.
with open("images.txt", "w+") as f:
    for image_name in filelist:
        f.write(f"{image_name};{image_name[3:6]};{IMAGE_FOLDER}{image_name}\n")

## Create a pandas DataFrame from the text file

In [None]:
# Read the semicolon-separated listing "images.txt" and index the rows by file name,
# so that rows can later be selected with df.loc[<list of file names>].
df = pd.read_csv(
    "images.txt",
    sep=';',
    names=['filename', 'classname', 'image_path'],
    encoding='unicode_escape',
).set_index('filename')

# Quick sanity check of the loaded table.
print(df.head())
print("shape of data frame: ", df.shape)

# Determine test images

In [None]:
image_paths = glob.glob(IMAGE_FOLDER + '*.jpg')

# Strip the folder prefix and the trailing "_<clone>_s<segment>_t<translation>.jpg"
# part of every path, leaving only the genet name.
# NOTE(review): the suffix length is taken from "_0_s0_t0.jpg", i.e. this assumes
# single-digit clone, segment and translation indices - confirm against the file names.
images = np.array([key[len(IMAGE_FOLDER):-len("_0_s0_t0.jpg")] for key in image_paths])

# Unique genet names and how many files belong to each of them.
genet, num_genet = np.unique(images, return_counts=True)
# Every original image contributes NUM_SEGMENTS * (NUM_TRANSLATIONS + 1) files,
# so the floor division recovers the number of original images per genet.
num_genet = num_genet // (NUM_SEGMENTS * (NUM_TRANSLATIONS+1))

if np.any(num_genet < 1):
    raise ValueError('At least one genet has fewer image files than NUM_SEGMENTS * (NUM_TRANSLATIONS + 1); '
                     'check NUM_SEGMENTS / NUM_TRANSLATIONS and the contents of IMAGE_FOLDER.')

# Draw one image index per genet uniformly from 1 .. num_genet (upper bound is exclusive).
# Drawing directly with a per-genet upper bound avoids the lowest-common-multiple detour,
# which fails when every genet has exactly one image and can overflow for many distinct counts.
test_idx = np.random.randint(low=1, high=num_genet + 1, size=genet.shape)

# All segment/translation files of the chosen image of every genet form the test set.
test_images = [f"{genet[i]}_{test_idx[i]}_s{s}_t{t}.jpg" for i in range(test_idx.shape[0]) for s in range(NUM_SEGMENTS) for t in range(NUM_TRANSLATIONS+1)]

# Determine train and validation images and create training, validation and test dataframes

In [None]:
# Split the images that are not part of the test set into training and validation sets.
all_filenames = [image_path[len(IMAGE_FOLDER):] for image_path in image_paths] # file names without the folder prefix

# A set makes the membership test O(1) per file instead of scanning the whole test list.
test_image_set = set(test_images)
non_test_filenames = [image for image in all_filenames if image not in test_image_set]
random.shuffle(non_test_filenames) # random order, so the slices below are random samples

# VAL_TRAIN_RATIO is applied to all non-test images; the result is truncated to an int.
num_val = int(VAL_TRAIN_RATIO * len(non_test_filenames))

val_images = non_test_filenames[:num_val]   # first num_val shuffled names
train_images = non_test_filenames[num_val:] # everything else

# Construct training, validation and testing data frames by file-name lookup in df.
test_df = df.loc[test_images]
val_df = df.loc[val_images]
train_df = df.loc[train_images]

# Function for image augmentation

In [None]:
def augmentation (filename, path, outfile):
    """Write an image and eight augmented variants of it into a folder.

    The image at ``path`` is decoded to three channels and the following variants
    are written to ``outfile``: two random brightness shifts, two random saturation
    changes, two random hue shifts, a grayscale version and a contrast-reduced
    version. Finally the (re-encoded) image itself is written under ``filename``.

    Variant files are named ``<filename without ".jpg"><variant>.jpg``. All files
    are written with ``tf.image.encode_png``, so they contain PNG data although
    their names end in ".jpg".

    NOTE(review): ``tf.image.decode_png`` is applied to files named "*.jpg" -
    confirm that the TensorFlow version in use accepts the actual input format.

    Args:
        filename: File name of the image, expected to end in ".jpg".
        path: Path the source image is read from.
        outfile: Output folder prefix; it is concatenated directly with the file
            name, so it has to end with a path separator.
    """
    img = tf.io.read_file(path)
    img = tf.image.decode_png(img, channels = 3)

    # File name without the trailing ".jpg"; the variant name is appended to it.
    stem = filename[:-len(".jpg")]

    # (name suffix, transformation) pairs, applied in this order. The random
    # transformations appear twice on purpose to get two different samples.
    variants = (
        ("brightness1", lambda image: tf.image.random_brightness(image, max_delta = 0.2)),
        ("brightness2", lambda image: tf.image.random_brightness(image, max_delta = 0.2)),
        ("saturation1", lambda image: tf.image.random_saturation(image, lower = 0.95, upper = 1.05)),
        ("saturation2", lambda image: tf.image.random_saturation(image, lower = 0.95, upper = 1.05)),
        ("hue1", lambda image: tf.image.random_hue(image, max_delta = 0.2)),
        ("hue2", lambda image: tf.image.random_hue(image, max_delta = 0.2)),
        ("gray", lambda image: tf.image.rgb_to_grayscale(image)),
        ("contrast", lambda image: tf.image.adjust_contrast(image, 0.6)),
    )

    for suffix, transform in variants:
        output_image = tf.image.encode_png(transform(img))
        tf.io.write_file(tf.constant(outfile + stem + suffix + ".jpg"), output_image)

    # Also store the unmodified image next to its augmented variants.
    tf.io.write_file(tf.constant(outfile + filename), tf.image.encode_png(img))

# Augmentation of images 
## (only training images)

In [None]:
# Test and validation images are copied unchanged: the raw file bytes are read
# and written back under the same name in the target folder.
copy_jobs = (
    (test_images, TEST_FOLDER, "Copy test images"),
    (val_images, VAL_FOLDER, "Copy val images "),
)
for names, target_folder, label in copy_jobs:
    for filename in tqdm(names, desc = label, unit = "images"):
        path = IMAGE_FOLDER + filename
        img = tf.io.read_file(path)
        tf.io.write_file(tf.constant(target_folder + filename), img)

# Only the training images are augmented; augmentation() also stores the image itself.
for filename in tqdm(train_images, desc = "Augment train images", unit = "images"):
    augmentation(filename, IMAGE_FOLDER + filename, TRAIN_FOLDER)

## Write the new image lists to text files and read them into pandas DataFrames

In [None]:
# The split folders now contain the final images (including the augmented training
# images), so the file lists are rebuilt from the folder contents.
test_files = os.listdir(TEST_FOLDER)
val_files = os.listdir(VAL_FOLDER)
train_files = os.listdir(TRAIN_FOLDER)

random.shuffle(train_files) # random order of the training samples

# Write one semicolon-separated listing per split:
# "<filename>;<classname>;<image path>", class name = characters 3..5 of the file name.
# The context manager closes each file even if a write fails.
for listing_name, split_folder, split_files in (
    ("test.txt", TEST_FOLDER, test_files),
    ("val.txt", VAL_FOLDER, val_files),
    ("train.txt", TRAIN_FOLDER, train_files),
):
    with open(listing_name, "w+") as f:
        for image_name in split_files:
            f.write(f"{image_name};{image_name[3:6]};{split_folder}{image_name}\n")

# Create one pandas data frame per semicolon-separated listing, indexed by file name.
test_df = pd.read_csv("test.txt", sep = ';', names = ['filename','classname','image_path'], encoding = 'unicode_escape')
test_df = test_df.set_index('filename')

val_df = pd.read_csv("val.txt", sep = ';', names = ['filename','classname','image_path'], encoding = 'unicode_escape')
val_df = val_df.set_index('filename')

train_df = pd.read_csv("train.txt", sep = ';', names = ['filename','classname', 'image_path'], encoding = 'unicode_escape')
train_df = train_df.set_index('filename')

## Add 'class' column

In [None]:
# Associate every class name with a class index (0 .. K-1).
# The names are sorted so that the mapping is the same on every run; without the
# sort it would follow the order of the randomly shuffled training files.
classnames = np.sort(train_df['classname'].unique()) # all class names that occur in the training set
K = classnames.size # number of classes
name2class = dict(zip(classnames, range(K))) # maps a class name to its index in classnames
print("names and classes:", name2class)

# Add a column 'class' with the class index to each data frame.
# The mapping is built from the training set only; a class name that does not
# occur there is mapped to NaN in the test/validation frames.
test_df['class'] = test_df['classname'].map(name2class)
val_df['class'] = val_df['classname'].map(name2class)
train_df['class'] = train_df['classname'].map(name2class)

# Save dataframes for further use

In [None]:
# Make sure the target directories exist; to_pickle does not create them and
# would fail on a fresh checkout otherwise.
for pkl_path in (TEST_PKL, VAL_PKL, TRAIN_PKL):
    pkl_dir = os.path.dirname(pkl_path)
    if pkl_dir: # empty when the pickle is written to the working directory
        os.makedirs(pkl_dir, exist_ok=True)

# Persist the three data frames for the following notebooks.
test_df.to_pickle(TEST_PKL)
val_df.to_pickle(VAL_PKL)
train_df.to_pickle(TRAIN_PKL)