# Imports

In [None]:
#the following five imports have to be installed
from PIL import Image
import numpy as np
import tensorflow as tf
import pandas as pd
from tqdm import tqdm

import pandas.util
import os
import random
import shutil

# Hyperparameter

In [None]:
# Folder holding the preprocessed images. It must already exist; its content
# is listed further down to build the image table.
IMAGE_FOLDER = "images/images_landmarks/"
# .npy file with the landmark data, loaded further down via np.load.
LANDMARK_PATH = "files/preprocessed_landmarks.npy"

# Output folders of the three data splits. Each one is created, or emptied if
# it already exists, before any image is written.
TEST_FOLDER = 'images/test/'
TRAIN_FOLDER ='images/train/'
VAL_FOLDER = 'images/val/'

# Number of "<name>_t<i>.jpg" files expected per base image name
# (i runs from 0 to NUM_TRANSLATIONS - 1 when the split is built).
NUM_TRANSLATIONS = 6

# Pickle files the final test/train/validation dataframes are saved to.
TEST_PKL = "files/test_landmarks_df.pkl"
TRAIN_PKL = "files/train_landmarks_df.pkl"
VAL_PKL = "files/val_landmarks_df.pkl"

In [None]:
# Abort early when the inputs of this notebook are missing.
if not os.path.exists(IMAGE_FOLDER):
    raise FileNotFoundError('The preprocessing of the images (landmarks_image_preprocessing.ipynb) has to be conducted first or the name of the IMAGE_FOLDER has to be adjusted.')

if not os.path.exists(LANDMARK_PATH):
    raise FileNotFoundError('The preprocessing of the images (landmarks_image_preprocessing.ipynb) has to be conducted first or the name of the LANDMARK_PATH has to be adjusted.')

# Every run starts with empty split folders: an existing folder is removed
# together with its content and then created again, a missing one is created.
for split_folder in (TEST_FOLDER, TRAIN_FOLDER, VAL_FOLDER):
    if os.path.exists(split_folder):
        shutil.rmtree(split_folder)
    os.mkdir(split_folder)

## Load coordinate information of landmarks and filenames

In [None]:
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so only load
# landmark files that were produced by a trusted source.
# Indexing with [()] unwraps the object stored in the 0-d array; the copy keeps
# later additions to `landmarks` separate from the loaded object.
stored_landmarks = np.load(LANDMARK_PATH, allow_pickle=True)
landmarks = stored_landmarks[()].copy()

# One dataframe row per landmark entry; the key is also kept as a column.
df_landmarks = pd.DataFrame.from_dict(landmarks, orient='index')
df_landmarks['filename'] = df_landmarks.index

# Base names: every key with its trailing "_t<i>.jpg"-sized suffix cut off,
# de-duplicated through a set and then brought into random order.
suffix_length = len("_t0.jpg")
unique_files = list({name[:-suffix_length] for name in landmarks})
random.shuffle(unique_files)

# write picture information in file

In [None]:
# Record every file found in IMAGE_FOLDER in "images.txt", one line per file:
#   <filename>;<characters 3..5 of the filename>;<IMAGE_FOLDER + filename>
# The middle field is read back as the 'classname' column later on.
filelist = os.listdir(IMAGE_FOLDER)

# The context manager closes the file even if a write fails half-way.
with open("images.txt", "w+") as f:
    for image_name in filelist:
        f.write(f"{image_name};{image_name[3:6]};{IMAGE_FOLDER}{image_name}\n")

# Create pandas dataframe from text file

In [None]:
# Load the semicolon-separated listing and use the file name as row label.
df = pd.read_csv(
    "images.txt",
    sep=';',
    names=['filename', 'classname', 'image_path'],
    encoding='unicode_escape',
).set_index('filename')

# Quick sanity check of the loaded table.
print(df.head())
print("shape of data frame: ", df.shape)

## Concatenate both dataframes

In [None]:
# Put image information and landmark information side by side; rows are
# aligned on their index labels (the file names).
frames = [df, df_landmarks]
df_all = pd.concat(objs=frames, axis="columns", sort=False)

# create training, validation and test dataframes

In [None]:
# Split sizes, counted in base images (unique_files is already shuffled, so
# taking consecutive slices yields a random split).
num_imgs = len(unique_files)                  # total number of base images
num_test = 100                                # test set, used only once at the end
num_val = 100                                 # validation set, monitors training progress
num_train = num_imgs - (num_test + num_val)   # training set gets the rest

assert num_train > 0, "Error: examples consumed by test and validation sets alone"

# Expand every base name into its NUM_TRANSLATIONS file names; all files of
# one base image stay adjacent and therefore end up in the same split.
all_filenames = [f"{filename}_t{i}.jpg" for filename in unique_files for i in range(NUM_TRANSLATIONS)]

# Slice boundaries, counted in files instead of base images.
test_end = NUM_TRANSLATIONS * num_test
val_end = NUM_TRANSLATIONS * (num_test + num_val)

test_indices = all_filenames[:test_end]
val_indicies = all_filenames[test_end:val_end]
train_indicies = all_filenames[val_end:]

# Select the rows of each split by file name.
test1_df = df_all.loc[test_indices]
val1_df = df_all.loc[val_indicies]
train1_df = df_all.loc[train_indicies]

## Function for image augmentation

In [None]:
def augmentation(filename, path, outfile):
    """Write eight augmented variants of one image plus a copy of the image.

    The image at ``path`` is decoded into three channels. For every variant
    the module-level ``landmarks`` dict gets a new entry that points to the
    same landmark object as ``landmarks[filename]``.

    Args:
        filename: Name of the source image; its last four characters are
            replaced by the variant suffix (e.g. "hue1.jpg").
        path: Location of the source image file.
        outfile: String prefix (the target folder) put in front of every
            written file name.

    Returns:
        None. The results are the written files and the new ``landmarks``
        entries.
    """
    source = tf.image.decode_png(tf.io.read_file(path), channels=3)
    stem = filename[0:-4]

    # (file name suffix, factory producing the augmented image). The factories
    # are called in table order, so the random ops run in a fixed sequence.
    variants = (
        ("brightness1.jpg", lambda: tf.image.random_brightness(source, max_delta=0.2)),
        ("brightness2.jpg", lambda: tf.image.random_brightness(source, max_delta=0.2)),
        ("saturation1.jpg", lambda: tf.image.random_saturation(source, lower=0.95, upper=1.05)),
        ("saturation2.jpg", lambda: tf.image.random_saturation(source, lower=0.95, upper=1.05)),
        ("hue1.jpg", lambda: tf.image.random_hue(source, max_delta=0.2)),
        ("hue2.jpg", lambda: tf.image.random_hue(source, max_delta=0.2)),
        ("gray.jpg", lambda: tf.image.rgb_to_grayscale(source)),
        ("contrast.jpg", lambda: tf.image.adjust_contrast(source, 0.6)),
    )

    for suffix, make_variant in variants:
        variant_name = stem + suffix
        # The data is PNG-encoded although the file name ends in ".jpg".
        encoded = tf.image.encode_png(make_variant())
        tf.io.write_file(tf.constant(outfile + variant_name), encoded)
        landmarks[variant_name] = landmarks[filename]

    # Finally store the (re-encoded) unmodified image in the same folder.
    tf.io.write_file(tf.constant(outfile + filename), tf.image.encode_png(source))

# augmentation of images 
## (except test and val images)

In [None]:
#Testbilder müssen nicht augmentiert werden
for i in tqdm(range(0,len(test1_df)), desc = "Copy test images", unit = "images"):
    filename = test1_df['filename'][i]
    path = test1_df['image_path'][i]
    img = tf.io.read_file(path)
    tf.io.write_file(tf.constant(TEST_FOLDER+filename), img)
    
for i in tqdm(range(0,len(val1_df)), desc = "Copy validation images", unit = "images"):
    filename = val1_df["filename"][i]
    path = val1_df['image_path'][i]
    img = tf.io.read_file(path)
    tf.io.write_file(tf.constant(VAL_FOLDER+filename), img)
    #augmentation(filename, val1_df['image_path'][i], VAL_FOLDER)
    
for i in tqdm(range(0,len(train1_df)), desc = "Augment train images", unit = "images"):
    filename = train1_df['filename'][i]
    augmentation(filename, train1_df['image_path'][i], TRAIN_FOLDER)

## update dataframe 

In [None]:
# Rebuild the landmark table from the dict, which now also contains the
# entries added during augmentation: keys become columns first and are then
# flipped into rows.
df_landmarks_augmented = pd.DataFrame(landmarks).T

## Write new images into text files and read them into pandas dataframes

In [None]:
# Files that actually ended up in each split folder.
test_files = os.listdir(TEST_FOLDER)
val_files = os.listdir(VAL_FOLDER)
train_files = os.listdir(TRAIN_FOLDER)

# Write one listing per split, one line per file:
#   <filename>;<characters 3..5 of the filename>;<folder + filename>
# The context manager closes each file even if a write fails half-way.
for listing_path, folder, names in (
    ("test.txt", TEST_FOLDER, test_files),
    ("val.txt", VAL_FOLDER, val_files),
    ("train.txt", TRAIN_FOLDER, train_files),
):
    with open(listing_path, "w+") as f:
        f.writelines(f"{name};{name[3:6]};{folder}{name}\n" for name in names)

# Create pandas dataframes from the semicolon-separated listings, using the
# file name as row label.
listing_columns = ['filename', 'classname', 'image_path']

test2_df = pd.read_csv("test.txt", sep=';', names=listing_columns, encoding='unicode_escape')
test2_df = test2_df.set_index('filename')

val2_df = pd.read_csv("val.txt", sep=';', names=listing_columns, encoding='unicode_escape')
val2_df = val2_df.set_index('filename')

train2_df = pd.read_csv("train.txt", sep=';', names=listing_columns, encoding='unicode_escape')
train2_df = train2_df.set_index('filename')

# From here on the split sizes count files on disk, not base images.
num_test = test2_df.shape[0]
num_train = train2_df.shape[0]
num_val = val2_df.shape[0]

## update test, val and train dataframes

In [None]:
# Test split: combine the rows selected before copying with the listing of the
# test folder. Only the path from the listing (suffix "_y") is kept, under the
# plain name 'image_path'.
df_test = (
    pd.merge(test1_df, test2_df, how='right', on=['filename', 'classname'])
    .drop(columns='image_path_x')
    .rename(columns={'image_path_y': 'image_path'})
)

# Validation split: one row per listed file, joined with the landmark table
# on the file name (the index of both frames).
df_val = df_landmarks_augmented.merge(val2_df, how='right', left_index=True, right_index=True)
df_val['filename'] = df_val.index

# Training split: same join, which also covers the augmented files.
df_train = df_landmarks_augmented.merge(train2_df, how='right', left_index=True, right_index=True)
df_train['filename'] = df_train.index

## add 'class' column

In [None]:
# Number the class names found in the training data 0 .. K-1, in order of
# first appearance.
classnames = df_train['classname'].unique()
K = classnames.size
name2class = {name: number for number, name in enumerate(classnames)}
print("names and classes:", name2class)

# Add a numeric 'class' column to every split, looked up from 'classname'.
for split_df in (df_test, df_val, df_train):
    split_df['class'] = split_df['classname'].map(name2class)

# save dataframes for further use

In [None]:
# Persist the three final dataframes for the following notebooks.
for split_df, pickle_path in (
    (df_test, TEST_PKL),
    (df_val, VAL_PKL),
    (df_train, TRAIN_PKL),
):
    split_df.to_pickle(pickle_path)