# Single Base Training on GPU1

# Quick Links <a name = "Top"></a>
<ol>
    <li><p><a href = #setup>Set Up</a></p></li>
    <li><p><a href = #Base>Base Training</a></p></li>
</ol>

# Imports

In [1]:
import os
import shutil

# Report the active conda environment. CONDA_DEFAULT_ENV is only set when the
# kernel was started from an activated conda environment, so fall back to a
# placeholder instead of raising KeyError.
print('Current Conda Environment: {}'.format(os.environ.get('CONDA_DEFAULT_ENV', 'unknown (CONDA_DEFAULT_ENV not set)')))

Current Conda Environment: tf_ks


In [2]:
#pip install talos

In [3]:
import talos as ta
from talos.model import lr_normalizer, early_stopper, hidden_layers

import tensorflow as tf
  
available_gpus = tf.config.experimental.list_physical_devices('GPU')
built_with_cuda = tf.test.is_built_with_cuda()

# GPU support needs BOTH a CUDA-enabled build and at least one visible GPU.
# The previous condition `not (not available_gpus) & built_with_cuda` parsed as
# `not ((not available_gpus) & built_with_cuda)` because `&` binds tighter than
# `not`, so it reported GPU support whenever TensorFlow was built without CUDA.
if available_gpus and built_with_cuda:
    print("The installed version of TensorFlow {} includes GPU support.\n".format(tf.__version__))
    print("Num GPUs Available: ", len(available_gpus), "\n")
else:
    print("The installed version of TensorFlow {} does not include GPU support.\n".format(tf.__version__))

# List every local device (CPU and GPUs) with its memory limit.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

from tensorflow.compat.v1.keras import callbacks, backend as K
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16
from tensorflow.keras.utils import multi_gpu_model
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.optimizers import Adam, Nadam, RMSprop, SGD, Adagrad
from tensorflow.keras.layers import ReLU, LeakyReLU

from datetime import datetime
import pandas as pd
import numpy as np

import time

# Seed the NumPy and TensorFlow random number generators with a fixed value.
from numpy.random import seed
seed(1)
tf.random.set_seed(1)

# Build a TF1-style session with explicit GPU memory options and register it
# with the compat.v1 Keras backend.
# NOTE(review): allow_growth and a 0.99 memory fraction are both set here;
# confirm this combination is intended.
# NOTE(review): device_lib.list_local_devices() is called earlier in this cell,
# possibly before these options can apply - TODO confirm, since the Scan cell's
# recorded output shows a cuDNN initialisation failure.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth=True
config.gpu_options.per_process_gpu_memory_fraction = 0.99
sess = tf.compat.v1.Session(config = config)
K.set_session(sess)

Using TensorFlow backend.


The installed version of TensorFlow 2.1.0 includes GPU support.

Num GPUs Available:  2 

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2147201031999958969
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 9105744200
locality {
  bus_id: 1
  links {
  }
}
incarnation: 14094147675888626701
physical_device_desc: "device: 0, name: GeForce RTX 2080 Ti, pci bus id: 0000:17:00.0, compute capability: 7.5"
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 9104897474
locality {
  bus_id: 1
  links {
  }
}
incarnation: 13115759927474630737
physical_device_desc: "device: 1, name: GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5"
]


# Hilfsfunktionen

### Output Directory

* <i>SSD</i>, falls genug Speicher auf SSD im SymLink <i>fast_output</i> verfügbar ist
* <i>HDD</i>, falls möglicherweise zu wenig SSD-Speicher verfügbar ist $\rightarrow$ <i>output</i>

In [4]:
from enum import IntEnum

class OutputDirectory(IntEnum):
    """Storage target for training output; each value is an index into `output_path`."""
    HDD = 0
    SSD = 1


# Directory name per storage target, addressed by the enum value.
output_path = [None] * len(OutputDirectory)
output_path[OutputDirectory.HDD] = 'output'
output_path[OutputDirectory.SSD] = 'fast_output'

### Convert Label_Type into suitable label names.
$\Rightarrow$ Angular / Normalized $\rightarrow$ ['Elevation', 'Azimuth']

$\Rightarrow$ Stereographic $\rightarrow$ ['S_x', 'S_y']

In [5]:
def get_Label_Names(label_type):
    """Return the label column names that belong to a label type.

    Args:
        label_type: 'Angular', 'Normalized' or 'Stereographic'.

    Returns:
        ['Elevation', 'Azimuth'] for 'Angular' / 'Normalized',
        ['S_x', 'S_y'] for 'Stereographic'.

    Raises:
        ValueError: if `label_type` is none of the supported values.
    """
    if label_type == 'Angular' or label_type == 'Normalized':
        return ['Elevation', 'Azimuth']
    elif label_type == 'Stereographic':
        return ['S_x', 'S_y']
    else:
        # The former `assert(True, ...)` could never fail, so an invalid label
        # type silently produced None; fail loudly instead.
        raise ValueError('LabelType Invalid: {!r}'.format(label_type))

### Benutzerdefinierte Kostenfunktion & Metrik

In [6]:
def circular_mse(y_true, y_pred):
    """Mean squared error with wrap-around at 360.

    For each element the smaller of |y_pred - y_true| and 360 - |y_pred - y_true|
    is squared, then averaged over the last axis.
    Presumably the values are angles in degrees - TODO confirm.
    """
    full_circle = tf.constant(360, dtype = 'float32')
    direct_error = K.abs(y_pred - y_true)
    wrapped_error = full_circle - K.abs(y_pred - y_true)
    shortest_error = K.minimum(direct_error, wrapped_error)
    return K.mean(K.square(shortest_error), axis = -1)

def circular_mae(y_true, y_pred):
    """Mean absolute error with wrap-around at 360.

    For each element the smaller of |y_pred - y_true| and
    |360 - |y_pred - y_true|| is taken, then averaged over the last axis.
    Presumably the values are angles in degrees - TODO confirm.
    """
    full_circle = tf.constant(360, dtype = 'float32')
    direct_error = K.abs(y_pred - y_true)
    wrapped_error = K.abs(full_circle - K.abs(y_pred - y_true))
    shortest_error = K.minimum(direct_error, wrapped_error)
    return K.mean(shortest_error, axis = -1)

def custom_mae(y_true, y_pred):
    """Mean absolute error: |y_pred - y_true| averaged over the last axis."""
    absolute_error = K.abs(y_pred - y_true)
    return K.mean(absolute_error, axis = -1)

### Convert String into Reduction Metric Function

In [7]:
def get_Reduction_Metric(metric):
    """Translate a metric name from the hyper-parameter dict into a Keras `metrics` list.

    Args:
        metric: one of 'custom_mae', 'tf.keras.metrics.MeanAbsoluteError()',
            'circular_mae' or 'mean_squared_error'.

    Returns:
        A single-element list holding the metric function, metric object or
        metric name, suitable for `model.compile(metrics = ...)`.

    Raises:
        ValueError: if `metric` is not one of the known names.
    """
    if metric == 'custom_mae':
        return [custom_mae]
    elif metric == 'tf.keras.metrics.MeanAbsoluteError()':
        return [tf.keras.metrics.MeanAbsoluteError()]
    elif metric == 'circular_mae':
        return [circular_mae]
    elif metric == 'mean_squared_error':
        return ['mean_squared_error']

    # The former `assert(False, msg)` asserted a non-empty tuple, which is always
    # true, so unknown metrics silently returned None. Raise explicitly instead.
    raise ValueError(
        'Metric yet unknown ({!r}) - Please modify get_Reduction_Metric to meet your requirements'.format(metric)
    )

### Generierung Bottleneck-Features

In [8]:
def create_bottleneck_features(train_generator, valid_generator, bottleneck):
    """Run both generators through VGG16 (without top) and save the outputs.

    The predictions are written to `_LOG_DIR + 'Train_' + bottleneck` and
    `_LOG_DIR + 'Valid_' + bottleneck` in NumPy `.npy` format.

    Args:
        train_generator: data generator for the training images.
        valid_generator: data generator for the validation images.
        bottleneck: file-name suffix shared by both output files.
    """
    print('Creating bottleneck features...')
    model = VGG16(include_top = False, weights = 'imagenet')

    for prefix, generator in (('Train_', train_generator), ('Valid_', valid_generator)):
        features = model.predict(generator)
        # `with` guarantees the handle is flushed and closed; previously the
        # file object passed to np.save was never closed.
        with open(_LOG_DIR + prefix + bottleneck, 'wb') as feature_file:
            np.save(feature_file, features)

### Generierung Datenpipeline (Angepasst für Talos)

In [9]:
def prepare_data(batch_size, num_samples, label_type):
    """Load labels and (cached) VGG16 bottleneck features for one Talos round.

    The CSV at `_CSV_FILE` is shuffled with a fixed seed and split 80/20 into
    training and validation rows, each truncated to a whole number of batches.
    Bottleneck features are read from `_LOG_DIR`; if either cache file for this
    (num_samples, batch_size) combination is missing, both are regenerated.

    Args:
        batch_size: batch size used to truncate the splits and feed the generators.
        num_samples: total number of samples to draw from the CSV.
        label_type: 'Angular', 'Normalized' or 'Stereographic'.

    Returns:
        (train_labels, valid_labels, train_features, valid_features) as NumPy arrays.
    """
    print(_CSV_FILE)
    df = pd.read_csv(_CSV_FILE)
    df_shuffled = df.sample(frac = 1, random_state = 1)

    # Truncate both splits to a multiple of the batch size.
    num_train = int(num_samples * 0.8 // batch_size * batch_size)
    num_valid = int(num_samples * 0.2 // batch_size * batch_size)
    df_train = df_shuffled.iloc[0 : num_train]
    df_valid = df_shuffled.iloc[df_train.shape[0] : df_train.shape[0] + num_valid]

    # NOTE(review): the labels are ALL columns except the file name, not only
    # get_Label_Names(label_type) - assumes the CSV holds exactly the two label
    # columns; TODO confirm.
    train_labels = df_train.drop(['Filename RGB'], axis = 1).values
    valid_labels = df_valid.drop(['Filename RGB'], axis = 1).values

    bottleneck = 'Bottleneck_Features_{}_{}.npy'.format(str(num_samples), str(batch_size))
    train_feature_path = _LOG_DIR + 'Train_' + bottleneck
    valid_feature_path = _LOG_DIR + 'Valid_' + bottleneck

    # Regenerate when EITHER cache file is missing; checking only the training
    # file crashed on load if the validation file was absent.
    if not (os.path.exists(train_feature_path) and os.path.exists(valid_feature_path)):

        if _USE_DATA_AUGMENTATION:
            train_data_generator = ImageDataGenerator(
                rescale = 1./255,
                width_shift_range = 0.1,
                height_shift_range = 0.1,
                zoom_range = 0.1,
                brightness_range = (0.25, 0.75),
                fill_mode = 'nearest'
            )
        else:
            train_data_generator = ImageDataGenerator(
                rescale = 1./255
            )

        valid_data_generator = ImageDataGenerator(
            rescale = 1./255
        )

        label_names = get_Label_Names(label_type)
        print('Y-Col: {}'.format(label_names))

        def flow(data_generator, dataframe):
            # shuffle = False keeps the feature order aligned with the label arrays above.
            return data_generator.flow_from_dataframe(
                dataframe = dataframe,
                directory = _IMAGE_DIR,
                x_col = 'Filename RGB',
                y_col = label_names,
                class_mode = 'raw',
                target_size = (224, 224),
                color_mode = 'rgb',
                shuffle = False,
                seed = 1,
                batch_size = batch_size
            )

        train_generator = flow(train_data_generator, df_train)
        valid_generator = flow(valid_data_generator, df_valid)

        create_bottleneck_features(train_generator, valid_generator, bottleneck)

    # Loading by path lets NumPy open and close the files itself (no leaked handles).
    train_features = np.load(train_feature_path)
    valid_features = np.load(valid_feature_path)

    return train_labels, valid_labels, train_features, valid_features

### Generierung Modell (Angepasst für Talos)

In [10]:
def grid_model_base(x, y, x_val, y_val, params):
    """Talos model function: build, train and checkpoint one fully connected head.

    The head is trained on the bottleneck features returned by `prepare_data`.
    The `x`, `y`, `x_val` and `y_val` arguments are required by the Talos model
    signature but are not used here.

    Args:
        x, y, x_val, y_val: ignored (data comes from `prepare_data`).
        params: one hyper-parameter combination; reads the keys 'batch_size',
            'samples', 'label_type', 'dropout', 'first_neuron', 'activation',
            'hidden_layers', 'loss_function', 'reduction_metric', 'optimizer',
            'lr', 'monitor_value' and 'epochs'.

    Returns:
        (history, model) - the object returned by `model.fit` and the trained model.

    Raises:
        ValueError: if params['activation'] is neither 'leakyrelu' nor 'relu'.
    """
    global _COUNTER
    K.clear_session()

    # A factory per activation so every use gets its OWN layer instance; adding one
    # shared instance to a Sequential model several times is fragile.
    activation_factories = {
        'leakyrelu': lambda: LeakyReLU(alpha = 0.1),
        'relu': lambda: ReLU(),
    }
    if params['activation'] not in activation_factories:
        # Previously an unknown activation surfaced later as an UnboundLocalError.
        raise ValueError('Unknown activation: {!r}'.format(params['activation']))
    make_activation = activation_factories[params['activation']]

    train_labels, valid_labels, train_features, valid_features = prepare_data(params['batch_size'], params['samples'], params['label_type'])

    dropout_rate = params['dropout']
    first_neuron = params['first_neuron']

    model = Sequential()
    model.add(Flatten(input_shape = train_features.shape[1 :])) # e.g. (7, 7, 512)

    def add_dense_block(units):
        # Dense -> activation -> optional dropout
        model.add(Dense(units = units, kernel_initializer = glorot_uniform(seed = 1)))
        model.add(make_activation())
        if dropout_rate > 0.0:
            model.add(Dropout(rate = dropout_rate))

    add_dense_block(first_neuron)

    # Each additional hidden layer halves the width of the previous one.
    hidden_units = first_neuron
    for _ in range(params['hidden_layers']):
        hidden_units = hidden_units // 2
        add_dense_block(hidden_units)

    # Two regression outputs, one per label column.
    model.add(Dense(units = 2, kernel_initializer = glorot_uniform(seed = 1)))

    # Build the metric list once and reuse it for logging and compiling.
    metrics = get_Reduction_Metric(params['reduction_metric'])
    print('Using Loss: {} \nand Reduction Metric: {}'.format(
        params['loss_function'],
        metrics))

    model.compile(
        optimizer = params['optimizer'](lr = lr_normalizer(params['lr'], params['optimizer'])),
        loss = params['loss_function'],
        metrics = metrics
    )

    print('Monitor: {}'.format(params['monitor_value']))

    checkpointer = callbacks.ModelCheckpoint(
        filepath = _LOG_DIR + 'Best_Weights_FC_{}.hdf5'.format(_COUNTER),
        monitor = params['monitor_value'],
        verbose = 1,
        save_best_only = True,
        mode = 'min'
    )

    startTime = datetime.now()
    # Pass the tuned batch size explicitly. The arrays are already truncated to a
    # whole number of batches, so one epoch covers samples * 0.8 // batch_size steps
    # without relying on Keras inferring the batch size from steps_per_epoch.
    out = model.fit(
        x = train_features,
        y = train_labels,
        batch_size = params['batch_size'],
        epochs = params['epochs'],
        validation_data = (valid_features, valid_labels),
        callbacks = [checkpointer]
    )
    print("Time taken:", datetime.now() - startTime)

    # The counter numbers the checkpoint file of each Talos round.
    _COUNTER = _COUNTER + 1

    return out, model

# Parameter <a name = "setup"></a><a href = #Top>Up</a>

### GridSearch

#### Hyper Parameter

In [11]:
# Optimizer notes (learning rates as written in the original notes):
#     Adam = RMSprop + Momentum (lr=0.001)
#     Nadam = Adam RMSprop + Nesterov-Momentum (lr=0.002)
#     RMSprop = (lr=0.001)
#     SGD = (lr=0.01)
#     Adagrad

# Search space handed to talos.Scan as `params`; every key maps to a list of candidates.
hyper_parameter = {
    'samples': [20000],
    'epochs': [1],
    'batch_size': [32, 64],
    'optimizer': [Adam],
    'lr': [1, 2, 5],
    'first_neuron': [1024, 2048, 4096],
    'dropout': [0.25, 0.50],
    'activation': ['leakyrelu', 'relu'],
    'hidden_layers': [0, 1, 2, 3, 4],
    # From here on, each list should contain exactly one entry
    # (the configuration cells read only element [0] of these):
    # ===============================================================
    'label_type': ['Angular'], # Stereographic, Angular, Normalized
    'loss_function': ['mean_squared_error'], # circular_mse
    'reduction_metric': ['custom_mae'], # tf.keras.metrics.MeanAbsoluteError(), circular_mae, custom_mae, mean_squared_error
    'monitor_value': ['val_custom_mae'] # val_custom_mae, val_mean_absolute_error, val_circular_mae, val_loss
}

In [12]:
def get_params():
    """Return the hyper-parameter search space of this notebook.

    The previous body returned the undefined name `p`, so any call raised a
    NameError; `hyper_parameter` is the only parameter dictionary defined here.
    """
    return hyper_parameter

### Dateisystem

In [13]:
# Identifiers that make up the output directory and file names.
_RUN = 'SYNTH'
_LOSS = 'MSE'
_DATASET_NAME = '201019_2253_final'#'2020-05-28'
_DEVICE = 'GeForce_RTX_2080_Ti' #"/device:GPU:1" #'TITAN_GPU1'

storage = OutputDirectory.SSD # 'fast_output' if ssd storage may suffice, 'output' otherwise

#APPENDIX = 'Stereographic'

#FUNCTION_OVERRIDE = ['mean_squared_error', [custom_mae], 'val_custom_mae'] # None, or e. g. ['mean_squared_error', [circular_mae], 'val_circular_mae']

# Select the label CSV that matches the configured label type.
if hyper_parameter['label_type'][0] == 'Stereographic':
    _CSV_FILE_NAME = 'images_synthetisch_stereographic.csv'
    _STEREOGRAPHIC = True
elif hyper_parameter['label_type'][0] == 'Angular':
    _CSV_FILE_NAME = 'labels_ks.csv' #'images_synthetisch.csv'
    _STEREOGRAPHIC = False
elif hyper_parameter['label_type'][0] == 'Normalized':
    _CSV_FILE_NAME = 'images_synthetisch_normalized.csv'
    _STEREOGRAPHIC = False
else:
    # The former `assert(True, ...)` could never fail, which left _CSV_FILE_NAME
    # undefined and postponed the error to a later cell.
    raise ValueError('LabelType Invalid: {!r}'.format(hyper_parameter['label_type'][0]))

In [14]:
# When True, prepare_data builds the training ImageDataGenerator with shift, zoom
# and brightness augmentation; otherwise the images are only rescaled.
_USE_DATA_AUGMENTATION = False

In [15]:
# Names and input locations derived from the dataset / run identifiers.
_MODEL_NAME = '{}_Optimierung_Hyperparameter_v{}'.format(_DATASET_NAME, _RUN)
_IMAGE_DIR = '..\\..\\data_generation\\dataset\\{}\\'.format(_DATASET_NAME) # '..\\dataset\\{}\\'.format(_DATASET_NAME)
_CSV_FILE = _IMAGE_DIR + _CSV_FILE_NAME

# Round counter; grid_model_base uses it to number the checkpoint files.
_COUNTER = 0

_note = '_Custom-MAE'

# Output locations for checkpoints, cached features and the results CSV.
_NET_DIR = '{}_Regression_{}\\{}_{}_Base{}\\'.format(_RUN, _LOSS, _DATASET_NAME, hyper_parameter['label_type'][0], _note)
_LOG_DIR = '..\\{}\\{}'.format(output_path[storage], _NET_DIR)

_RESULTS_FILE = '\\..\\{}_{}_Base{}_Results.csv'.format(_DATASET_NAME, hyper_parameter['label_type'][0], _note)

# Ask for confirmation before reusing an existing output directory; create it otherwise.
if os.path.exists(_LOG_DIR):
    input('Directory >>| {} |<< existiert bereits. Fortsetzen auf eigene Gefahr! (Weiter mit Enter)'.format(_LOG_DIR))
else:
    os.makedirs(_LOG_DIR)

# Leave an empty marker file named after the device in the output directory.
with open(_LOG_DIR + '{}'.format(_DEVICE), "a+") as device_file:
    pass

# Ausführung GridSearch mit Talos <a name = "Base"></a><a href = #Top>Up</a>

In [16]:
# Placeholder arrays: talos.Scan requires x / y, but grid_model_base ignores them
# and loads its own data via prepare_data.
import numpy as np
dummy_x = np.empty((1, 2, 3, 224, 224))
dummy_y = np.empty((1, 2))

# Run the whole grid search pinned to the second GPU.
with tf.device('/device:GPU:1'):
    
        t = ta.Scan(
            x = dummy_x,
            y = dummy_y,
            model = grid_model_base,
            params = hyper_parameter,
            experiment_name = '{}'.format(_DATASET_NAME),
            #shuffle=False,
            reduction_metric = hyper_parameter['reduction_metric'][0],
            disable_progress_bar = False,
            print_params = True,
            clear_session = True,
            save_weights = False
        )
        

# Persist the per-round results; _RESULTS_FILE starts with '\\..\\', so the CSV is
# written next to (not inside) the log directory.
t.data.to_csv(_LOG_DIR + _RESULTS_FILE, index = True)

  0%|                                                                                          | 0/360 [00:00<?, ?it/s]

{'activation': 'leakyrelu', 'batch_size': 32, 'dropout': 0.25, 'epochs': 1, 'first_neuron': 1024, 'hidden_layers': 0, 'label_type': 'Angular', 'loss_function': 'mean_squared_error', 'lr': 1, 'monitor_value': 'val_custom_mae', 'optimizer': <class 'tensorflow.python.keras.optimizer_v2.adam.Adam'>, 'reduction_metric': 'custom_mae', 'samples': 20000}
..\..\data_generation\dataset\201019_2253_final\labels_ks.csv
Y-Col: ['Elevation', 'Azimuth']
Found 16000 validated image filenames.
Found 4000 validated image filenames.
Creating bottleneck features...


UnknownError:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node vgg16/block1_conv1/Conv2D (defined at <ipython-input-8-4e2f6440d433>:7) ]] [Op:__inference_distributed_function_548]

Function call stack:
distributed_function


# Copy best 10 Results to NAS if SSD Directory was selected

In [None]:
# Load the results CSV written by the grid search and rank the rounds by the
# monitored value (ascending, so the smallest value comes first).
best_results = _LOG_DIR + '..\\{}_{}_Base{}_Results.csv'.format(_DATASET_NAME, hyper_parameter['label_type'][0], _note)
df = (
    pd.read_csv(best_results)
    .drop(columns = ['round_epochs', 'samples', 'epochs'])
    .sort_values(hyper_parameter['monitor_value'][0])
)
print('Displaying: {}'.format(best_results))
df.head(10)

In [None]:
def copy_base_directory(src, dst, data, symlinks = False, ignore = None):
    """Copy the relevant training artefacts from `src` to `dst`.

    Sub-directories are copied completely. Plain files are copied only if they
    do not yet exist in `dst` and their name contains '_Bottleneck_', '_GPU' or
    the checkpoint suffix '_FC_<idx>.hdf5' of one of the first ten rows of
    `data`. Finally the results CSV (`_RESULTS_FILE`, relative to `src`) is
    copied unless it already exists at the destination.

    Args:
        src: source directory (expected to end with a path separator).
        dst: destination directory (expected to end with a path separator).
        data: DataFrame sorted best-first; its first ten index labels select
            the checkpoints to keep.
        symlinks: forwarded to `shutil.copytree`.
        ignore: forwarded to `shutil.copytree`.
    """
    maxLen = 0

    def report(message, end = '\r'):
        # Pad to the longest message so far so that '\r' overwrites it completely.
        nonlocal maxLen
        maxLen = max(maxLen, len(message))
        print(message + ' ' * (maxLen - len(message)), end = end)

    # Checkpoint file-name suffixes of the ten best rounds.
    wanted_networks = ['_FC_{}.hdf5'.format(idx) for idx in data.head(10).index]

    if not os.path.exists(dst):
        # Report the directory that is actually created (was: src).
        report('Creating Path: {}'.format(dst))
        os.makedirs(dst)

    for item in os.listdir(src):

        s = os.path.join(src, item)
        d = os.path.join(dst, item)

        if os.path.isdir(s):
            report('Copying Directory: {}'.format(s))
            shutil.copytree(s, d, symlinks, ignore)

        elif not os.path.exists(d):
            # Decide once per file; the old loop re-copied a matching file for
            # every index that satisfied the condition.
            wanted = (
                '_Bottleneck_' in item
                or '_GPU' in item
                or any(net in item for net in wanted_networks)
            )
            if wanted:
                report('Copying File: {}'.format(s))
                shutil.copy2(s, d)

        time.sleep(.1)

    src_csv = src + _RESULTS_FILE
    dst_csv = dst + _RESULTS_FILE
    if not os.path.exists(dst_csv):
        report('Copying File: {}'.format(src_csv))
        shutil.copy2(src_csv, dst_csv)

    report('Coyping... Done', end = '\n')

    
def delete_directory(src, terminator = '\n'):
    """Delete the directory tree or single file at `src` and print the progress.

    Args:
        src: path of the directory or file to remove; any other path is left alone.
        terminator: line ending printed after the final 'Deleting... Done' message.

    An OSError during deletion is reported on its own line and ends the function
    without printing the final message.
    """
    widest = 0

    def emit(text, end):
        # Pad to the widest message so far so '\r' output overwrites cleanly.
        nonlocal widest
        widest = max(widest, len(text))
        print(text.ljust(widest), end = end)

    try:
        emit('Deleting {}'.format(src), '\r')

        if os.path.isdir(src):
            shutil.rmtree(src)
        elif os.path.isfile(src):
            os.remove(src)

    except OSError as e:
        emit('Error: {} : {}'.format(src, e.strerror), '\n')
        return

    emit('Deleting... Done', terminator)

    
def copy_base_training(src, dst):
    """Copy the best runs from `src` to `dst`, then remove the source data.

    Uses the notebook-level DataFrame `df` to select what is copied. Afterwards
    the source directory and its results CSV are deleted, and the parent
    directory too if it is empty by then.
    """
    copy_base_directory(src, dst, df)

    parent_dir = src + '..\\'
    for target in (src, src + _RESULTS_FILE):
        delete_directory(target, terminator = '\r')

    if not os.listdir(parent_dir):
        delete_directory(parent_dir, terminator = '\r')

In [None]:
# Runs stored on the SSD are moved to the regular output directory afterwards.
if storage == OutputDirectory.SSD:
    _COPY_DIR = '..\\output\\{}'.format(_NET_DIR)
    copy_base_training(_LOG_DIR, _COPY_DIR)