In [None]:
#This script uses over 300 GB of RAM

#loading and setting up the data (if already downloaded) takes over 6 hours
#We then use leave-one-region-out cross-validation, and train 28 models
#Each model takes about 10 hours to train using 1 gpu

#overall runtime for 1 gpu = 6 + 28*10 = 286 hours ~ 12 days
#overall runtime for 4 gpus = 6 + 28*3 = 90 hours ~ 4 days

In [None]:
#NOTE!
#this notebook assumes Notebooks/DownloadHiRISE/DownloadHiRISE.ipynb has already been run to download the raw data

In [None]:
import os
import time
import numpy as np
import pandas as pd
import tensorflow
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, History
print("tensorflow version = ",tensorflow.__version__)

from p4tools import io #from https://github.com/michaelaye/p4tools 
from P4_DataHandlers import Load_all_HiRISE_images_crop_and_convert_to_8bits,CreateListOfThreeScales,CreateFullImageMasks
from SemanticSegmentation import model_unet_P4,model_hrnet_P4,P4_SegmenterDataGenerator
from SemanticSegmentation import loss_jaccard_coef_categorical_one_minus,metric_recall,metric_precision,metric_jaccard_coef_categorical_int

#choose which GPUs are visible (ordered by PCI bus id) and prepare multi-GPU training
os.environ.update({
    "CUDA_DEVICE_ORDER": 'PCI_BUS_ID',
    "CUDA_VISIBLE_DEVICES": '0,1,2,3',
})
gpus = tensorflow.config.experimental.list_physical_devices('GPU')
#turn on memory growth for every detected GPU (the loop is a no-op when none are found)
for gpu in gpus:
    tensorflow.config.experimental.set_memory_growth(gpu, True)
#the mirrored strategy is reused later to size the batch and to build the models
strategy = tensorflow.distribute.MirroredStrategy()
print('Number of devices: ' + str(strategy.num_replicas_in_sync))

#recommended path settings:
base_folder = '../../'
data_folder = base_folder + 'Data/Images/HiRISE/'
#set this to false unless you have 400 GB of RAM!
DoScaleAug = True


In [None]:
#folder holding the cached 8-bit image+mask arrays, one .npy file per observation id
#(base_folder already ends with '/', so no extra separator is needed)
SaveFolder = base_folder+'Data/Images/HiRISE_8bit_and_P4_mask/'
#Preload=True : read the cached .npy files written by an earlier run of the other branch
#Preload=False: rebuild the cache from the raw HiRISE downloads (slow, lots of RAM)
Preload=True
if not Preload:
    #load all the images and convert from 10 bits maximum bit depth per channel to 8. Then crop to the labelled region
    #Takes about 70 GB of RAM
    t0 = time.time()
    ImageAndMasksList,OBSID_List = Load_all_HiRISE_images_crop_and_convert_to_8bits(data_folder) #result has 3 channels (RGB)
    t1 = time.time()
    print('Number of images loaded = ',len(ImageAndMasksList))
    print('time to load and convert all images = ',t1-t0) #takes about 2 hours
    #create "ground truth" segmentation masks.
    t0 = time.time()
    ImageAndMasksList =  CreateFullImageMasks(ImageAndMasksList) # Result has 4 channels ( RGB and the 4th dimension in the channels axis is the masks)
    t1 = time.time()
    print('time to create all masks = ',t1-t0)#takes 2-3 hours
    #makedirs with exist_ok creates missing parents and tolerates an existing folder
    os.makedirs(SaveFolder, exist_ok=True)
    #NOTE(review): the training loop indexes these lists with metadata_df row labels, so this
    #branch presumably returns images in metadata order - confirm in P4_DataHandlers
    for obs_id, image_and_mask in zip(OBSID_List, ImageAndMasksList):
        np.save(SaveFolder+obs_id+'.npy', image_and_mask)
else:
    t0 = time.time()
    #assumes the first if branch above has already been run
    metadata_df = io.get_meta_data()
    ImageAndMasksList=[]
    OBSID_List=[]
    #load in metadata row order so list position i corresponds to metadata row i
    for obs_id in metadata_df['OBSERVATION_ID']:
        OBSID_List.append(obs_id)
        ImageAndMasksList.append(np.load(SaveFolder+obs_id+'.npy'))
    t1 = time.time()
    print('time to load pre-converted images = ',t1-t0)  #takes less than 5 minutes

#prepare data for training models (create 3 scales for each, to speed up training. Uses lots of RAM)
#DoScaleAug comes from the setup cell; it is deliberately NOT reassigned here so that
#switching it off there actually skips the RAM-hungry scale augmentation
if DoScaleAug:
    t0 = time.time()
    ImageAndMasksLists = CreateListOfThreeScales(ImageAndMasksList) 
    t1 = time.time()
    print("time to createlist of three scales for scale augmentation = ",t1-t0) #takes 1.5-2 hours
else:
    #single-scale case: wrap in a list so downstream code can always index ImageAndMasksLists[0]
    ImageAndMasksLists=[ImageAndMasksList]

In [None]:
#params
num_classes=2
num_channels_in_input=3
base_filters=16
patch_height = 512
patch_width = 512
init_lr = 0.03
epochs = 80
#epoch settings read by lr_schedule to step the learning rate down
FirstEpochStep = 40
SecondEpochStep = 30
wd=1e-4
#batch size scales with the number of replicas in the distribution strategy
batch_size = 5*strategy.num_replicas_in_sync

#output paths (built from base_folder, like the data paths)
#os.makedirs creates missing parent folders (os.mkdir would fail on the nested
#Models/SemanticSegmenterCNN path) and exist_ok makes re-running this cell safe
TrainingResultsPath=base_folder+'Data/TrainingResults/'
os.makedirs(TrainingResultsPath, exist_ok=True)
TrainedModelsPath=base_folder+'Data/Models/SemanticSegmenterCNN/'
os.makedirs(TrainedModelsPath, exist_ok=True)

#learning rate schedule
def lr_schedule(epoch, *, base_lr=None, first_step=None, second_step=None):
    """Return the learning rate to use for a given (0-based) epoch.

    The rate is stepped down twice:
      * epochs [0, first_step)                        -> base_lr
      * epochs [first_step, first_step + second_step) -> base_lr * 1e-1
      * epochs >= first_step + second_step            -> base_lr * 1e-2

    The previous version compared the epoch against SecondEpochStep before
    FirstEpochStep; with FirstEpochStep=40 and SecondEpochStep=30 the 10x step
    was unreachable and the rate dropped 100x at epoch 30.
    NOTE(review): second_step is interpreted as the number of epochs spent at
    the intermediate rate (40 + 30 + 10 = 80 epochs) - confirm this was the intent.

    Parameters
    ----------
    epoch : int
        Epoch index passed in by the LearningRateScheduler callback.
    base_lr, first_step, second_step : optional, keyword-only
        Overrides for the module-level init_lr, FirstEpochStep and
        SecondEpochStep. They are keyword-only so the function still accepts
        exactly one positional argument, as before.

    Returns
    -------
    float
        The learning rate for this epoch (also printed).
    """
    if base_lr is None:
        base_lr = init_lr
    if first_step is None:
        first_step = FirstEpochStep
    if second_step is None:
        second_step = SecondEpochStep

    lr = base_lr
    if epoch >= first_step + second_step:
        lr *= 1e-2
    elif epoch >= first_step:
        lr *= 1e-1
    print('Learning rate: ', lr)
    return lr

#attach a region (roi) name to every row of the metadata table
metadata_df = io.get_meta_data()
region_names_df = io.get_region_names().set_index('obsid')
#region names that are filled in by hand for three observation ids
for obsid, manual_roi in (('ESP_012620_0975', 'Buffalo'),
                          ('ESP_012277_0975', 'Buffalo'),
                          ('ESP_012348_0975', 'Taichung')):
    region_names_df.at[obsid,'roi_name'] = manual_roi
#look up each observation's region and store it in a 'roi_name' column of metadata_df
#(the label/id pairs are materialised first because the loop adds a column to metadata_df)
for row_label, obsid in list(zip(metadata_df.index, metadata_df['OBSERVATION_ID'])):
    metadata_df.at[row_label,'roi_name'] = region_names_df.at[obsid,'roi_name']

#loop over each region, and leave it out of training for a model.
#NOTE(review): the left-out region is intended for validation, but no validation data is
#passed to fit() below - evaluation on the held-out region presumably happens elsewhere
#NOTE(review): one model per region is built in the same process; if memory grows across
#iterations consider tensorflow.keras.backend.clear_session() - verify with MirroredStrategy
UniqueP4Regions = metadata_df['roi_name'].unique()
ModelCount = 0
for ModelCount, ToLeaveOut in enumerate(UniqueP4Regions, start=1):
    #select data for training: every image whose region is not the one left out
    print('Training model ',str(ModelCount),' of ',str(len(UniqueP4Regions)),'; leaving out region: ',ToLeaveOut)
    TrainImages_df = metadata_df[metadata_df['roi_name'] != ToLeaveOut]
    #metadata row labels are used as positions into the image lists
    #(assumes metadata_df has a default 0..N-1 index - TODO confirm)
    train_rows = TrainImages_df.index.values
    #three scale lists with scale augmentation, otherwise just the single full-scale list
    scale_lists = ImageAndMasksLists[:3] if DoScaleAug else ImageAndMasksLists[:1]
    TrainImagesList = [[scale_list[i] for i in train_rows] for scale_list in scale_lists]
    train_data_generator = P4_SegmenterDataGenerator(TrainImagesList,batch_size,patch_height,patch_width)
    lr_scheduler = LearningRateScheduler(lr_schedule)
    
    #define the model inside the strategy scope so its variables are mirrored across GPUs
    with strategy.scope():
        model = model_hrnet_P4(num_channels_in_input,num_classes,base_filters,wd)
        #'learning_rate' replaces the deprecated 'lr' keyword; 'decay=0' is dropped because
        #it is the default and newer Keras optimizers reject the 'decay' keyword
        model.compile(loss=loss_jaccard_coef_categorical_one_minus,
                  optimizer =SGD(learning_rate=init_lr, momentum=0.9, nesterov=False),
                  metrics=[metric_precision,metric_recall,metric_jaccard_coef_categorical_int])
    #fit the model; fit() returns the History object, so no explicit History callback is needed
    history = model.fit(train_data_generator,
                              epochs=epochs,
                              verbose=1, 
                              workers=4,
                              callbacks=[lr_scheduler],
                              max_queue_size=4
                             )
    
    #save outputs: model and training stats (one row per epoch)
    TrainingInfo_df=pd.DataFrame.from_dict(history.history)
    csv_file_name=TrainingResultsPath+'Training_HiRISE_segmenter_leave_out_' + ToLeaveOut+'_finished.csv'
    TrainingInfo_df.to_csv(csv_file_name)

    #save final model. Due to using parallel GPUs, much faster to save the weights and define a new model on CPU
    temp_file_name=TrainedModelsPath+'HiRISE_segmenter_leave_out_'+ToLeaveOut+'final_model_weights_only_temp.h5'
    model.save_weights(temp_file_name)
    #rebuild the same architecture outside the strategy scope and copy the trained weights in
    inference_model=model_hrnet_P4(num_channels_in_input,num_classes,base_filters,wd)
    inference_model.load_weights(temp_file_name)
    final_model_name=TrainedModelsPath+'HiRISE_segmenter_leave_out_'+ToLeaveOut+'final.h5'
    inference_model.save(final_model_name,include_optimizer=False) 
    #the weights-only file was just a hand-over vehicle; remove it
    os.remove(temp_file_name)