This notebook repeats all the steps of `train_test_model.ipynb` in a cross-validation framework performed over the train dataset.

Before running this notebook, the train dataset TFRecord file must be split into k partitions: follow the instructions in `split_train_set_for_cross_validation.ipynb` in the folder `convert_to_TFRecord`.

# Import required libraries and functions

Navigate to the working directory.

In [1]:
%cd
%cd hyperspectral-cnn-soil-estimation

/home/microsat
/home/microsat/hyperspectral-cnn-soil-estimation


Import libraries and set random seeds.

In [2]:
# Plotting / tabular helpers and standard-library utilities.
import matplotlib.pyplot as plt
import pandas as pd
import os, zipfile, logging
# Disable Python logging calls of severity WARNING and below, and set TensorFlow's
# native log-level variable before TensorFlow is imported.
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# Seed Python's and NumPy's global random generators.
import random as rn
rn.seed(2)
import numpy as np
np.random.seed(3)
SEED = 7231
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here probably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

#Disable GPU
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

#Enable fully deterministic operations.
#This option slows down the training process remarkably, but it makes the results
#deterministic across different runs of the code.
#Note that results might still differ on different hardware (CPU vs GPU) and on different computers.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# TensorFlow is imported only after the environment variables above have been set.
import tensorflow as tf
from dataset_processing import *
from efficientnet_lite import EfficientNetLiteB0mod
import tensorflow.keras.layers as layers

# Seed TensorFlow's global random generator.
tf.random.set_seed(1)

# Shorthand passed as num_parallel_calls / prefetch buffer size in the tf.data pipelines below.
AUTO = tf.data.AUTOTUNE

# List dataset partitions

In [3]:
folder_path = 'dataset/'


def _natural_sort_key(path):
    """Return a sort key that orders digit runs numerically.

    Plain string sorting would place 'train_cv_split_10' before
    'train_cv_split_2'. The position of a file in the sorted list is later
    used as the fold index in model and CSV names, so that ordering would make
    fold names disagree with partition numbers once there are more than 10
    partitions.

    Each chunk becomes a (is_text, value) pair so that integers and strings are
    never compared with each other directly.
    """
    import re  # local import: only this helper needs it
    return [(not chunk.isdecimal(), int(chunk) if chunk.isdecimal() else chunk)
            for chunk in re.split(r'(\d+)', path)]


# Collect every cross-validation partition found in the dataset folder,
# ordered by partition number.
file_list = sorted(
    (folder_path + name
     for name in os.listdir(folder_path)
     if name.startswith("train_cv_split")),
    key=_natural_sort_key,
)
print(file_list)

['dataset/train_cv_split_0.record', 'dataset/train_cv_split_1.record', 'dataset/train_cv_split_2.record', 'dataset/train_cv_split_3.record', 'dataset/train_cv_split_4.record']


# Training and inference pipeline

Define learning rate scheduling function.

In [4]:
def lr_scheduler(epoch, start_lr):
    """Learning-rate schedule: a one-epoch warm-up followed by cosine decay.

    For ``epoch < 1`` the rate is interpolated linearly from 0.001 towards
    ``start_lr``. From epoch 1 onwards ``start_lr`` is multiplied by a
    cosine-decay factor that reads the module-level globals ``decay_steps``
    and ``alpha`` (both must be defined before this branch is reached).

    Args:
        epoch: index of the current epoch.
        start_lr: learning rate reached at the end of the warm-up.

    Returns:
        The learning rate for ``epoch``.
    """
    # Warm-up phase.
    if epoch < 1:
        return (start_lr - 0.001) / 1 * epoch + 0.001

    # Cosine-decay phase: the factor equals `alpha` when the cosine term reaches 0.
    cosine_term = 0.5 * (1 + tf.cos(np.pi * epoch / (decay_steps)))
    scale = (1 - alpha) * cosine_term + alpha
    return start_lr * scale

Define a custom metric reflecting the competition scoring.

In [5]:
def custom_metric(y_true, y_pred):
    """Score a batch as the mean, over targets, of MSE divided by a baseline MSE.

    Both tensors are first mapped back to label units using the module-level
    globals ``label_normalization_mode``, ``max_labels``, ``std_labels`` and
    ``mean_labels``.

    Args:
        y_true: normalized ground-truth labels.
        y_pred: normalized predictions.

    Returns:
        Scalar tensor with the averaged, baseline-relative MSE.
    """
    def _to_label_units(values):
        # Mode 0: multiply by max_labels; any other mode: multiply by std_labels and add mean_labels.
        return tf.cond(tf.math.equal(label_normalization_mode, 0),
                       lambda: tf.multiply(values, max_labels),
                       lambda: tf.multiply(values, std_labels) + mean_labels)

    y_true = _to_label_units(y_true)
    y_pred = _to_label_units(y_pred)

    # Per-target reference MSE values used to rescale each target's error.
    baseline_mse = [870.02899169921875, 3828.40234375, 1588.857421875, 0.0677162706851959228515625]
    per_target_mse = tf.reduce_mean((y_true - y_pred) ** 2, axis=0)
    return tf.reduce_mean(per_target_mse / baseline_mse)

In [6]:
#Hyperparameters and naming for the cross-validation run
noise_std = 0.05    #forwarded to augment_train in the training pipeline
epochs = 300
batch_size = 32
target_image_size = 32    #side of the square network input; also forwarded to the augmentation/preprocessing helpers
start_lr = 0.005 
networks_base_name = 'cv_test'    #prefix of every saved model and CSV file name
label_normalization_mode = 0 # 0 for minmax normalization, everything else for standard normalization

###########################################################
#Dataset constants
max_reflectance = 6315    #forwarded to normalize_train_val / normalize_test
test_set_len = 1153    #not referenced by the code in this cell
max_labels = [325., 625., 400., 14.]    #used to de-normalize when label_normalization_mode == 0
std_labels = [29.496254, 61.874084, 39.860474, 0.2602235]    #used together with mean_labels in every other mode
mean_labels = [70.30264, 227.98851, 159.28125, 6.782706]

#Build and compile the neural network once for all
#(the same model object is reused for every fold; its weights are reloaded inside the loop below)
backbone = EfficientNetLiteB0mod(input_shape=(target_image_size, target_image_size, 150),
                                   width_coefficient=0.5,
                                   depth_coefficient=0.5,
                                   dropout_rate=0.1)

#Regression head: 4 outputs, matching the P, K, Mg, pH columns written to the inference CSV files
model = tf.keras.Sequential([backbone,  
                            layers.Flatten(),
                            layers.Dense(4, kernel_initializer=tf.keras.initializers.GlorotUniform(seed=1509))])

#Train on MSE and monitor the competition-style score defined by custom_metric
model.compile(
    optimizer=tf.keras.optimizers.Adam(clipnorm=1.),
    loss='mse',
    metrics=[custom_metric],
  )

#Define final log path
output_csv_log = 'submission_files/full_cv_log_' + networks_base_name + '.csv'

#Run one train/inference cycle per partition.
#Index i denotes the partition used for testing (and excluded from training)
#Therefore cv_test_1 will refer to the model trained on partitions [0,2,..,k] and tested on partition 1
#
#NOTE(review): the model is compiled once, before this loop, and only the network weights are
#reloaded at every fold; the optimizer state therefore looks like it carries over from one fold
#to the next -- confirm this is intended.
for i, file in enumerate(file_list):
  #Reset global seeds
  rn.seed(2)
  np.random.seed(3)
  tf.random.set_seed(1)

  #Initialize the model with the same weights we employed during the competition
  model.load_weights('efficientnet_lite/initialization_weigths.h5')

  #At each iteration leave the i-th file out of the training file list and use it for testing
  train_list = file_list[:i] + file_list[i+1:]
  test_list = file

  #Define train set cardinality (records are counted one at a time, without keeping them in memory)
  train_set_len = sum(1 for _ in load_tf_records(train_list))
  num_train_images = (train_set_len//batch_size)*batch_size

  #Define learning rate scheduling callback
  #alpha and decay_steps are read as globals by lr_scheduler
  alpha = 0.0001
  decay_steps = epochs
  lr_callback = tf.keras.callbacks.LearningRateScheduler(lambda epoch: lr_scheduler(epoch, start_lr), verbose=0)

  #Output names for this fold
  network_name = networks_base_name + str(i)
  saved_model_filepath = 'networks/' + network_name
  h5_filepath = saved_model_filepath + '/'+network_name+'.h5'
  inference_name = 'submission_files/inference_' + network_name + '.csv'

  #Make sure the folder receiving the inference csv exists before the long training run,
  #so that writing the predictions cannot fail on a missing directory after training
  os.makedirs(os.path.dirname(inference_name), exist_ok=True)

  #Dataset preprocessing
  #steps_per_epoch is not passed to fit below; it is kept as a notebook-level variable
  steps_per_epoch = num_train_images//batch_size

  ds = load_tf_records(train_list).map(decode_dataset_train_val, num_parallel_calls=AUTO)
  ds = ds.shuffle(train_set_len, seed=1860)

  #Train
  train_data = ds.map(lambda patch, label, height, width: normalize_train_val(patch, label, height, width, max_reflectance,  max_labels, mean_labels, std_labels, label_normalization_mode), num_parallel_calls=AUTO,deterministic=True).cache()    #normalize train dataset
  train_data = train_data.shuffle(num_train_images, seed=1866)    #shuffle train dataset
  train_data = train_data.map(lambda patch, label, height, width: augment_train(patch, label, height, width,target_image_size,noise_std), num_parallel_calls=AUTO,deterministic=True)
  train_data = train_data.batch(batch_size=batch_size, drop_remainder=True)    #batch train dataset
  train_data = train_data.prefetch(AUTO)    #prefetch train dataset

  #Train the network
  print('Training model', network_name)
  History = model.fit(train_data,
                    epochs=epochs,
                    callbacks=[lr_callback],
                    verbose=0
                    )

  #Export trained model
  model.save(saved_model_filepath)
  model.save(h5_filepath)

  #Perform inference on holdout set
  test_data = load_tf_records(test_list).map(decode_dataset_test, num_parallel_calls=AUTO)

  test_data = test_data.map(lambda filename, image, height, width: normalize_test(filename, image, height, width, max_reflectance), num_parallel_calls=AUTO)    #normalize test dataset
  test_data = test_data.map(lambda filename, image, height, width: preprocess_test(filename, image, height, width,target_image_size), num_parallel_calls=AUTO).batch(1).prefetch(AUTO)    #batch and prefetch test dataset

  #Sample ids are collected in a plain list and converted once at the end,
  #instead of re-allocating a numpy array at every sample
  sample_ids = []

  export_predictions_and_size =  []
  print()
  print('Testing model', network_name)
  print()

  for filename, image, height, width in test_data:
    #The sample id is the file name without its ".npz" extension
    sample_ids.append(int(filename.numpy()[0].decode().replace(".npz", "")))
    predictions = model.predict(image,verbose=0)

    #Map the predictions back to label units
    if label_normalization_mode == 0:
      predictions *= max_labels
    else:
      predictions  = (predictions*std_labels)+mean_labels

    export_predictions_and_size.append(np.concatenate((predictions.reshape([-1]), height.numpy(), width.numpy())))

  filenames = np.array(sample_ids, dtype=int)

  #One csv per fold, indexed by sample id
  partition_log = pd.DataFrame(data=export_predictions_and_size, columns=["P", "K", "Mg", "pH","height","width"])
  partition_log.index = filenames
  partition_log.to_csv(inference_name, index_label="sample_index")

#When training is complete merge the per-fold csv files into a single one.
#Only the files belonging to this run are merged: one per partition in file_list, named with the
#same pattern used for inference_name in the training loop. Scanning the folder by prefix would
#also pick up stale files left by earlier runs (e.g. a run with more folds, or a base name that
#merely starts with networks_base_name).
folder_path = 'submission_files/'
log_list = [folder_path + 'inference_' + networks_base_name + str(fold) + '.csv'
            for fold in range(len(file_list))]

#Read every fold first and concatenate once, rather than growing a DataFrame inside a loop
merged_data = pd.concat([pd.read_csv(path) for path in log_list])

#The first column is the sample index written by the training loop
sorted_data = merged_data.sort_values(by=merged_data.columns[0])
sorted_data.to_csv(output_csv_log, index=False)
print('Cross validation completed')
print('Full log saved in: ', output_csv_log)

Training model cv_test0

Testing model cv_test0

Training model cv_test1

Testing model cv_test1

Training model cv_test2

Testing model cv_test2

Training model cv_test3

Testing model cv_test3

Training model cv_test4

Testing model cv_test4

Cross validation completed
Full log saved in:  submission_files/full_cv_log_cv_test.csv
