<a href="https://colab.research.google.com/github/MMoronto/ml-unstructured-data-projects/blob/master/Tile_based_classification_using_Sentinel_2_and_EuroSAT_data_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##**Intro**

This workflow explores the process of training a `Convolutional Neural Network (CNN)` with Keras based on the benchmark dataset EuroSAT. This notebook contains notes I've taken as I work through the example workflow presented in the AI for Earth monitoring MOOC on Futurelearn.


##**Machine-Learning Algorithm**

This example develops a `Sequential Convolutional Neural Network (CNN)` with TF Keras.

##**Data**
The model is trained on the EuroSAT benchmark dataset, which is based on Sentinel-2 satellite images and consists of 27,000 labeled and geo-referenced images. The dataset provides information on the following ten land cover/land use classes:
* `Annual Crop`
* `Forest`
* `Herbaceous Vegetation`
* `Highway`
* `Industrial`
* `Pasture`
* `Permanent Crop`
* `Residential`
* `River`
* `Sea Lake`

The benchmark dataset can be used to detect `land cover / land use changes`. The geo-referenced dataset EuroSAT is publicly accessible here: https://github.com/phelber/eurosat

## Notebook Outline
* 1 - Load the EuroSAT benchmark dataset as input data
* 2 - Create training and test subsets from input data
* 3 - Define the Convolutional Neural Network architecture
* 4 - Fit (train) the convolutional neural network (CNN)
* 5 - Evaluate the performance of the CNN model with a confusion matrix


**Import Libraries**

In [None]:
## Begin S3FS Import Snippet ##
# The imports below are performed with the start-up directory taken off
# sys.path and with the process temporarily moved to /home/jovyan; both are
# restored afterwards, even when one of the imports fails.
import os
import sys

s3_home = os.getcwd()
try:
    sys.path.remove(s3_home)  # Remove the S3 from the $path
except ValueError:
    pass  # s3_home was not on sys.path, so there is nothing to remove

current_dir = os.getcwd()

os.chdir('/home/jovyan')  # Temporarily move to another directory

try:
    # Begin imports #
    from tensorflow import keras
    from tensorflow.keras.preprocessing.image import ImageDataGenerator
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
    from tensorflow.keras.layers import Conv2D, MaxPooling2D
    from tensorflow.keras.callbacks import ModelCheckpoint

    import glob

    import numpy as np
    import pandas as pd
    import seaborn as sns

    from osgeo import gdal_array
    from matplotlib import pyplot as pyplot

    from sklearn import metrics
    from sklearn.model_selection import train_test_split

    import zipfile
    # end imports #
finally:
    os.chdir(current_dir)  # go back to your previous dir

    sys.path.append(s3_home)  # restore the s3 root in the $path

## end s3fs import snippet ##



In [2]:
# !unzip "drive/MyDrive/Land Classification/S2_Tile_based_classification.zip" -d "drive/MyDrive/Land Classification/"

Define helper functions

from_folder_to_stack

In [17]:
import numpy as np

# Sentinel-2 band codes per pixel resolution, listed in the order in which the
# bands appear along the last axis of the returned stacks.
_BAND_ORDER_10M = ['02', '03', '04', '08']
_BAND_ORDER_20M = ['05', '06', '07', '11', '12', '8A']
_BAND_ORDER_60M = ['01', '09', '10']


def _names_band(filename, band_codes, other_resolutions=()):
  """Return True if `filename` names one of `band_codes` at its own resolution.

  A file matches when "B<code>" occurs in it (not at position 0, mirroring the
  original `find(...) > 0` test) and no "B<code>_<resolution>" marker for any
  of `other_resolutions` occurs anywhere in the name.
  """
  if not any(filename.find("B" + code) > 0 for code in band_codes):
    return False
  return not any(
      ("B%s_%s" % (code, resolution)) in filename
      for code in band_codes
      for resolution in other_resolutions
  )


def _sorted_stack_or_placeholder(label, arrays, codes, band_order):
  """Stack `arrays` and sort them with `stack_sort`, or return 0 if empty."""
  if not arrays:
    # Nothing was loaded for this resolution (bands disabled or not present in
    # the scanned folder). An empty array cannot be unpacked into
    # (bands, rows, cols) by stack_sort, so hand back the placeholder instead.
    print('[AI4EO_MOOC]_log: No %s bands loaded, returning 0 for this stack' % label)
    return 0
  print('[AI4EO_MOOC]_log: Sorting stack %s...' % label)
  return stack_sort(np.asarray(arrays), codes, band_order)


def from_folder_to_stack(
    safe_path,
    data_bands_20m=True,
    data_bands_60m=True,
    ):
  """Transform a .SAFE product into three band stacks (10m, 20m and 60m).

  Input:
    safe_path: the path of the .SAFE folder (with or without a trailing
      path separator);
    data_bands_20m: if True, the function computes the stack using the
      Sentinel-2 bands with 20m pixel resolution (default=True);
    data_bands_60m: if True, the function computes the stack using the
      Sentinel-2 bands with 60m pixel resolution (default=True);
  Output:
    stack_10m: stack with the following bands (B02, B03, B04, B08)
    stack_20m: stack with the following bands (B05, B06, B07, B11, B12, B8A)
    stack_60m: stack with the following bands (B01, B09, B10)

    Each stack is whatever `stack_sort` returns for the loaded bands. A stack
    for which no band was loaded (resolution disabled, or no matching .jp2
    file in the scanned folder) is returned as the integer 0.
  Raises:
    IndexError: if `safe_path` contains no `GRANULE/*` entry.

  Note: when the granule folder name contains "L2A", only `IMG_DATA/R10m/` is
  scanned, so the 20m and 60m stacks come back as 0 for such products.
  """
  # os.path.join makes the lookup independent of a trailing separator.
  granule_dirs = glob.glob(os.path.join(safe_path, 'GRANULE', '*'))
  granule_dir = granule_dirs[0]

  if "L2A" in granule_dir:
    image_dirs = [granule_dir + '/IMG_DATA/R10m/']
  else:
    image_dirs = [granule_dir + '/IMG_DATA/']

  arrays_10m, codes_10m = [], []
  arrays_20m, codes_20m = [], []
  arrays_60m, codes_60m = [], []

  for image_dir in image_dirs:
    print("[AI4EO_MOOC]_log: Loading .jp2 images in %s" % image_dir)
    jp2_names = [f for f in os.listdir(image_dir) if f.endswith('.jp2')]

    print("[AI4EO_MOOC]_log: Reading .jp2 files in %s" % image_dir)
    for jp2_name in jp2_names:
      # Characters 24-25 are taken as the band code; this presumably relies on
      # the standard Sentinel-2 image file naming -- TODO confirm.
      band_code = jp2_name[24:26]

      if data_bands_60m and _names_band(jp2_name, _BAND_ORDER_60M):
        print("[AI4EO_MOOC]_log: Using .jp2 image: %s" % jp2_name)
        arrays_60m.append(gdal_array.LoadFile(image_dir + jp2_name))
        codes_60m.append(band_code)

      # Skip 60m resamplings of the 20m bands.
      if data_bands_20m and _names_band(jp2_name, _BAND_ORDER_20M, ("60m",)):
        print("[AI4EO_MOOC]_log: Using .jp2 image: %s" % jp2_name)
        arrays_20m.append(gdal_array.LoadFile(image_dir + jp2_name))
        codes_20m.append(band_code)

      # Skip 20m and 60m resamplings of the 10m bands.
      if _names_band(jp2_name, _BAND_ORDER_10M, ("20m", "60m")):
        print("[AI4EO_MOOC]_log: Using .jp2 image: %s" % jp2_name)
        arrays_10m.append(gdal_array.LoadFile(image_dir + jp2_name))
        codes_10m.append(band_code)

  stack_10m_final_sorted = _sorted_stack_or_placeholder(
      '10m', arrays_10m, codes_10m, _BAND_ORDER_10M)
  stack_20m_final_sorted = _sorted_stack_or_placeholder(
      '20m', arrays_20m, codes_20m, _BAND_ORDER_20M)
  stack_60m_final_sorted = _sorted_stack_or_placeholder(
      '60m', arrays_60m, codes_60m, _BAND_ORDER_60M)

  return stack_10m_final_sorted, stack_20m_final_sorted, stack_60m_final_sorted

stack_sort

In [18]:
def stack_sort(stack_in, lst_code, sorted_list):
  """Reorder the bands of a band-first stack and return it band-last.

  Input:
    stack_in: 3-D array indexed as (band, row, column);
    lst_code: band codes of `stack_in`, one per band, in the order in which
      the bands are stored along the first axis;
    sorted_list: band codes in the desired output order;
  Output:
    stack_sorted: uint16 array indexed as (row, column, band) whose bands
      follow the order of `sorted_list`. Codes of `sorted_list` that are not
      in `lst_code` are skipped, so the last axis holds one entry per matched
      code; bands whose code is not in `sorted_list` are left out.
  Raises:
    ValueError: if `stack_in` is not 3-dimensional.
  """
  _, n_rows, n_cols = stack_in.shape

  available_codes = list(lst_code)
  # Position in stack_in of every requested band, in the requested order.
  # index() returns the first occurrence, as the original linear search did.
  band_order = np.array(
      [available_codes.index(code) for code in sorted_list if code in available_codes],
      dtype=np.intp,
  )

  stack_sorted = np.zeros((n_rows, n_cols, len(band_order)), dtype=np.uint16)

  print('[AI4EO_MOOC]_log: sorted list:', sorted_list)
  print('[AI4EO_MOOC]_log: bands:', band_order)
  for out_idx, src_idx in enumerate(band_order):
    stack_sorted[:, :, out_idx] = stack_in[src_idx, :, :]

  return stack_sorted