## Leave-one-out spatial cross-validation for HM2

## 1. Load the libraries for calculation

In [1]:
# import the libraries
import ee
import pandas as pd
import os
import numpy as np
import random
from random import sample
import itertools 
import geopandas as gpd
from sklearn.metrics import r2_score
from termcolor import colored # this is allocate colour and fonts type for the print title and text
from IPython.display import display, HTML

In [2]:
# Check the working directory of the local drive: the grid search result tables are loaded relative to it
# os.getcwd()

In [3]:
# Initialize the Earth Engine (ee) API connection
ee.Initialize()

## 2. Prepare the composite for calculation

In [4]:
# Names of the predictor properties handed to the classifier.
# The entries are kept in their original order; the group comments below
# are only reading aids.
predictorVector = [
    # aridity
    'Aridity_Index',
    # CHELSA climate variables
    'CHELSA_Annual_Mean_Temperature',
    'CHELSA_Annual_Precipitation',
    'CHELSA_Isothermality',
    'CHELSA_Max_Temperature_of_Warmest_Month',
    'CHELSA_Mean_Diurnal_Range',
    'CHELSA_Mean_Temperature_of_Coldest_Quarter',
    'CHELSA_Mean_Temperature_of_Driest_Quarter',
    'CHELSA_Mean_Temperature_of_Warmest_Quarter',
    'CHELSA_Mean_Temperature_of_Wettest_Quarter',
    'CHELSA_Min_Temperature_of_Coldest_Month',
    'CHELSA_Precipitation_Seasonality',
    'CHELSA_Precipitation_of_Coldest_Quarter',
    'CHELSA_Precipitation_of_Driest_Month',
    'CHELSA_Precipitation_of_Driest_Quarter',
    'CHELSA_Precipitation_of_Warmest_Quarter',
    'CHELSA_Precipitation_of_Wettest_Month',
    'CHELSA_Precipitation_of_Wettest_Quarter',
    'CHELSA_Temperature_Annual_Range',
    'CHELSA_Temperature_Seasonality',
    # water table
    'Depth_to_Water_Table',
    # EarthEnv topography
    'EarthEnvTopoMed_Eastness',
    'EarthEnvTopoMed_Elevation',
    'EarthEnvTopoMed_Northness',
    'EarthEnvTopoMed_ProfileCurvature',
    'EarthEnvTopoMed_Roughness',
    'EarthEnvTopoMed_Slope',
    # depth to bedrock
    'SG_Absolute_depth_to_bedrock',
    # WorldClim2 solar radiation and wind speed
    'WorldClim2_SolarRadiation_AnnualMean',
    'WorldClim2_WindSpeed_AnnualMean',
    # EarthEnv cloud cover
    'EarthEnvCloudCover_MODCF_interannualSD',
    'EarthEnvCloudCover_MODCF_intraannualSD',
    'EarthEnvCloudCover_MODCF_meanannual',
    # EarthEnv aspect
    'EarthEnvTopoMed_AspectCosine',
    'EarthEnvTopoMed_AspectSine',
    # SG soil properties, 0-100 cm
    'SG_Clay_Content_0_100cm',
    'SG_Coarse_fragments_0_100cm',
    'SG_Sand_Content_0_100cm',
    'SG_Silt_Content_0_100cm',
    'SG_Soil_pH_H2O_0_100cm',
    # tree cover
    'PresentTreeCover',
]

# Dependent (response) variable to model
varToModel = 'SpawnDensity'

## 3. Spatial cross validation for each Biome

### 3.1 define the functions needed for spatial CV

In [5]:
# Single buffer size used for the blocked cross validation: 550 km, written out as 550000
buffer_sizes = 550 * 1000

# Core function of the spatial cross validation:
# Blocked Leave One Out (BLOO) cross-validation of a single test feature
def BLOOcv(f):
    """Predict ``varToModel`` for one held-out feature.

    A random forest regressor is trained on every feature of
    ``perBootstrapTable`` that is not selected by a geometry filter built
    from ``f``, and the trained model is then applied to ``f`` itself.

    The function reads these module-level names, which must be assigned
    before it is mapped over a collection: ``perBootstrapTable``,
    ``variablesPerSplitVal``, ``minLeafPopulationVal``, ``maxNodesVal``,
    ``seedVal``, ``varToModel`` and ``predictorVector``.

    Args:
        f: the test feature; its geometry decides which training samples
            are left out.

    Returns:
        ``f`` with the model output stored in the ``'predicted'`` property.
    """
    # Wrap the test feature so it can serve both as filter geometry and as classifier input
    testFC = ee.FeatureCollection(f)

    # Training set: all samples not within the geometry of the test feature
    trainFC = perBootstrapTable.filter(ee.Filter.geometry(testFC).Not())

    # Random forest in regression mode, configured from the module-level hyperparameters
    classifier = ee.Classifier.smileRandomForest(
        numberOfTrees=200,
        variablesPerSplit=variablesPerSplitVal,
        minLeafPopulation=minLeafPopulationVal,
        maxNodes=maxNodesVal,
        bagFraction=0.632,
        seed=seedVal).setOutputMode('REGRESSION')

    # Train on the retained samples
    trainedClassifier = classifier.train(trainFC, varToModel, predictorVector)

    # Apply the trained model to the held-out feature
    classified = testFC.classify(classifier=trainedClassifier,
                                 outputName='predicted')

    # The test collection holds exactly one feature, so first() is its prediction
    predicted = classified.first().get('predicted')

    # Return the test feature carrying its predicted value
    return f.set('predicted', predicted).copyProperties(f)

In [6]:
# R^2 (coefficient of determination) calculation function
def coefficientOfDetermination(fcOI,propertyOfInterest,propertyOfInterest_Predicted):
    """Return R^2 = 1 - SS_residual / SS_total as an ``ee.Number``.

    Args:
        fcOI: collection holding both the observed and the predicted property.
        propertyOfInterest: name of the observed property.
        propertyOfInterest_Predicted: name of the predicted property.

    Returns:
        ``ee.Number`` built from the two sums of squares; nothing is
        evaluated on the client side here.
    """
    collection = ee.FeatureCollection(fcOI)

    def reduceSingleColumn(fc, reducer, column, outputKey):
        # Reduce one column of `fc` and pull the named entry out of the result dictionary
        reduced = fc.select([column]).reduceColumns(reducer, [column])
        return ee.Number(ee.Dictionary(reduced).get(outputKey))

    # Mean of the observed values
    observedMean = reduceSingleColumn(collection, ee.Reducer.mean(), propertyOfInterest, 'mean')

    def addSquaredDeviation(feat):
        # Squared distance of the observation from the observed mean
        observed = ee.Number(ee.Feature(feat).get(propertyOfInterest))
        return feat.set('Difference_Squared', observed.subtract(observedMean).pow(ee.Number(2)))

    def addSquaredResidual(feat):
        # Squared distance of the observation from its prediction
        observed = ee.Number(ee.Feature(feat).get(propertyOfInterest))
        modelled = ee.Number(ee.Feature(feat).get(propertyOfInterest_Predicted))
        return feat.set('Residual_Squared', observed.subtract(modelled).pow(ee.Number(2)))

    # Total and residual sums of squares
    sumSquaresTotal = reduceSingleColumn(
        collection.map(addSquaredDeviation), ee.Reducer.sum(), 'Difference_Squared', 'sum')
    sumSquaresResidual = reduceSingleColumn(
        collection.map(addSquaredResidual), ee.Reducer.sum(), 'Residual_Squared', 'sum')

    # Finalize the calculation
    rSquared = ee.Number(1).subtract(sumSquaresResidual.divide(sumSquaresTotal))
    return ee.Number(rSquared)

In [7]:
# R2 of one blocked leave-one-out repetition
def calc_final_r2(buffer_feat):
    """Run one cross-validation repetition and attach its R2.

    Reads the module-level ``perBootstrapTable``, ``n_points`` and
    ``varToModel``, and calls ``BLOOcv`` and ``coefficientOfDetermination``.

    Args:
        buffer_feat: feature carrying the properties ``'rep'`` (used as the
            random seed of the subsample) and ``'buffer_size'`` (distance
            passed to ``buffer``).

    Returns:
        ``buffer_feat`` with the additional property ``'R2_val'``.
    """
    repId = buffer_feat.get('rep')
    bufferDistance = buffer_feat.get('buffer_size')

    def addBuffer(feat):
        # Grow the feature geometry by the requested distance
        return feat.buffer(bufferDistance)

    def tagWithRep(feat):
        # Record which repetition the feature belongs to
        return feat.set('rep', repId)

    # Random subsample of n_points features, seeded by the repetition id
    shuffled = perBootstrapTable.randomColumn(seed=repId).sort('random')
    validationPoints = shuffled.limit(n_points)

    # Buffer first, then tag, then predict every buffered feature with BLOOcv
    bufferedPoints = validationPoints.map(addBuffer)
    toValidate = bufferedPoints.map(tagWithRep)
    withPrediction = toValidate.map(BLOOcv)

    # R2 between the observed and the predicted values of this repetition
    r2 = coefficientOfDetermination(withPrediction, varToModel, 'predicted')
    return buffer_feat.set('R2_val', r2)

In [8]:
# Predicted-vs-observed table of one cross-validation repetition
def calc_Pred_Obs(buffer_feat):
    """Return the observed and predicted values of one repetition.

    Reads the module-level ``perBootstrapTable``, ``n_points`` and
    ``varToModel``, and calls ``BLOOcv``.

    Args:
        buffer_feat: feature carrying the property ``'rep'``, which is used
            as the random seed of the subsample and stored on every output
            feature.

    Returns:
        Collection of the validated features reduced to the properties
        ``varToModel``, ``'predicted'`` and ``'rep'``.
    """
    rep = buffer_feat.get('rep')

    # Random subsample of n_points validation features, seeded by the repetition id
    subsetData = perBootstrapTable.randomColumn(seed = rep).sort('random').limit(n_points)

    # NOTE(review): unlike calc_final_r2, the features are NOT buffered here, so the
    # 'buffer_size' property of buffer_feat is not used - confirm this is intended.
    fc_toValidate = subsetData.map(lambda f: f.set('rep', rep))

    # Apply blocked leave one out CV function
    predicted = fc_toValidate.map(BLOOcv)

    # select() keeps only the two listed properties, so 'rep' has to be set again
    predObs = predicted.select([varToModel, 'predicted']).map(lambda f: f.set('rep', rep))
    return predObs

### 3.2 calculate the spatial CV R2

In [9]:
# Plain Python list of the seeds; every seed selects one train table and one grid search CSV
seedList = list(range(100))
print(colored('The seeds are:', 'blue', attrs=['bold']),seedList)
print(colored('Model is running!', 'blue', attrs=['bold']))

# Settings that do not depend on the seed are defined once, before the loop.
# n_points is read as a global by calc_final_r2.
n_reps = 2
nList = list(range(0,n_reps))
n_points = 1000
# One placeholder feature per repetition, carrying the buffer size and the repetition id;
# calc_final_r2 attaches the R2 of that repetition to it
bloo_cv_fc = ee.FeatureCollection(ee.List(nList).map(lambda n: ee.Feature(ee.Geometry.Point([0,0])).set('buffer_size',buffer_sizes).set('rep',n)))

for seed in seedList:
    # load the train table of this seed (read as a global by BLOOcv and calc_final_r2)
    perBootstrapTable = ee.FeatureCollection('users/nordmannmoore/ForestBiomass/SpawnMap/TrainTables/HM2_Grid_subsampled_Natural_Train_Table_seed_'+str(seed))
    # load the grid search result table of this seed from the local drive
    parameterTable = pd.read_csv('Data/SatelliteDerivedModel/GridSearchResult/HM2_Grid_subsampled_Natural_Potential_Biomass_Modeling_Grid_Search_Seed_'+str(seed)+'.csv', float_precision='round_trip')
    # extract the hyperparameters from the first row of the table
    # (presumably the best grid search result - verify against the CSV ordering);
    # all four values below are read as globals by BLOOcv
    variablesPerSplitVal = int(parameterTable['variablesPerSplit'].iat[0])
    minLeafPopulationVal = int(parameterTable['minLeafPopulation'].iat[0])
    maxNodesVal = int(parameterTable['maxNodes'].iat[0])
    seedVal = seed
    # Calculate the R2 value of every repetition
    final_fc = bloo_cv_fc.map(calc_final_r2)
    #     rSquared_export = ee.batch.Export.table.toAsset(
    #         collection = final_fc,
    #         description = varToModel+'bloo_cv_RM_rSqured_'+str(buffer_sizes)+'m_'+str(seed),
    #         assetId = 'users/leonidmoore/ForestBiomass/RemoteSensingModel/SpatialCrossValidation/Remote_Sensing_Spatial_CV_Rsquared_'+str(buffer_sizes)+'m_'+str(seed))
    #     # start to the running
    #rSquared_export.start()
    # Export the per-repetition R2 table of this seed to cloud storage as CSV.
    # NOTE: despite its name, this task exports the R2 values, not a predicted/observed table.
    predObs_10_Folds_CV_Export = ee.batch.Export.table.toCloudStorage(
        collection = final_fc,
        description = 'HM2_Leav_One_Out_Cross_Validation_rSquared_'+str(seed),
        bucket = "crowtherlab_gcsb_lidong",
        fileNamePrefix = 'LOOCV_Results/Model_HM2_Leav_One_Out_Cross_Validation_rSquared_'+str(seed),
        fileFormat ='CSV')

    # Submit the export task
    predObs_10_Folds_CV_Export.start()
    
    

[1m[34mThe seeds are:[0m [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
[1m[34mModel is running![0m


### 3.3 calculate the spatial CV predicted VS observed

In [34]:
# # generate a ee.List to save the seeds
# seedList = np.arange(0, 1, 1).tolist()
# print(colored('The seeds are:', 'blue', attrs=['bold']),seedList)
# print(colored('Model is running!', 'blue', attrs=['bold']))
# for seed in seedList:
#     n_reps = 5
#     nList = list(range(0,n_reps))
#     n_points = 1000
#     #  define a feature collection to save the calcuation results
#     bloo_cv_fc = ee.FeatureCollection(ee.List(nList).map(lambda n: ee.Feature(ee.Geometry.Point([0,0])).set('buffer_size',buffer_sizes).set('rep',n)))
#     # read the train table for each dataset
#     perBootstrapTable = ee.FeatureCollection('users/leonidmoore/ForestBiomass/RemoteSensingModel/TrainTables/Remote_Sensing_Random_Subsampled_Train_Table_seed_'+str(seed))
#     #load the parameter table
#     parameterTable = pd.read_csv('RemoteSensingModel/GridSearchResult/Remote_Sensing_Biomass_Modeling_Grid_Search_Seed_'+str(seed)+'.csv', float_precision='round_trip')
#     # extract the paramters
#     variablesPerSplitVal = int(parameterTable['variablesPerSplit'].iat[0]) # mtry
#     minLeafPopulationVal = int(parameterTable['minLeafPopulation'].iat[0]) # minrow
#     maxNodesVal = int(parameterTable['maxNodes'].iat[0]) # mac depth
#     seedVal = seed
#     # Calculate predObs across range of R2 values
#     final_PredObs = bloo_cv_fc.map(calc_Pred_Obs)
#     # flatten the featureCollection of featureCollection for easy writing to google earth engine asset
#     filteredData = final_PredObs.flatten()
#     # define the exportation code
#     predObs_export = ee.batch.Export.table.toAsset(
#         collection = filteredData,
#         description = varToModel+'_SD1_Spatial_Cross_Validation_PredObs_'+str(buffer_sizes)+'m_'+str(seed),
#     assetId = 'users/leonidmoore/ForestBiomass/RemoteSensingModel/SpatialCrossValidation/Remote_Sensing_SD1_Spatial_CV_PredObs_'+str(buffer_sizes)+'m_'+str(seed))
#     # start the exportation
#     predObs_export.start()
# #     leave_OneOut_CV_Export = ee.batch.Export.table.toDrive(
# #         collection = filteredData,
# #         description = 'Leav_One_Out_Cross_Validation_rSquared_'+str(seed),
# #         fileNamePrefix = 'Remote_Sensing_Leav_One_Out_Cross_Validation_rSquared_'+str(seed)+'.csv',
# #         fileFormat ='CSV')
# #     leave_OneOut_CV_Export.start()
    

[1m[34mThe seeds are:[0m [0]
[1m[34mModel is running![0m
