# Model 1: Low resolution Detection of flooded tiles

The output of this model is used to train model 2

In [None]:
import numpy as np
import xarray as xr
import src.baseline_model01 as bm
import os

# Pre processing

The constructor handles a small amount of data processing:

* Pre-processing of the static data 
* Construction of the labels from raw data
* Loading of the dynamic data

It also splits the dataset into test / train / validation sets according to the defined start and end dates. The validation set can be useful as an alternative to cross-validation for finding optimal hyper-parameters.

In [None]:
# Build the Model 1 generator: the constructor pre-processes the static data,
# builds the labels, loads the dynamic data and splits the dataset by date.
# NOTE(review): train_end and test_start are the same date — confirm which
# bound is inclusive so the two sets do not overlap.
baseline_model_generator = bm.BaseLineModel(
    "localdata/final_label_Full_ERA5.nc", # label file (judging by its name)
    dynamic_features_path = "localdata/raw/ERA5_train.nc",
    static_features_path = "localdata/static_ERA5.nc",
    train_start = "2002-08-03", # start date of the training period
    train_end = "2003-01-01", # end date of the training period
    test_start = "2003-01-01", # start date of the test period
    test_end = "2003-03-17", # end date of the test period
    name = "Model_01_default", # also used as the default file name when saving
    seed=0 # presumably the random seed, for reproducibility — confirm
)

# Training

The baseline model used here is a random forest. Random forest is used as an example of a basic ML approach sometimes used for flood prediction [<a id="1" href="https://www.tandfonline.com/doi/full/10.1080/19475705.2017.1308971">1</a>,<a id="2" href="https://www.sciencedirect.com/science/article/abs/pii/S0022169415004217">2</a>,<a id="3" href="https://www.sciencedirect.com/science/article/abs/pii/S004896971934971X">3</a>]; it is by no means the only approach for such an endeavor.

To train the model the following parameters must be defined :

* selected static features (geospatial data)
* selected dynamic features (climate data):
   * Depth of history to process climate features
   * Usage of climate feature derivative  


## References
<a id="1" href="https://www.tandfonline.com/doi/full/10.1080/19475705.2017.1308971">[1]</a> 
Lee, S., Kim, J. C., Jung, H. S., Lee, M. J., & Lee, S. (2017). Spatial prediction of flood susceptibility using random-forest and boosted-tree models in Seoul metropolitan city, Korea. Geomatics, Natural Hazards and Risk, 8(2), 1185-1203.

<a id="2" href="https://www.sciencedirect.com/science/article/abs/pii/S0022169415004217">[2]</a>
Wang, Z., Lai, C., Chen, X., Yang, B., Zhao, S., & Bai, X. (2015). Flood hazard risk assessment model based on random forest. Journal of Hydrology, 527, 1130-1141.

<a id="3" href="https://www.sciencedirect.com/science/article/abs/pii/S004896971934971X">[3]</a>
Chen, W., Li, Y., Xue, W., Shahabi, H., Li, S., Hong, H., ... & Ahmad, B. B. (2020). Modeling flood susceptibility using data-driven approaches of naïve bayes tree, alternating decision tree, and random forest methods. Science of The Total Environment, 701, 134979.


In [None]:
# The training parameters below are the best ones we found during the genetic algorithm optimisation (see 'model optimisation' section).
# The list is positional: 14 static-feature flags, 2 dynamic-feature flags,
# the derivative flag, then 4 integer hyper-parameters.

#model, acc = baseline_model_generator.load_indiv([True, True, 1, True, True, 0, True, True, True, True, 1, False, True, True, False, True, 0, 289, 10, 37, 14], False)
model, acc = baseline_model_generator.train_model([False, # 'soilgrid_bdod'
                                                  True, # 'soilgrid_cfvo'
                                                  False, # 'soilgrid_silt'
                                                  True, # 'soilgrid_clay'
                                                  False, # 'soilgrid_sand'
                                                  False, # 'depth_to_bedrock'
                                                  True, # 'altitude'
                                                  False, # 'aspect'
                                                  True, # 'slope'
                                                  False, # 'water_density'
                                                  True, # 'watershed'
                                                  False, # 'topological_catchment_areas'
                                                  True, # 'dist_sea'
                                                  True, # 'dist_riv'
                                                  True, # 'tp'
                                                  True, # 't2m'
                                                  False, # use the derivative of the climate features
                                                  195, # number of trees
                                                  8, # tree depth
                                                  83, # history depth 1 (presumably for one of the dynamic features — confirm)
                                                  11 # history depth 2 (presumably for the other dynamic feature — confirm)
                                                  ], False) # NOTE(review): meaning of this trailing False is not visible here — confirm in src.baseline_model01

#### Saving the model

This method saves the BaselineModel object. By default the *name* attribute is used as the file name (if you save a new model with the same name, it will replace the previous one).

In [None]:
# Persist the trained generator; saving under an existing name replaces the
# previously saved model.
baseline_model_generator.save_to_disk(name="Model_01_default")

# If you want to load a previously trained model instead:
#baseline_model_generator = baseline_model_generator.load_from_disk(name="Model_01_default")

### Model optimisation

We chose here to look for good hyper-parameters using a Genetic Algorithm (GA). Examples of alternatives to GA for hyper-parameter optimisation include random search, grid search, etc.

To use the GA optimiser method you need to define a population size and a number of generations.

In [None]:
# Uncomment to run the GA search: ngen is the number of generations and pop
# the population size.
# NOTE(review): best_individuals presumably seeds the initial population — confirm.
#baseline_model_generator.GA_optimisation(ngen = 40, pop = 80, best_individuals = [])

# Model Analysis

We propose a few helper methods for model analysis, such as :

* Feature importance
* Graph of metrics such as ROC, AP
* Results mapping


#### Feature importance method

In [None]:
# NOTE(review): presumably computes the metrics that the analysis helpers
# below rely on — confirm in src.baseline_model01.
baseline_model_generator.compute_all_metrics()

In [None]:
# Report the feature importance of the trained model (output format is
# defined by the helper in src.baseline_model01).
baseline_model_generator.print_feature_importance()

#### ROC curve

`auc_graph` accepts one or more dataset names (train / test / val), a metric, and a list of key thresholds. When no metric is given, the method's default output is produced.

In [None]:
# Draw the graph for the train and test sets, passing the listed key thresholds.
# NOTE(review): metrics is passed as an empty string — presumably this selects
# the helper's default; confirm in src.baseline_model01.
baseline_model_generator.auc_graph(dataset=["Train","Test"],metrics="", key_thresholds=[0.01,0.1,0.15, 0.2,0.3, 0.5, 0.9, 0.95])

#### Prediction map

In [None]:
# Write the prediction maps under graph/model1/predictions/.
# NOTE(review): the directory may need to exist beforehand — confirm.
baseline_model_generator.save_prediction_map(save_path = "graph/model1/predictions/")

#### Differences between labels and predictions

In [None]:
# Write maps showing labels and predictions together under
# graph/model1/label_and_pred/.
baseline_model_generator.save_prediction_map_and_labels(save_path =  "graph/model1/label_and_pred/")

#### Classification map at each threshold

In [None]:
# Write the FP / FN maps under graph/model1/FP_FN/ for each listed threshold
# (presumably the decision threshold applied to the model score — confirm).
baseline_model_generator.save_FP_FN_map(save_path = "graph/model1/FP_FN/", thresholds = [0.2, 0.3, 0.5, 0.7])

#### Error Map

In [None]:
# Write the error maps under graph/model1/error_map/.
baseline_model_generator.save_error_map(save_path="graph/model1/error_map/")

# Data computation for the second model

We chose to use the M1 model score (M1_score) as a dynamic feature for the M2 model.
The following part creates two xarray objects holding the M1 model score, one at full resolution and one at ERA5 resolution.


## Training Data
#### Creation of the M1 score Xarray at ERA5 resolution

In [None]:
# Compute the Model 1 score on the whole grid and export it as a NetCDF file.
baseline_model_generator.compute_full_grid()
Full_Rez = xr.open_dataset("localdata/final_label_Full_Rez.nc")

# The time axis is taken from the full-resolution label file; x / y come from
# the labels held by the model generator.
xr_array_score = xr.DataArray(baseline_model_generator.full_grid_all,
                              dims=["time", "y", "x"],
                              coords={"time": Full_Rez.time,
                                      "x": baseline_model_generator.labels.x,
                                      "y": baseline_model_generator.labels.y},
                              name="M1_score")

xr_array_score = xr_array_score.astype('float32')

# Drop any previous export before writing. On a first run the file does not
# exist yet, which must not abort the cell.
try:
    os.remove('localdata/Model1_score_ERA5_Rez_v2.nc')
except FileNotFoundError:
    pass
xr_array_score.to_netcdf('localdata/Model1_score_ERA5_Rez_v2.nc', engine='h5netcdf')

In [None]:
# Re-open the exported file: xr_array_score is now a Dataset read from disk
# (its M1_score variable is used by the following cell).
xr_array_score = xr.open_dataset('localdata/Model1_score_ERA5_Rez_v2.nc')

#### Creation of the M1 score Xarray at Full Resolution

In [None]:
# Upsample the M1 score onto the full-resolution grid (nearest neighbour) and
# export it as a NetCDF file.
Full_Rez = xr.open_dataset("localdata/final_label_Full_Rez.nc")
small_interp = xr_array_score['M1_score'].interp(x=Full_Rez.x, y=Full_Rez.y, method='nearest')
Full_Rez = Full_Rez.rename({'__xarray_dataarray_variable__': 'M1_score'})

# Reuse the full-resolution dataset as a container and overwrite its variable
# with the interpolated score.
expanded_score = Full_Rez.copy()
expanded_score['M1_score'] = small_interp

expanded_score['M1_score'] = expanded_score['M1_score'].astype('float32')

# Replace any NaN left by the interpolation with the mean score: NaN values
# make the next model crash.
fill_value = small_interp.mean().item()
expanded_score['M1_score'] = expanded_score['M1_score'].fillna(fill_value)

# Drop any previous export before writing. On a first run the file does not
# exist yet, which must not abort the cell.
try:
    os.remove('localdata/Model1_Score_Full_Rez_v2.nc')
except FileNotFoundError:
    pass
expanded_score.to_netcdf('localdata/Model1_Score_Full_Rez_v2.nc', engine='h5netcdf')

#### Sanity check
M1_score should be a float between 0 and 1. The presence of NaN values will make the next model crash.

In [None]:
# Display the distinct score values of the ERA5-resolution export: they should
# all lie in [0, 1] and contain no NaN.
Model1_score_ERA5_Rez_v2 = xr.open_dataset('localdata/Model1_score_ERA5_Rez_v2.nc')
np.unique(Model1_score_ERA5_Rez_v2.M1_score.values)


In [None]:
# Display the distinct score values of the full-resolution export: they should
# all lie in [0, 1] and contain no NaN.
Model1_Score_Full_Rez_v2 = xr.open_dataset('localdata/Model1_Score_Full_Rez_v2.nc')
np.unique(Model1_Score_Full_Rez_v2.M1_score.values)


## Inference / Evaluation Data
#### Creation of the M1 score Xarray at ERA5 resolution

In [None]:
# Timestamps of the inference / evaluation window (2003-11-01 to 2004-01-01),
# taken from the time axis of the model generator's labels.
time_slice = baseline_model_generator.labels.sel(time=slice('2003-11-01T00:00:00.000000000','2004-01-01T00:00:00.000000000')).time.values

In [None]:
# Export the Model 1 inference score as a NetCDF file.
# Full_Rez is opened here because the full-resolution cell further down reads
# its x / y coordinates.
Full_Rez = xr.open_dataset("localdata/final_label_Full_Rez.nc")

# NOTE(review): full_grid_inf is presumably filled by compute_full_grid() —
# confirm it has been run before this cell.
xr_array_score = xr.DataArray(baseline_model_generator.full_grid_inf,
                              dims=["time", "y", "x"],
                              coords={"time": time_slice,
                                      "x": baseline_model_generator.labels.x,
                                      "y": baseline_model_generator.labels.y},
                              name="M1_score")

xr_array_score = xr_array_score.astype('float32')

# Drop any previous export before writing. On a first run the file does not
# exist yet, which must not abort the cell.
try:
    os.remove('localdata/Model1_score_ERA5_Rez_inf.nc')
except FileNotFoundError:
    pass
xr_array_score.to_netcdf('localdata/Model1_score_ERA5_Rez_inf.nc', engine='h5netcdf')

In [None]:
# Re-open the exported inference score as a Dataset read from disk (its
# M1_score variable is used by the following cell).
xr_ERA5_Rez_inf = xr.open_dataset('localdata/Model1_score_ERA5_Rez_inf.nc')

#### Creation of the M1 score Xarray at Full Resolution

In [None]:
# Upsample the inference score onto the full-resolution grid (nearest
# neighbour) and export it as a NetCDF file.
small_interp = xr_ERA5_Rez_inf['M1_score'].interp(x=Full_Rez.x, y=Full_Rez.y, method='nearest')

# Replace any NaN left by the interpolation with the mean score: NaN values
# make the next model crash.
fill_value = small_interp.mean().item()
small_interp = small_interp.fillna(fill_value)

xr_array_score = xr.DataArray(small_interp,
                              dims=["time", "y", "x"],
                              coords={"time": time_slice,
                                      "x": Full_Rez.x,
                                      "y": Full_Rez.y},
                              name="M1_score")

xr_array_score = xr_array_score.astype('float32')

# Drop any previous export before writing. On a first run the file does not
# exist yet, which must not abort the cell.
try:
    os.remove('localdata/Model1_Score_Full_Rez_inf.nc')
except FileNotFoundError:
    pass
xr_array_score.to_netcdf('localdata/Model1_Score_Full_Rez_inf.nc', engine='h5netcdf')

#### Sanity check

In [None]:
# Display the distinct inference score values at ERA5 resolution: they should
# all lie in [0, 1] and contain no NaN.
Model1_score_ERA5_Rez_inf = xr.open_dataset('localdata/Model1_score_ERA5_Rez_inf.nc')
np.unique(Model1_score_ERA5_Rez_inf.M1_score.values)

In [None]:
# Display the distinct inference score values at full resolution: they should
# all lie in [0, 1] and contain no NaN.
Model1_Score_Full_Rez_inf = xr.open_dataset('localdata/Model1_Score_Full_Rez_inf.nc')
np.unique(Model1_Score_Full_Rez_inf.M1_score.values)